Skip to main content

apr_cli/commands/
qa.rs

1//! QA Command Implementation - Falsifiable Quality Assurance Checklist
2//!
3//! Implements a scientific QA process for model releases. Every claim must be
4//! falsifiable - if a test can't fail, it doesn't provide information.
5//!
6//! # Gates
7//!
8//! 1. **Golden Output Test** (Correctness Gate)
9//!    - Run model with known prompts, verify expected patterns in output
10//!    - Falsifiable: Output must match expected pattern or test fails
11//!
12//! 2. **Throughput Falsification** (Performance Gate)
13//!    - Run benchmark with statistical rigor (CV < 5%)
14//!    - Assert minimum tok/s threshold
15//!    - Falsifiable: If tok/s < threshold, test fails
16//!
17//! 3. **Ollama Parity Test** (Parity Gate)
18//!    - Compare against Ollama baseline (if available)
19//!    - Assert speedup factor >= target
20//!    - Falsifiable: If speedup < target, test fails
21//!
22//! 4. **GPU vs CPU Speedup Test** (F-PERF-042)
23//!    - Measure throughput on both GPU and CPU
24//!    - Assert GPU >= 2x CPU (default threshold)
25//!    - Falsifiable: If GPU speedup < threshold, test fails
26//!    - Toyota Way: Genchi Genbutsu - measure real performance
27//!
28//! 5. **Cross-Format Parity Test** (F-QUAL-032)
29//!    - Compare argmax between GGUF and SafeTensors for same model
30//!    - Invariant: argmax(forward_gguf) == argmax(forward_safetensors)
31//!    - Falsifiable: If argmax differs, cross-format parity is BROKEN
32//!    - Cornerstone of architecture's logical validity
33//!
34//! 6. **PTX Parity Test** (GH-219, F-PTX-001)
35//!    - Validate batched GPU kernels maintain structural parity with single-vector references
36//!    - Checks: batch dispatch mechanism, u64 shared memory addressing, dispatch strategy
37//!    - Falsifiable: If any of 6 kernel pairs fails structural validation, test fails
38//!    - Toyota Way: Poka-Yoke - error-proof PTX generation at compile time
39//!
40//! # Usage
41//!
42//! ```bash
43//! apr qa model.gguf                           # Run all gates
44//! apr qa model.gguf --assert-tps 100          # Custom throughput threshold
45//! apr qa model.gguf --assert-speedup 2.0      # Custom Ollama speedup
46//! apr qa model.gguf --assert-gpu-speedup 3.0  # Custom GPU vs CPU speedup
47//! apr qa model.gguf --skip-ollama             # Skip Ollama comparison
48//! apr qa model.gguf --skip-gpu-speedup        # Skip GPU vs CPU test
49//! apr qa model.gguf --skip-format-parity      # Skip cross-format test
50//! apr qa model.gguf --safetensors-path m.st   # Compare with SafeTensors model
51//! apr qa model.gguf --json                    # JSON output for CI
52//! ```
53//!
54//! # Exit Codes
55//!
56//! - 0: All gates passed
57//! - 5: One or more gates failed (ValidationFailed)
58//!
59//! Toyota Way: Jidoka - Stop and fix quality issues immediately.
60//! Scientific Method: Claims must be falsifiable to have meaning.
61
62use crate::error::{CliError, Result};
63use crate::output;
64use colored::Colorize;
65use serde::{Deserialize, Serialize};
66use std::path::Path;
67use std::time::{Duration, Instant};
68
69#[cfg(not(feature = "visualization"))]
70use brick_tracer_shim::BrickTracer as TracerImpl;
71#[cfg(feature = "visualization")]
72use renacer::brick_tracer::BrickTracer as TracerImpl;
73
/// Timing-only stand-in for `BrickTracer`, compiled when the `visualization`
/// (renacer) feature is disabled.
/// Exposes the same API surface so call sites need no `cfg` gate of their own.
#[cfg(not(feature = "visualization"))]
mod brick_tracer_shim {
    use std::time::Instant;

    /// Syscall breakdown as reported by the shim.
    ///
    /// The shim does not observe syscalls: [`BrickTracer::trace`] books the
    /// whole elapsed time under `compute_us` and leaves the other buckets at 0.
    pub struct SyscallBreakdown {
        pub compute_us: u64,
        pub mmap_us: u64,
        pub futex_us: u64,
        pub ioctl_us: u64,
    }

    impl SyscallBreakdown {
        /// Always `0.0` in the shim.
        pub fn syscall_overhead_percent(&self) -> f64 {
            0.0
        }

        /// Always `"none"` in the shim.
        pub fn dominant_syscall(&self) -> &'static str {
            "none"
        }
    }

    /// Trace metadata placeholder; the shim never produces one.
    pub struct TraceMetadata {
        pub budget_us: u64,
        pub actual_us: u64,
        pub efficiency: f64,
    }

    /// Outcome of a traced operation: the closure's return value plus timing.
    pub struct TracedResult<T> {
        pub result: T,
        pub duration_us: u64,
        pub syscall_breakdown: SyscallBreakdown,
        pub metadata: Option<TraceMetadata>,
    }

    /// Tracer that only measures wall-clock time with `Instant`.
    pub struct BrickTracer;

    impl BrickTracer {
        /// Creates the (stateless) tracer.
        pub fn new_local() -> Self {
            BrickTracer
        }

        /// Runs `f` once and reports how long it took, in microseconds.
        ///
        /// `_name` and `_budget_us` are accepted for API compatibility and ignored.
        pub fn trace<T>(
            &self,
            _name: &str,
            _budget_us: u64,
            f: impl FnOnce() -> T,
        ) -> TracedResult<T> {
            let started = Instant::now();
            let result = f();
            let elapsed_us = started.elapsed().as_micros() as u64;

            // Everything is attributed to compute; no syscall data is collected.
            let syscall_breakdown = SyscallBreakdown {
                compute_us: elapsed_us,
                mmap_us: 0,
                futex_us: 0,
                ioctl_us: 0,
            };

            TracedResult {
                result,
                duration_us: elapsed_us,
                syscall_breakdown,
                metadata: None,
            }
        }
    }
}
138
/// QA configuration
///
/// Thresholds, skip switches, and benchmark parameters for one QA run.
/// Defaults quoted on the fields are the values set by `impl Default for QaConfig`.
#[derive(Debug, Clone)]
pub struct QaConfig {
    /// Minimum throughput in tok/s (default: 100, the GPU target)
    // NOTE(review): this comment previously also promised "10 for CPU"; no such
    // value is set by `Default` — confirm where (or whether) a CPU floor is applied.
    pub min_tps: f64,
    /// Minimum speedup vs Ollama (default: 0.2x)
    pub min_speedup: f64,
    /// Minimum GPU vs CPU speedup (default: 2.0x) - F-PERF-042
    pub min_gpu_speedup: f64,
    /// Skip golden output test
    pub skip_golden: bool,
    /// Skip throughput test
    pub skip_throughput: bool,
    /// Skip Ollama parity test
    pub skip_ollama: bool,
    /// Skip GPU vs CPU speedup test (F-PERF-042)
    pub skip_gpu_speedup: bool,
    /// Skip tensor contract validation (PMAT-235)
    pub skip_contract: bool,
    /// Skip cross-format parity test (F-QUAL-032)
    pub skip_format_parity: bool,
    /// Skip PTX parity validation (GH-219, F-PTX-001)
    pub skip_ptx_parity: bool,
    /// SafeTensors model path for cross-format parity (F-QUAL-032)
    pub safetensors_path: Option<std::path::PathBuf>,
    /// Number of benchmark iterations (default: 10)
    pub iterations: usize,
    /// Number of warmup iterations (default: 3)
    pub warmup: usize,
    /// Max tokens for generation (default: 32)
    pub max_tokens: usize,
    /// Output as JSON
    pub json: bool,
    /// Verbose output
    pub verbose: bool,
    /// Minimum number of gates that must execute (not be skipped);
    /// `None` (the default) sets no minimum here
    pub min_executed: Option<usize>,
    /// Path to previous QA report for regression comparison
    pub previous_report: Option<std::path::PathBuf>,
    /// Maximum allowed performance regression (0.10 = 10%, which is the default)
    pub regression_threshold: f64,
    /// Skip GPU state isolation test
    pub skip_gpu_state: bool,
    /// Skip metadata plausibility validation (Bug 210, GH-222)
    pub skip_metadata: bool,
    /// Skip GPU capability match gate (GH-280)
    pub skip_capability: bool,
    /// Assert classifier head presence and shape (F-CLASS-004)
    pub assert_classifier_head: bool,
}
189
190impl Default for QaConfig {
191    fn default() -> Self {
192        Self {
193            min_tps: 100.0,       // GPU target
194            min_speedup: 0.2, // Ollama uses llama.cpp optimized kernels; 0.2x is realistic floor
195            min_gpu_speedup: 2.0, // GPU must be 2x faster than CPU (F-PERF-042)
196            skip_golden: false,
197            skip_throughput: false,
198            skip_ollama: false,
199            skip_gpu_speedup: false,
200            skip_contract: false,
201            skip_format_parity: false,
202            skip_ptx_parity: false,
203            safetensors_path: None,
204            iterations: 10,
205            warmup: 3,
206            max_tokens: 32,
207            json: false,
208            verbose: false,
209            min_executed: None,
210            previous_report: None,
211            regression_threshold: 0.10,
212            skip_gpu_state: false,
213            skip_metadata: false,
214            skip_capability: false,
215            assert_classifier_head: false,
216        }
217    }
218}
219
/// Result of a single QA gate
///
/// Collected into `QaReport::gates`. When serialized, `value` and `threshold`
/// are left out of the output if they are `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GateResult {
    /// Gate name
    pub name: String,
    /// Whether the gate passed (results built by `GateResult::skipped` also report `true`)
    pub passed: bool,
    /// Human-readable result message
    pub message: String,
    /// Measured value (if applicable)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub value: Option<f64>,
    /// Expected/threshold value (if applicable)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub threshold: Option<f64>,
    /// Time taken to run the gate, in milliseconds (0 for skipped gates)
    pub duration_ms: u64,
    /// Whether the gate was skipped
    pub skipped: bool,
}
240
241impl GateResult {
242    pub(crate) fn passed(
243        name: &str,
244        message: &str,
245        value: Option<f64>,
246        threshold: Option<f64>,
247        duration: Duration,
248    ) -> Self {
249        Self {
250            name: name.to_string(),
251            passed: true,
252            message: message.to_string(),
253            value,
254            threshold,
255            duration_ms: duration.as_millis() as u64,
256            skipped: false,
257        }
258    }
259
260    pub(crate) fn failed(
261        name: &str,
262        message: &str,
263        value: Option<f64>,
264        threshold: Option<f64>,
265        duration: Duration,
266    ) -> Self {
267        Self {
268            name: name.to_string(),
269            passed: false,
270            message: message.to_string(),
271            value,
272            threshold,
273            duration_ms: duration.as_millis() as u64,
274            skipped: false,
275        }
276    }
277
278    fn skipped(name: &str, reason: &str) -> Self {
279        Self {
280            name: name.to_string(),
281            passed: true, // Skipped gates don't fail
282            message: format!("Skipped: {reason}"),
283            value: None,
284            threshold: None,
285            duration_ms: 0,
286            skipped: true,
287        }
288    }
289}
290
/// System information captured during QA run
///
/// Optional fields are left out of the serialized output when they are `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemInfo {
    /// CPU model name (`"unknown"` when `SystemInfo::capture` cannot determine it)
    pub cpu_model: String,
    /// GPU model name (if available), as reported by `nvidia-smi`
    #[serde(skip_serializing_if = "Option::is_none")]
    pub gpu_model: Option<String>,
    /// GPU driver version (if available), as reported by `nvidia-smi`
    #[serde(skip_serializing_if = "Option::is_none")]
    pub gpu_driver: Option<String>,
}
303
304impl SystemInfo {
305    fn capture() -> Self {
306        let cpu_model = std::fs::read_to_string("/proc/cpuinfo")
307            .ok()
308            .and_then(|s| {
309                s.lines()
310                    .find(|l| l.starts_with("model name"))
311                    .and_then(|l| l.split(':').nth(1))
312                    .map(|s| s.trim().to_string())
313            })
314            .unwrap_or_else(|| "unknown".to_string());
315
316        let (gpu_model, gpu_driver) = Self::detect_gpu();
317
318        Self {
319            cpu_model,
320            gpu_model,
321            gpu_driver,
322        }
323    }
324
325    fn detect_gpu() -> (Option<String>, Option<String>) {
326        let output = std::process::Command::new("nvidia-smi")
327            .args(["--query-gpu=name,driver_version", "--format=csv,noheader"])
328            .output()
329            .ok();
330        if let Some(out) = output {
331            if out.status.success() {
332                let text = String::from_utf8_lossy(&out.stdout);
333                let parts: Vec<&str> = text.trim().splitn(2, ',').collect();
334                return (
335                    parts.first().map(|s| s.trim().to_string()),
336                    parts.get(1).map(|s| s.trim().to_string()),
337                );
338            }
339        }
340        (None, None)
341    }
342}
343
/// Full QA report
///
/// This is the structure printed as JSON by `run` when JSON output is requested.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QaReport {
    /// Model path
    pub model: String,
    /// Whether all gates passed
    pub passed: bool,
    /// Individual gate results
    pub gates: Vec<GateResult>,
    /// Number of gates that actually executed (not skipped);
    /// falls back to 0 when the field is absent from deserialized input
    #[serde(default)]
    pub gates_executed: usize,
    /// Number of gates that were skipped;
    /// falls back to 0 when the field is absent from deserialized input
    #[serde(default)]
    pub gates_skipped: usize,
    /// Total duration, in milliseconds
    pub total_duration_ms: u64,
    /// Timestamp (ISO 8601)
    pub timestamp: String,
    /// Summary message
    pub summary: String,
    /// System information; optional on input and omitted from output when `None`
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub system_info: Option<SystemInfo>,
}
369
370/// Run the QA command
371#[allow(clippy::too_many_arguments)]
372pub fn run(
373    path: &Path,
374    min_tps: Option<f64>,
375    min_speedup: Option<f64>,
376    min_gpu_speedup: Option<f64>,
377    skip_golden: bool,
378    skip_throughput: bool,
379    skip_ollama: bool,
380    skip_gpu_speedup: bool,
381    skip_contract: bool,
382    skip_format_parity: bool,
383    skip_ptx_parity: bool,
384    safetensors_path: Option<std::path::PathBuf>,
385    iterations: usize,
386    warmup: usize,
387    max_tokens: usize,
388    json: bool,
389    verbose: bool,
390    min_executed: Option<usize>,
391    previous_report: Option<std::path::PathBuf>,
392    regression_threshold: Option<f64>,
393    skip_gpu_state: bool,
394    skip_metadata: bool,
395    skip_capability: bool,
396    assert_classifier_head: bool,
397) -> Result<()> {
398    let config = QaConfig {
399        min_tps: min_tps.unwrap_or(100.0),
400        min_speedup: min_speedup.unwrap_or(0.2), // Ollama uses llama.cpp optimized kernels
401        min_gpu_speedup: min_gpu_speedup.unwrap_or(2.0), // GPU must be 2x faster (F-PERF-042)
402        skip_golden,
403        skip_throughput,
404        skip_ollama,
405        skip_gpu_speedup,
406        skip_contract,
407        skip_format_parity,
408        skip_ptx_parity,
409        safetensors_path,
410        iterations,
411        warmup,
412        max_tokens,
413        json,
414        verbose,
415        min_executed,
416        previous_report,
417        regression_threshold: regression_threshold.unwrap_or(0.10),
418        skip_gpu_state,
419        skip_metadata,
420        skip_capability,
421        assert_classifier_head,
422    };
423
424    let report = run_qa(path, &config)?;
425
426    if json {
427        println!(
428            "{}",
429            serde_json::to_string_pretty(&report).unwrap_or_default()
430        );
431    }
432
433    if !report.passed {
434        return Err(CliError::ValidationFailed(report.summary));
435    }
436
437    Ok(())
438}
439
440/// Dispatch a single QA gate: skip if flagged, otherwise run, then print and collect.
441fn dispatch_gate(
442    gates: &mut Vec<GateResult>,
443    json: bool,
444    skip: bool,
445    name: &str,
446    skip_reason: &str,
447    runner: impl FnOnce() -> Result<GateResult>,
448) -> Result<()> {
449    let result = if skip {
450        GateResult::skipped(name, skip_reason)
451    } else {
452        runner()?
453    };
454    if !json {
455        print_gate_result(&result);
456    }
457    gates.push(result);
458    Ok(())
459}
460
/// Human-readable gate name for display.
///
/// Known gate identifiers are translated to their display label; any other
/// identifier is returned unchanged.
fn gate_display_name(name: &str) -> &str {
    // (gate identifier, display label)
    const LABELS: [(&str, &str); 12] = [
        ("capability_match", "Capability Match"),
        ("tensor_contract", "Tensor Contract"),
        ("golden_output", "Golden Output"),
        ("throughput", "Throughput"),
        ("ollama_parity", "Ollama Parity"),
        ("gpu_speedup", "GPU Speedup"),
        ("format_parity", "Format Parity"),
        ("ptx_parity", "PTX Parity"),
        ("gpu_state_isolation", "GPU State Isolation"),
        ("performance_regression", "Perf Regression"),
        ("metadata_plausibility", "Metadata Plausibility"),
        ("classifier_head", "Classifier Head"),
    ];

    LABELS
        .iter()
        .find(|&&(id, _)| id == name)
        .map_or(name, |&(_, label)| label)
}
480
481/// Print the QA summary table and pass/fail badges.
482fn print_qa_summary(gates: &[GateResult], passed: bool, total_duration: Duration) {
483    output::header("QA Summary");
484
485    let gate_rows: Vec<Vec<String>> = gates
486        .iter()
487        .map(|g| {
488            let badge = if g.skipped {
489                output::badge_skip("SKIP")
490            } else if g.passed {
491                output::badge_pass("PASS")
492            } else {
493                output::badge_fail("FAIL")
494            };
495            let measured = g.value.map_or("—".to_string(), |v| format!("{v:.2}"));
496            let threshold = g.threshold.map_or("—".to_string(), |v| format!("{v:.2}"));
497            vec![
498                gate_display_name(&g.name).to_string(),
499                badge,
500                measured,
501                threshold,
502                output::duration_fmt(g.duration_ms),
503            ]
504        })
505        .collect();
506    println!(
507        "{}",
508        output::table(
509            &["Gate", "Status", "Measured", "Threshold", "Duration"],
510            &gate_rows,
511        )
512    );
513
514    println!();
515    if passed {
516        println!("  {}", output::badge_pass("ALL GATES PASSED"));
517    } else {
518        println!("  {}", output::badge_fail("GATES FAILED"));
519        for gate in gates.iter().filter(|g| !g.passed && !g.skipped) {
520            println!("    {} {}", "✗".red(), gate.name);
521        }
522    }
523    output::metric(
524        "Total Duration",
525        output::duration_fmt(total_duration.as_millis() as u64),
526        "",
527    );
528}
529
530include!("qa_gguf.rs");
531include!("output_verification.rs");
532include!("golden_output.rs");
533include!("speedup.rs");
534include!("forward_error.rs");
535include!("gpu_isolation_result.rs");
536include!("qa_08.rs");