Skip to main content

apr_cli/commands/
qa.rs

1//! QA Command Implementation - Falsifiable Quality Assurance Checklist
2//!
3//! Implements a scientific QA process for model releases. Every claim must be
4//! falsifiable - if a test can't fail, it doesn't provide information.
5//!
6//! # Gates
7//!
8//! 1. **Golden Output Test** (Correctness Gate)
9//!    - Run model with known prompts, verify expected patterns in output
10//!    - Falsifiable: Output must match expected pattern or test fails
11//!
12//! 2. **Throughput Falsification** (Performance Gate)
13//!    - Run benchmark with statistical rigor (CV < 5%)
14//!    - Assert minimum tok/s threshold
15//!    - Falsifiable: If tok/s < threshold, test fails
16//!
17//! 3. **Ollama Parity Test** (Parity Gate)
18//!    - Compare against Ollama baseline (if available)
19//!    - Assert speedup factor >= target
20//!    - Falsifiable: If speedup < target, test fails
21//!
22//! 4. **GPU vs CPU Speedup Test** (F-PERF-042)
23//!    - Measure throughput on both GPU and CPU
24//!    - Assert GPU >= 2x CPU (default threshold)
25//!    - Falsifiable: If GPU speedup < threshold, test fails
26//!    - Toyota Way: Genchi Genbutsu - measure real performance
27//!
28//! 5. **Cross-Format Parity Test** (F-QUAL-032)
29//!    - Compare argmax between GGUF and SafeTensors for same model
30//!    - Invariant: argmax(forward_gguf) == argmax(forward_safetensors)
31//!    - Falsifiable: If argmax differs, cross-format parity is BROKEN
32//!    - Cornerstone of architecture's logical validity
33//!
34//! 6. **PTX Parity Test** (GH-219, F-PTX-001)
35//!    - Validate batched GPU kernels maintain structural parity with single-vector references
36//!    - Checks: batch dispatch mechanism, u64 shared memory addressing, dispatch strategy
37//!    - Falsifiable: If any of 6 kernel pairs fails structural validation, test fails
38//!    - Toyota Way: Poka-Yoke - error-proof PTX generation at compile time
39//!
40//! # Usage
41//!
42//! ```bash
43//! apr qa model.gguf                           # Run all gates
44//! apr qa model.gguf --assert-tps 100          # Custom throughput threshold
45//! apr qa model.gguf --assert-speedup 2.0      # Custom Ollama speedup
46//! apr qa model.gguf --assert-gpu-speedup 3.0  # Custom GPU vs CPU speedup
47//! apr qa model.gguf --skip-ollama             # Skip Ollama comparison
48//! apr qa model.gguf --skip-gpu-speedup        # Skip GPU vs CPU test
49//! apr qa model.gguf --skip-format-parity      # Skip cross-format test
50//! apr qa model.gguf --safetensors-path m.st   # Compare with SafeTensors model
51//! apr qa model.gguf --json                    # JSON output for CI
52//! ```
53//!
54//! # Exit Codes
55//!
56//! - 0: All gates passed
57//! - 5: One or more gates failed (ValidationFailed)
58//!
59//! Toyota Way: Jidoka - Stop and fix quality issues immediately.
60//! Scientific Method: Claims must be falsifiable to have meaning.
61
62use crate::error::{CliError, Result};
63use crate::output;
64use colored::Colorize;
65use serde::{Deserialize, Serialize};
66use std::path::Path;
67use std::time::{Duration, Instant};
68
/// QA configuration: thresholds, skip switches and run parameters for one QA run.
///
/// The documented defaults below are the values produced by [`QaConfig::default`].
#[derive(Debug, Clone)]
pub struct QaConfig {
    /// Minimum throughput in tok/s (default: 100.0, the GPU target).
    ///
    /// NOTE(review): earlier docs also mentioned a 10 tok/s CPU default; nothing in
    /// this definition applies it — confirm where (or whether) a CPU floor is set.
    pub min_tps: f64,
    /// Minimum speedup vs Ollama (default: 0.2x). Ollama uses llama.cpp optimized
    /// kernels, so 0.2x is treated as the realistic floor.
    pub min_speedup: f64,
    /// Minimum GPU vs CPU speedup (default: 2.0x) - F-PERF-042
    pub min_gpu_speedup: f64,
    /// Skip golden output test
    pub skip_golden: bool,
    /// Skip throughput test
    pub skip_throughput: bool,
    /// Skip Ollama parity test
    pub skip_ollama: bool,
    /// Skip GPU vs CPU speedup test (F-PERF-042)
    pub skip_gpu_speedup: bool,
    /// Skip tensor contract validation (PMAT-235)
    pub skip_contract: bool,
    /// Skip cross-format parity test (F-QUAL-032)
    pub skip_format_parity: bool,
    /// Skip PTX parity validation (GH-219, F-PTX-001)
    pub skip_ptx_parity: bool,
    /// SafeTensors model path for cross-format parity (F-QUAL-032); default: `None`
    pub safetensors_path: Option<std::path::PathBuf>,
    /// Number of benchmark iterations (default: 10)
    pub iterations: usize,
    /// Number of warmup iterations (default: 3)
    pub warmup: usize,
    /// Max tokens for generation (default: 32)
    pub max_tokens: usize,
    /// Output as JSON
    pub json: bool,
    /// Verbose output
    pub verbose: bool,
    /// Minimum number of gates that must execute (not be skipped); default: `None`
    pub min_executed: Option<usize>,
    /// Path to previous QA report for regression comparison; default: `None`
    pub previous_report: Option<std::path::PathBuf>,
    /// Maximum allowed performance regression as a fraction (default: 0.10 = 10%)
    pub regression_threshold: f64,
    /// Skip GPU state isolation test
    pub skip_gpu_state: bool,
    /// Skip metadata plausibility validation (Bug 210, GH-222)
    pub skip_metadata: bool,
    /// Skip GPU capability match gate (GH-280)
    pub skip_capability: bool,
}

impl Default for QaConfig {
    /// Build the baseline configuration: every gate enabled, plain (non-JSON,
    /// non-verbose) output, and the thresholds documented on each field.
    fn default() -> Self {
        Self {
            min_tps: 100.0,       // GPU target
            min_speedup: 0.2,     // Ollama uses llama.cpp optimized kernels; 0.2x is realistic floor
            min_gpu_speedup: 2.0, // GPU must be 2x faster than CPU (F-PERF-042)
            skip_golden: false,
            skip_throughput: false,
            skip_ollama: false,
            skip_gpu_speedup: false,
            skip_contract: false,
            skip_format_parity: false,
            skip_ptx_parity: false,
            safetensors_path: None,
            iterations: 10,
            warmup: 3,
            max_tokens: 32,
            json: false,
            verbose: false,
            min_executed: None,
            previous_report: None,
            regression_threshold: 0.10,
            skip_gpu_state: false,
            skip_metadata: false,
            skip_capability: false,
        }
    }
}
146
147/// Result of a single QA gate
148#[derive(Debug, Clone, Serialize, Deserialize)]
149pub struct GateResult {
150    /// Gate name
151    pub name: String,
152    /// Whether the gate passed
153    pub passed: bool,
154    /// Human-readable result message
155    pub message: String,
156    /// Measured value (if applicable)
157    #[serde(skip_serializing_if = "Option::is_none")]
158    pub value: Option<f64>,
159    /// Expected/threshold value (if applicable)
160    #[serde(skip_serializing_if = "Option::is_none")]
161    pub threshold: Option<f64>,
162    /// Time taken to run the gate
163    pub duration_ms: u64,
164    /// Whether the gate was skipped
165    pub skipped: bool,
166}
167
168impl GateResult {
169    pub(crate) fn passed(
170        name: &str,
171        message: &str,
172        value: Option<f64>,
173        threshold: Option<f64>,
174        duration: Duration,
175    ) -> Self {
176        Self {
177            name: name.to_string(),
178            passed: true,
179            message: message.to_string(),
180            value,
181            threshold,
182            duration_ms: duration.as_millis() as u64,
183            skipped: false,
184        }
185    }
186
187    pub(crate) fn failed(
188        name: &str,
189        message: &str,
190        value: Option<f64>,
191        threshold: Option<f64>,
192        duration: Duration,
193    ) -> Self {
194        Self {
195            name: name.to_string(),
196            passed: false,
197            message: message.to_string(),
198            value,
199            threshold,
200            duration_ms: duration.as_millis() as u64,
201            skipped: false,
202        }
203    }
204
205    fn skipped(name: &str, reason: &str) -> Self {
206        Self {
207            name: name.to_string(),
208            passed: true, // Skipped gates don't fail
209            message: format!("Skipped: {reason}"),
210            value: None,
211            threshold: None,
212            duration_ms: 0,
213            skipped: true,
214        }
215    }
216}
217
218/// System information captured during QA run
219#[derive(Debug, Clone, Serialize, Deserialize)]
220pub struct SystemInfo {
221    /// CPU model name
222    pub cpu_model: String,
223    /// GPU model name (if available)
224    #[serde(skip_serializing_if = "Option::is_none")]
225    pub gpu_model: Option<String>,
226    /// GPU driver version (if available)
227    #[serde(skip_serializing_if = "Option::is_none")]
228    pub gpu_driver: Option<String>,
229}
230
231impl SystemInfo {
232    fn capture() -> Self {
233        let cpu_model = std::fs::read_to_string("/proc/cpuinfo")
234            .ok()
235            .and_then(|s| {
236                s.lines()
237                    .find(|l| l.starts_with("model name"))
238                    .and_then(|l| l.split(':').nth(1))
239                    .map(|s| s.trim().to_string())
240            })
241            .unwrap_or_else(|| "unknown".to_string());
242
243        let (gpu_model, gpu_driver) = Self::detect_gpu();
244
245        Self {
246            cpu_model,
247            gpu_model,
248            gpu_driver,
249        }
250    }
251
252    fn detect_gpu() -> (Option<String>, Option<String>) {
253        let output = std::process::Command::new("nvidia-smi")
254            .args(["--query-gpu=name,driver_version", "--format=csv,noheader"])
255            .output()
256            .ok();
257        if let Some(out) = output {
258            if out.status.success() {
259                let text = String::from_utf8_lossy(&out.stdout);
260                let parts: Vec<&str> = text.trim().splitn(2, ',').collect();
261                return (
262                    parts.first().map(|s| s.trim().to_string()),
263                    parts.get(1).map(|s| s.trim().to_string()),
264                );
265            }
266        }
267        (None, None)
268    }
269}
270
271/// Full QA report: overall verdict, per-gate results and run metadata; (de)serializable via serde
272#[derive(Debug, Clone, Serialize, Deserialize)]
273pub struct QaReport {
274    /// Model path, stored as a string
275    pub model: String,
276    /// Whether all gates passed; `run` returns `ValidationFailed` when this is false
277    pub passed: bool,
278    /// Individual gate results
279    pub gates: Vec<GateResult>,
280    /// Number of gates that actually executed (not skipped); takes its `Default` (0) if absent on deserialize
281    #[serde(default)]
282    pub gates_executed: usize,
283    /// Number of gates that were skipped; takes its `Default` (0) if absent on deserialize
284    #[serde(default)]
285    pub gates_skipped: usize,
286    /// Total duration in milliseconds
287    pub total_duration_ms: u64,
288    /// Timestamp (ISO 8601)
289    pub timestamp: String,
290    /// Summary message; `run` uses it as the `ValidationFailed` payload when the report did not pass
291    pub summary: String,
292    /// System information; omitted from output when `None` and optional on input
293    #[serde(default, skip_serializing_if = "Option::is_none")]
294    pub system_info: Option<SystemInfo>,
295}
296
297/// Run the QA command
298#[allow(clippy::too_many_arguments)]
299pub fn run(
300    path: &Path,
301    min_tps: Option<f64>,
302    min_speedup: Option<f64>,
303    min_gpu_speedup: Option<f64>,
304    skip_golden: bool,
305    skip_throughput: bool,
306    skip_ollama: bool,
307    skip_gpu_speedup: bool,
308    skip_contract: bool,
309    skip_format_parity: bool,
310    skip_ptx_parity: bool,
311    safetensors_path: Option<std::path::PathBuf>,
312    iterations: usize,
313    warmup: usize,
314    max_tokens: usize,
315    json: bool,
316    verbose: bool,
317    min_executed: Option<usize>,
318    previous_report: Option<std::path::PathBuf>,
319    regression_threshold: Option<f64>,
320    skip_gpu_state: bool,
321    skip_metadata: bool,
322    skip_capability: bool,
323) -> Result<()> {
324    let config = QaConfig {
325        min_tps: min_tps.unwrap_or(100.0),
326        min_speedup: min_speedup.unwrap_or(0.2), // Ollama uses llama.cpp optimized kernels
327        min_gpu_speedup: min_gpu_speedup.unwrap_or(2.0), // GPU must be 2x faster (F-PERF-042)
328        skip_golden,
329        skip_throughput,
330        skip_ollama,
331        skip_gpu_speedup,
332        skip_contract,
333        skip_format_parity,
334        skip_ptx_parity,
335        safetensors_path,
336        iterations,
337        warmup,
338        max_tokens,
339        json,
340        verbose,
341        min_executed,
342        previous_report,
343        regression_threshold: regression_threshold.unwrap_or(0.10),
344        skip_gpu_state,
345        skip_metadata,
346        skip_capability,
347    };
348
349    let report = run_qa(path, &config)?;
350
351    if json {
352        println!(
353            "{}",
354            serde_json::to_string_pretty(&report).unwrap_or_default()
355        );
356    }
357
358    if !report.passed {
359        return Err(CliError::ValidationFailed(report.summary));
360    }
361
362    Ok(())
363}
364
365/// Dispatch a single QA gate: skip if flagged, otherwise run, then print and collect.
366fn dispatch_gate(
367    gates: &mut Vec<GateResult>,
368    json: bool,
369    skip: bool,
370    name: &str,
371    skip_reason: &str,
372    runner: impl FnOnce() -> Result<GateResult>,
373) -> Result<()> {
374    let result = if skip {
375        GateResult::skipped(name, skip_reason)
376    } else {
377        runner()?
378    };
379    if !json {
380        print_gate_result(&result);
381    }
382    gates.push(result);
383    Ok(())
384}
385
/// Map a gate's internal identifier to its human-readable display label.
///
/// Identifiers without a known label are returned unchanged.
fn gate_display_name(name: &str) -> &str {
    const DISPLAY_NAMES: [(&str, &str); 11] = [
        ("capability_match", "Capability Match"),
        ("tensor_contract", "Tensor Contract"),
        ("golden_output", "Golden Output"),
        ("throughput", "Throughput"),
        ("ollama_parity", "Ollama Parity"),
        ("gpu_speedup", "GPU Speedup"),
        ("format_parity", "Format Parity"),
        ("ptx_parity", "PTX Parity"),
        ("gpu_state_isolation", "GPU State Isolation"),
        ("performance_regression", "Perf Regression"),
        ("metadata_plausibility", "Metadata Plausibility"),
    ];

    DISPLAY_NAMES
        .iter()
        .find(|(id, _)| *id == name)
        .map_or(name, |&(_, label)| label)
}
404
405/// Print the QA summary table and pass/fail badges.
406fn print_qa_summary(gates: &[GateResult], passed: bool, total_duration: Duration) {
407    output::header("QA Summary");
408
409    let gate_rows: Vec<Vec<String>> = gates
410        .iter()
411        .map(|g| {
412            let badge = if g.skipped {
413                output::badge_skip("SKIP")
414            } else if g.passed {
415                output::badge_pass("PASS")
416            } else {
417                output::badge_fail("FAIL")
418            };
419            let measured = g.value.map_or("—".to_string(), |v| format!("{v:.2}"));
420            let threshold = g.threshold.map_or("—".to_string(), |v| format!("{v:.2}"));
421            vec![
422                gate_display_name(&g.name).to_string(),
423                badge,
424                measured,
425                threshold,
426                output::duration_fmt(g.duration_ms),
427            ]
428        })
429        .collect();
430    println!(
431        "{}",
432        output::table(
433            &["Gate", "Status", "Measured", "Threshold", "Duration"],
434            &gate_rows,
435        )
436    );
437
438    println!();
439    if passed {
440        println!("  {}", output::badge_pass("ALL GATES PASSED"));
441    } else {
442        println!("  {}", output::badge_fail("GATES FAILED"));
443        for gate in gates.iter().filter(|g| !g.passed && !g.skipped) {
444            println!("    {} {}", "✗".red(), gate.name);
445        }
446    }
447    output::metric(
448        "Total Duration",
449        output::duration_fmt(total_duration.as_millis() as u64),
450        "",
451    );
452}
453
454include!("qa_gguf.rs");
455include!("output_verification.rs");
456include!("golden_output.rs");
457include!("speedup.rs");
458include!("forward_error.rs");
459include!("gpu_isolation_result.rs");
460include!("qa_08.rs");