Skip to main content

apr_cli/commands/
qa.rs

1//! QA Command Implementation - Falsifiable Quality Assurance Checklist
2//!
3//! Implements a scientific QA process for model releases. Every claim must be
4//! falsifiable - if a test can't fail, it doesn't provide information.
5//!
6//! # Gates
7//!
8//! 1. **Golden Output Test** (Correctness Gate)
9//!    - Run model with known prompts, verify expected patterns in output
10//!    - Falsifiable: Output must match expected pattern or test fails
11//!
12//! 2. **Throughput Falsification** (Performance Gate)
13//!    - Run benchmark with statistical rigor (CV < 5%)
14//!    - Assert minimum tok/s threshold
15//!    - Falsifiable: If tok/s < threshold, test fails
16//!
17//! 3. **Ollama Parity Test** (Parity Gate)
18//!    - Compare against Ollama baseline (if available)
19//!    - Assert speedup factor >= target
20//!    - Falsifiable: If speedup < target, test fails
21//!
22//! 4. **GPU vs CPU Speedup Test** (F-PERF-042)
23//!    - Measure throughput on both GPU and CPU
24//!    - Assert GPU >= 2x CPU (default threshold)
25//!    - Falsifiable: If GPU speedup < threshold, test fails
26//!    - Toyota Way: Genchi Genbutsu - measure real performance
27//!
28//! 5. **Cross-Format Parity Test** (F-QUAL-032)
29//!    - Compare argmax between GGUF and SafeTensors for same model
30//!    - Invariant: argmax(forward_gguf) == argmax(forward_safetensors)
31//!    - Falsifiable: If argmax differs, cross-format parity is BROKEN
32//!    - Cornerstone of architecture's logical validity
33//!
34//! 6. **PTX Parity Test** (GH-219, F-PTX-001)
35//!    - Validate batched GPU kernels maintain structural parity with single-vector references
36//!    - Checks: batch dispatch mechanism, u64 shared memory addressing, dispatch strategy
37//!    - Falsifiable: If any of 6 kernel pairs fails structural validation, test fails
38//!    - Toyota Way: Poka-Yoke - error-proof PTX generation at compile time
39//!
40//! # Usage
41//!
42//! ```bash
43//! apr qa model.gguf                           # Run all gates
44//! apr qa model.gguf --assert-tps 100          # Custom throughput threshold
45//! apr qa model.gguf --assert-speedup 2.0      # Custom Ollama speedup
46//! apr qa model.gguf --assert-gpu-speedup 3.0  # Custom GPU vs CPU speedup
47//! apr qa model.gguf --skip-ollama             # Skip Ollama comparison
48//! apr qa model.gguf --skip-gpu-speedup        # Skip GPU vs CPU test
49//! apr qa model.gguf --skip-format-parity      # Skip cross-format test
50//! apr qa model.gguf --safetensors-path m.st   # Compare with SafeTensors model
51//! apr qa model.gguf --json                    # JSON output for CI
52//! ```
53//!
54//! # Exit Codes
55//!
56//! - 0: All gates passed
57//! - 5: One or more gates failed (ValidationFailed)
58//!
59//! Toyota Way: Jidoka - Stop and fix quality issues immediately.
60//! Scientific Method: Claims must be falsifiable to have meaning.
61
62use crate::error::{CliError, Result};
63use crate::output;
64use colored::Colorize;
65use serde::{Deserialize, Serialize};
66use std::path::Path;
67use std::time::{Duration, Instant};
68
/// QA configuration
///
/// Bundles the thresholds, per-gate skip switches, benchmark sizing and output
/// options for a single QA run. `Default` supplies the baseline values and
/// `run` builds an instance from its arguments.
#[derive(Debug, Clone)]
pub struct QaConfig {
    /// Minimum throughput in tok/s (default: 100, the GPU target).
    /// NOTE(review): a lower CPU default (10) was documented here, but neither
    /// `Default` nor `run` sets it — confirm where (or whether) it is applied.
    pub min_tps: f64,
    /// Minimum speedup vs Ollama (default: 0.2x — see `Default` and `run`)
    pub min_speedup: f64,
    /// Minimum GPU vs CPU speedup (default: 2.0x) - F-PERF-042
    pub min_gpu_speedup: f64,
    /// Skip golden output test
    pub skip_golden: bool,
    /// Skip throughput test
    pub skip_throughput: bool,
    /// Skip Ollama parity test
    pub skip_ollama: bool,
    /// Skip GPU vs CPU speedup test (F-PERF-042)
    pub skip_gpu_speedup: bool,
    /// Skip tensor contract validation (PMAT-235)
    pub skip_contract: bool,
    /// Skip cross-format parity test (F-QUAL-032)
    pub skip_format_parity: bool,
    /// Skip PTX parity validation (GH-219, F-PTX-001)
    pub skip_ptx_parity: bool,
    /// SafeTensors model path for cross-format parity (F-QUAL-032)
    pub safetensors_path: Option<std::path::PathBuf>,
    /// Number of benchmark iterations (default: 10)
    pub iterations: usize,
    /// Number of warmup iterations (default: 3)
    pub warmup: usize,
    /// Max tokens for generation (default: 32)
    pub max_tokens: usize,
    /// Output as JSON
    pub json: bool,
    /// Verbose output
    pub verbose: bool,
    /// Minimum number of gates that must execute (not be skipped);
    /// `None` (the default) sets no minimum
    pub min_executed: Option<usize>,
    /// Path to previous QA report for regression comparison
    pub previous_report: Option<std::path::PathBuf>,
    /// Maximum allowed performance regression (0.10 = 10%, which is also the default)
    pub regression_threshold: f64,
    /// Skip GPU state isolation test
    pub skip_gpu_state: bool,
    /// Skip metadata plausibility validation (Bug 210, GH-222)
    pub skip_metadata: bool,
    /// Skip GPU capability match gate (GH-280)
    pub skip_capability: bool,
    /// Assert classifier head presence and shape (F-CLASS-004)
    pub assert_classifier_head: bool,
}
119
120impl Default for QaConfig {
121    fn default() -> Self {
122        Self {
123            min_tps: 100.0,       // GPU target
124            min_speedup: 0.2, // Ollama uses llama.cpp optimized kernels; 0.2x is realistic floor
125            min_gpu_speedup: 2.0, // GPU must be 2x faster than CPU (F-PERF-042)
126            skip_golden: false,
127            skip_throughput: false,
128            skip_ollama: false,
129            skip_gpu_speedup: false,
130            skip_contract: false,
131            skip_format_parity: false,
132            skip_ptx_parity: false,
133            safetensors_path: None,
134            iterations: 10,
135            warmup: 3,
136            max_tokens: 32,
137            json: false,
138            verbose: false,
139            min_executed: None,
140            previous_report: None,
141            regression_threshold: 0.10,
142            skip_gpu_state: false,
143            skip_metadata: false,
144            skip_capability: false,
145            assert_classifier_head: false,
146        }
147    }
148}
149
/// Result of a single QA gate
///
/// One entry is recorded per gate, whether it ran or was skipped. The struct
/// is serialized as part of the QA report, so field names and order are part
/// of the JSON output.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GateResult {
    /// Gate name (machine identifier such as `"throughput"`)
    pub name: String,
    /// Whether the gate passed. Skipped gates are also recorded as passed;
    /// check `skipped` to tell the two apart.
    pub passed: bool,
    /// Human-readable result message
    pub message: String,
    /// Measured value (if applicable); omitted from serialized output when `None`
    #[serde(skip_serializing_if = "Option::is_none")]
    pub value: Option<f64>,
    /// Expected/threshold value (if applicable); omitted from serialized output when `None`
    #[serde(skip_serializing_if = "Option::is_none")]
    pub threshold: Option<f64>,
    /// Time taken to run the gate, in milliseconds (0 for skipped gates)
    pub duration_ms: u64,
    /// Whether the gate was skipped
    pub skipped: bool,
}
170
171impl GateResult {
172    pub(crate) fn passed(
173        name: &str,
174        message: &str,
175        value: Option<f64>,
176        threshold: Option<f64>,
177        duration: Duration,
178    ) -> Self {
179        Self {
180            name: name.to_string(),
181            passed: true,
182            message: message.to_string(),
183            value,
184            threshold,
185            duration_ms: duration.as_millis() as u64,
186            skipped: false,
187        }
188    }
189
190    pub(crate) fn failed(
191        name: &str,
192        message: &str,
193        value: Option<f64>,
194        threshold: Option<f64>,
195        duration: Duration,
196    ) -> Self {
197        Self {
198            name: name.to_string(),
199            passed: false,
200            message: message.to_string(),
201            value,
202            threshold,
203            duration_ms: duration.as_millis() as u64,
204            skipped: false,
205        }
206    }
207
208    fn skipped(name: &str, reason: &str) -> Self {
209        Self {
210            name: name.to_string(),
211            passed: true, // Skipped gates don't fail
212            message: format!("Skipped: {reason}"),
213            value: None,
214            threshold: None,
215            duration_ms: 0,
216            skipped: true,
217        }
218    }
219}
220
/// System information captured during QA run
///
/// Populated by `SystemInfo::capture`, which reads `/proc/cpuinfo` and queries
/// `nvidia-smi`. The GPU fields are omitted from serialized output when absent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemInfo {
    /// CPU model name (`"unknown"` when it cannot be read from `/proc/cpuinfo`)
    pub cpu_model: String,
    /// GPU model name (if available)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub gpu_model: Option<String>,
    /// GPU driver version (if available)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub gpu_driver: Option<String>,
}
233
234impl SystemInfo {
235    fn capture() -> Self {
236        let cpu_model = std::fs::read_to_string("/proc/cpuinfo")
237            .ok()
238            .and_then(|s| {
239                s.lines()
240                    .find(|l| l.starts_with("model name"))
241                    .and_then(|l| l.split(':').nth(1))
242                    .map(|s| s.trim().to_string())
243            })
244            .unwrap_or_else(|| "unknown".to_string());
245
246        let (gpu_model, gpu_driver) = Self::detect_gpu();
247
248        Self {
249            cpu_model,
250            gpu_model,
251            gpu_driver,
252        }
253    }
254
255    fn detect_gpu() -> (Option<String>, Option<String>) {
256        let output = std::process::Command::new("nvidia-smi")
257            .args(["--query-gpu=name,driver_version", "--format=csv,noheader"])
258            .output()
259            .ok();
260        if let Some(out) = output {
261            if out.status.success() {
262                let text = String::from_utf8_lossy(&out.stdout);
263                let parts: Vec<&str> = text.trim().splitn(2, ',').collect();
264                return (
265                    parts.first().map(|s| s.trim().to_string()),
266                    parts.get(1).map(|s| s.trim().to_string()),
267                );
268            }
269        }
270        (None, None)
271    }
272}
273
/// Full QA report
///
/// Aggregates every gate result for one model. `run` prints it as pretty JSON
/// when JSON output is requested and uses `passed` / `summary` to decide the
/// command's outcome. It also derives `Deserialize`, so a report can be read back.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QaReport {
    /// Model path
    pub model: String,
    /// Whether all gates passed
    pub passed: bool,
    /// Individual gate results
    pub gates: Vec<GateResult>,
    /// Number of gates that actually executed (not skipped);
    /// deserializes as 0 when the field is missing
    #[serde(default)]
    pub gates_executed: usize,
    /// Number of gates that were skipped;
    /// deserializes as 0 when the field is missing
    #[serde(default)]
    pub gates_skipped: usize,
    /// Total duration, in milliseconds
    pub total_duration_ms: u64,
    /// Timestamp (ISO 8601)
    pub timestamp: String,
    /// Summary message; carried in the `ValidationFailed` error when the run fails
    pub summary: String,
    /// System information; optional on input and omitted from output when `None`
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub system_info: Option<SystemInfo>,
}
299
300/// Run the QA command
301#[allow(clippy::too_many_arguments)]
302pub fn run(
303    path: &Path,
304    min_tps: Option<f64>,
305    min_speedup: Option<f64>,
306    min_gpu_speedup: Option<f64>,
307    skip_golden: bool,
308    skip_throughput: bool,
309    skip_ollama: bool,
310    skip_gpu_speedup: bool,
311    skip_contract: bool,
312    skip_format_parity: bool,
313    skip_ptx_parity: bool,
314    safetensors_path: Option<std::path::PathBuf>,
315    iterations: usize,
316    warmup: usize,
317    max_tokens: usize,
318    json: bool,
319    verbose: bool,
320    min_executed: Option<usize>,
321    previous_report: Option<std::path::PathBuf>,
322    regression_threshold: Option<f64>,
323    skip_gpu_state: bool,
324    skip_metadata: bool,
325    skip_capability: bool,
326    assert_classifier_head: bool,
327) -> Result<()> {
328    let config = QaConfig {
329        min_tps: min_tps.unwrap_or(100.0),
330        min_speedup: min_speedup.unwrap_or(0.2), // Ollama uses llama.cpp optimized kernels
331        min_gpu_speedup: min_gpu_speedup.unwrap_or(2.0), // GPU must be 2x faster (F-PERF-042)
332        skip_golden,
333        skip_throughput,
334        skip_ollama,
335        skip_gpu_speedup,
336        skip_contract,
337        skip_format_parity,
338        skip_ptx_parity,
339        safetensors_path,
340        iterations,
341        warmup,
342        max_tokens,
343        json,
344        verbose,
345        min_executed,
346        previous_report,
347        regression_threshold: regression_threshold.unwrap_or(0.10),
348        skip_gpu_state,
349        skip_metadata,
350        skip_capability,
351        assert_classifier_head,
352    };
353
354    let report = run_qa(path, &config)?;
355
356    if json {
357        println!(
358            "{}",
359            serde_json::to_string_pretty(&report).unwrap_or_default()
360        );
361    }
362
363    if !report.passed {
364        return Err(CliError::ValidationFailed(report.summary));
365    }
366
367    Ok(())
368}
369
370/// Dispatch a single QA gate: skip if flagged, otherwise run, then print and collect.
371fn dispatch_gate(
372    gates: &mut Vec<GateResult>,
373    json: bool,
374    skip: bool,
375    name: &str,
376    skip_reason: &str,
377    runner: impl FnOnce() -> Result<GateResult>,
378) -> Result<()> {
379    let result = if skip {
380        GateResult::skipped(name, skip_reason)
381    } else {
382        runner()?
383    };
384    if !json {
385        print_gate_result(&result);
386    }
387    gates.push(result);
388    Ok(())
389}
390
/// Human-readable gate name for display.
///
/// Translates a gate's machine identifier (e.g. `"ollama_parity"`) into the
/// title shown in the summary table. Identifiers without an entry are
/// returned unchanged.
fn gate_display_name(name: &str) -> &str {
    // (identifier, display title) pairs, looked up by exact match.
    const DISPLAY_NAMES: [(&str, &str); 12] = [
        ("capability_match", "Capability Match"),
        ("tensor_contract", "Tensor Contract"),
        ("golden_output", "Golden Output"),
        ("throughput", "Throughput"),
        ("ollama_parity", "Ollama Parity"),
        ("gpu_speedup", "GPU Speedup"),
        ("format_parity", "Format Parity"),
        ("ptx_parity", "PTX Parity"),
        ("gpu_state_isolation", "GPU State Isolation"),
        ("performance_regression", "Perf Regression"),
        ("metadata_plausibility", "Metadata Plausibility"),
        ("classifier_head", "Classifier Head"),
    ];

    DISPLAY_NAMES
        .iter()
        .find(|entry| entry.0 == name)
        .map_or(name, |entry| entry.1)
}
410
411/// Print the QA summary table and pass/fail badges.
412fn print_qa_summary(gates: &[GateResult], passed: bool, total_duration: Duration) {
413    output::header("QA Summary");
414
415    let gate_rows: Vec<Vec<String>> = gates
416        .iter()
417        .map(|g| {
418            let badge = if g.skipped {
419                output::badge_skip("SKIP")
420            } else if g.passed {
421                output::badge_pass("PASS")
422            } else {
423                output::badge_fail("FAIL")
424            };
425            let measured = g.value.map_or("—".to_string(), |v| format!("{v:.2}"));
426            let threshold = g.threshold.map_or("—".to_string(), |v| format!("{v:.2}"));
427            vec![
428                gate_display_name(&g.name).to_string(),
429                badge,
430                measured,
431                threshold,
432                output::duration_fmt(g.duration_ms),
433            ]
434        })
435        .collect();
436    println!(
437        "{}",
438        output::table(
439            &["Gate", "Status", "Measured", "Threshold", "Duration"],
440            &gate_rows,
441        )
442    );
443
444    println!();
445    if passed {
446        println!("  {}", output::badge_pass("ALL GATES PASSED"));
447    } else {
448        println!("  {}", output::badge_fail("GATES FAILED"));
449        for gate in gates.iter().filter(|g| !g.passed && !g.skipped) {
450            println!("    {} {}", "✗".red(), gate.name);
451        }
452    }
453    output::metric(
454        "Total Duration",
455        output::duration_fmt(total_duration.as_millis() as u64),
456        "",
457    );
458}
459
460include!("qa_gguf.rs");
461include!("output_verification.rs");
462include!("golden_output.rs");
463include!("speedup.rs");
464include!("forward_error.rs");
465include!("gpu_isolation_result.rs");
466include!("qa_08.rs");