use serde::{Deserialize, Serialize};
use std::process::Command;
use std::time::{Duration, Instant};
use tracing::{debug, info, warn};
use crate::evidence::Evidence;
/// Qualitative confidence level attached to checklist items and hypotheses.
///
/// With `rename_all = "UPPERCASE"` serde reads and writes the variants as
/// `HIGH`, `MEDIUM` and `LOW`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "UPPERCASE")]
pub enum Confidence {
/// Strongest of the three levels.
High,
/// Intermediate level.
Medium,
/// Weakest of the three levels.
Low,
}
/// Renders a [`Confidence`] as its upper-case label (`HIGH`, `MEDIUM`, `LOW`).
impl std::fmt::Display for Confidence {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Select the label first, then emit it with a single write.
        let label = match self {
            Self::High => "HIGH",
            Self::Medium => "MEDIUM",
            Self::Low => "LOW",
        };
        f.write_str(label)
    }
}
/// Outcome of a single falsification check.
///
/// Serialized by serde in adjacently tagged form: the variant name is stored
/// under the `status` key and the payload of [`CheckStatus::Falsified`] under
/// the `reason` key.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "status", content = "reason")]
pub enum CheckStatus {
/// The check has not been decided yet.
Pending,
/// The hypothesis was falsified; the string explains why.
Falsified(String),
/// The hypothesis was corroborated.
Corroborated,
}
/// Renders a [`CheckStatus`] as `PENDING`, `CORROBORATED`, or
/// `FALSIFIED: <reason>`.
impl std::fmt::Display for CheckStatus {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Pending => f.write_str("PENDING"),
            Self::Corroborated => f.write_str("CORROBORATED"),
            Self::Falsified(why) => {
                // Fixed prefix followed by the stored reason, verbatim.
                f.write_str("FALSIFIED: ")?;
                f.write_str(why)
            }
        }
    }
}
/// One entry of a falsification checklist: a hypothesis, how to test it, and
/// what observation would falsify it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FalsificationCheckItem {
/// Identifier of the check (e.g. `F-LAYOUT-002`).
pub gate_id: String,
/// Statement that the check tries to falsify.
pub hypothesis: String,
/// Human-readable description of how to run the check.
pub test_procedure: String,
/// Observation that would count as falsifying the hypothesis.
pub falsified_if: String,
/// Current outcome of the check.
pub status: CheckStatus,
/// Confidence level assigned to this item.
pub confidence: Confidence,
}
/// A candidate root cause together with the evidence for and against it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RankedHypothesis {
/// Short identifier of the hypothesis (e.g. `H1`).
pub id: String,
/// Human-readable description of the suspected cause.
pub description: String,
/// Confidence level assigned to this hypothesis.
pub confidence: Confidence,
/// Observations supporting the hypothesis.
pub evidence_for: Vec<String>,
/// Observations contradicting the hypothesis.
pub evidence_against: Vec<String>,
}
/// Pointer to a document or issue related to a failure.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrossReference {
/// Name of the referenced document or issue (e.g. a file name or issue id).
pub source: String,
/// Section or title within the source.
pub section: String,
/// Relevance score; references scoring below the enhancer's `min_relevance`
/// are dropped when cross-references are generated.
/// NOTE(review): values used in this file lie in 0.0..=1.0 — confirm the intended range.
pub relevance: f32,
}
/// Everything the enhancer attaches to a failure: checklist, hypotheses,
/// cross-references and suggested commands.
///
/// The derived `Default` is an empty context with `oracle_available == false`
/// and a latency of zero.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct OracleContext {
/// Falsification checks suggested for the failure.
pub checklist: Vec<FalsificationCheckItem>,
/// Candidate root causes.
pub hypotheses: Vec<RankedHypothesis>,
/// Related documents and issues.
pub cross_references: Vec<CrossReference>,
/// Shell commands suggested for further investigation.
pub investigation_commands: Vec<String>,
/// `true` when the oracle command ran successfully; `false` for the
/// default/fallback context.
pub oracle_available: bool,
/// Wall-clock duration of the oracle command in milliseconds (0 when the
/// oracle was not used).
pub query_latency_ms: u64,
}
/// Enriches failed evidence with an [`OracleContext`], using the external
/// `batuta` command when it works and static fallbacks otherwise.
pub struct OracleEnhancer {
// Time budget configured for an oracle query.
timeout: Duration,
// Cross-references with a relevance below this value are discarded.
min_relevance: f32,
}
/// Builds an enhancer from environment overrides, falling back to built-in
/// defaults.
impl Default for OracleEnhancer {
    /// Reads the configuration from the environment.
    ///
    /// * `APR_QA_ORACLE_TIMEOUT_MS` — query timeout in milliseconds
    ///   (default `30_000`).
    /// * `APR_QA_ORACLE_MIN_RELEVANCE` — minimum cross-reference relevance
    ///   (default `0.5`).
    ///
    /// A variable that is unset or does not parse is ignored. A relevance
    /// that parses but is not finite (`NaN`, `inf`) is ignored as well:
    /// such a threshold would make every `relevance >= min_relevance`
    /// comparison false and silently drop all cross-references.
    fn default() -> Self {
        // Returns the parsed value of `key`, or `None` when the variable is
        // unset, not valid Unicode, or not parseable as `T`.
        fn env_parsed<T: std::str::FromStr>(key: &str) -> Option<T> {
            std::env::var(key).ok()?.parse().ok()
        }

        let timeout_ms: u64 = env_parsed("APR_QA_ORACLE_TIMEOUT_MS").unwrap_or(30_000);
        let min_relevance = env_parsed::<f32>("APR_QA_ORACLE_MIN_RELEVANCE")
            .filter(|value| value.is_finite())
            .unwrap_or(0.5);

        Self {
            timeout: Duration::from_millis(timeout_ms),
            min_relevance,
        }
    }
}
impl OracleEnhancer {
#[must_use]
pub fn new() -> Self {
Self::default()
}
#[must_use]
pub fn with_timeout(mut self, timeout: Duration) -> Self {
self.timeout = timeout;
self
}
#[must_use]
pub fn with_min_relevance(mut self, min_relevance: f32) -> Self {
self.min_relevance = min_relevance;
self
}
/// Reports whether the `batuta` executable can be run.
///
/// Runs `batuta --version` and returns `true` only when the process could be
/// started and exited successfully.
#[must_use]
pub fn is_available() -> bool {
    let probe = Command::new("batuta").arg("--version").output();
    match probe {
        Ok(result) => result.status.success(),
        // The binary is missing or could not be executed.
        Err(_) => false,
    }
}
pub fn enhance_failure(&self, evidence: &Evidence) -> OracleContext {
if !evidence.outcome.is_fail() {
debug!("Skipping oracle enhancement for non-failure");
return OracleContext::default();
}
match self.query_oracle(evidence) {
Ok(context) => context,
Err(e) => {
warn!(error = %e, "Oracle unavailable, using fallback");
OracleContext {
oracle_available: false,
checklist: self.generate_static_checklist(evidence),
hypotheses: vec![],
cross_references: vec![],
investigation_commands: self.generate_static_commands(evidence),
query_latency_ms: 0,
}
}
}
}
/// Enhances every failing entry of `evidences`.
///
/// Returns one `(evidence id, context)` pair per failure, in input order;
/// non-failing entries are skipped.
#[must_use]
pub fn enhance_failures(&self, evidences: &[Evidence]) -> Vec<(String, OracleContext)> {
    let mut enhanced = Vec::new();
    for item in evidences {
        if item.outcome.is_fail() {
            enhanced.push((item.id.clone(), self.enhance_failure(item)));
        }
    }
    enhanced
}
fn query_oracle(&self, evidence: &Evidence) -> Result<OracleContext, OracleError> {
let start = Instant::now();
let query = self.build_query(evidence);
debug!(query = %query, "Querying batuta oracle");
let output = Command::new("batuta")
.args(["oracle", "--rag", &query])
.output()
.map_err(|e| OracleError::ExecutionFailed(e.to_string()))?;
let latency = start.elapsed().as_millis() as u64;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
return Err(OracleError::QueryFailed(stderr.to_string()));
}
let stdout = String::from_utf8_lossy(&output.stdout);
info!(latency_ms = latency, "Oracle query completed");
Ok(self.parse_oracle_output(&stdout, evidence, latency))
}
/// Builds the natural-language prompt sent to the oracle, embedding the
/// scenario format, the gate id and the failure reason of `evidence`.
fn build_query(&self, evidence: &Evidence) -> String {
    let format = &evidence.scenario.format;
    let gate = &evidence.gate_id;
    let reason = &evidence.reason;
    format!(
        "Generate Popperian falsification checklist for {format} failure. \
         Gate: {gate}. Reason: {reason}. \
         Check LAYOUT-002, tensor transpose, file extension handling, conversion fidelity."
    )
}
/// Assembles the [`OracleContext`] returned after a successful oracle run.
///
/// NOTE(review): `_output` (the oracle's stdout) is currently not inspected;
/// every part of the context is derived from `evidence` alone.
fn parse_oracle_output(
    &self,
    _output: &str,
    evidence: &Evidence,
    latency_ms: u64,
) -> OracleContext {
    OracleContext {
        checklist: self.generate_checklist_from_gate(evidence),
        hypotheses: self.generate_hypotheses_from_evidence(evidence),
        cross_references: self.generate_cross_references(evidence),
        investigation_commands: self.generate_investigation_commands(evidence),
        oracle_available: true,
        query_latency_ms: latency_ms,
    }
}
/// Builds the falsification checklist for `evidence` from its gate id and
/// failure reason.
///
/// Items are appended in this order when their condition holds:
///
/// 1. `F-LAYOUT-002` — gate id starts with `F-CONV`; already marked
///    falsified when the reason mentions `diff`.
/// 2. `F-PATH-EXT` — reason contains `No file extension`.
/// 3. `F-CONV-TRANSPOSE` — gate id contains both `CONV` and `G-A`.
/// 4. `F-CONV-INF-EQ` — gate id contains `INF`.
fn generate_checklist_from_gate(&self, evidence: &Evidence) -> Vec<FalsificationCheckItem> {
    let gate = &evidence.gate_id;
    let reason = &evidence.reason;
    let mut items = Vec::new();

    if gate.starts_with("F-CONV") {
        // A reported diff is taken as direct evidence against the layout hypothesis.
        let status = if reason.contains("diff") {
            CheckStatus::Falsified("High diff observed".to_string())
        } else {
            CheckStatus::Pending
        };
        items.push(FalsificationCheckItem {
            gate_id: "F-LAYOUT-002".to_string(),
            hypothesis: "All tensors are in row-major layout after conversion".to_string(),
            test_procedure: "Run inference on converted model, check for gibberish output"
                .to_string(),
            falsified_if: "Output contains garbage or diff > 1e-6".to_string(),
            status,
            confidence: Confidence::High,
        });
    }

    if reason.contains("No file extension") {
        items.push(FalsificationCheckItem {
            gate_id: "F-PATH-EXT".to_string(),
            hypothesis: "ConversionTest receives file path, not directory".to_string(),
            test_procedure: "assert!(path.extension().is_some()) before conversion".to_string(),
            falsified_if: "Invalid model format: No file extension found".to_string(),
            status: CheckStatus::Falsified("Error message confirms".to_string()),
            confidence: Confidence::High,
        });
    }

    if gate.contains("CONV") && gate.contains("G-A") {
        items.push(FalsificationCheckItem {
            gate_id: "F-CONV-TRANSPOSE".to_string(),
            hypothesis: "Q4K tensor transpose applied during GGUF→APR".to_string(),
            test_procedure: "Check transpose_q4k called in converter".to_string(),
            falsified_if: "Transpose not applied, causing layout mismatch".to_string(),
            status: CheckStatus::Pending,
            confidence: Confidence::Medium,
        });
    }

    if gate.contains("INF") {
        items.push(FalsificationCheckItem {
            gate_id: "F-CONV-INF-EQ".to_string(),
            hypothesis: "Inference output identical across formats".to_string(),
            test_procedure: "Compare token IDs from each format".to_string(),
            falsified_if: "Token IDs differ beyond numerical tolerance".to_string(),
            status: CheckStatus::Pending,
            confidence: Confidence::Medium,
        });
    }

    items
}
/// Derives candidate root causes from the failure reason and gate id.
///
/// Up to three hypotheses are returned, always in the order `H1`, `H2`,
/// `H3`; each is present only when its trigger matches.
fn generate_hypotheses_from_evidence(&self, evidence: &Evidence) -> Vec<RankedHypothesis> {
    let reason = &evidence.reason;

    // H1: the error text itself points at a path problem.
    let path_bug = reason.contains("No file extension").then(|| RankedHypothesis {
        id: "H1".to_string(),
        description: "Path resolution bug - directory passed instead of file".to_string(),
        confidence: Confidence::High,
        evidence_for: vec!["Error message confirms: 'No file extension found'".to_string()],
        evidence_against: Vec::new(),
    });

    // H2: a reported diff suggests a layout violation.
    let layout_violation = reason.contains("diff").then(|| RankedHypothesis {
        id: "H2".to_string(),
        description: "LAYOUT-002 violation - transpose not applied".to_string(),
        confidence: Confidence::Medium,
        evidence_for: vec!["58-90% diff across all conversions".to_string()],
        evidence_against: vec!["SafeTensors arithmetic tests pass".to_string()],
    });

    // H3: any conversion gate may involve a quantization mismatch.
    let quant_mismatch = evidence.gate_id.contains("CONV").then(|| RankedHypothesis {
        id: "H3".to_string(),
        description: "Quantization mismatch - Q4K block layout differs".to_string(),
        confidence: Confidence::Low,
        evidence_for: vec!["Conversion involves quantized formats".to_string()],
        evidence_against: Vec::new(),
    });

    path_bug
        .into_iter()
        .chain(layout_violation)
        .chain(quant_mismatch)
        .collect()
}
/// Collects documents and issues related to `evidence`.
///
/// The playbook spec reference is always a candidate; the `aprender` note is
/// added for gate ids containing `CONV`, and `GH-190` when the reason
/// mentions `garbage` or `diff`. Candidates whose relevance is below
/// `self.min_relevance` are then removed.
fn generate_cross_references(&self, evidence: &Evidence) -> Vec<CrossReference> {
    let reason = &evidence.reason;

    let mut refs = vec![CrossReference {
        source: "apr-playbook-spec.md".to_string(),
        section: "§4.1.1 LAYOUT-002".to_string(),
        relevance: 0.95,
    }];

    if evidence.gate_id.contains("CONV") {
        refs.push(CrossReference {
            source: "aprender/CLAUDE.md".to_string(),
            section: "LAYOUT-002".to_string(),
            relevance: 0.92,
        });
    }

    if reason.contains("garbage") || reason.contains("diff") {
        refs.push(CrossReference {
            source: "GH-190".to_string(),
            section: "GGUF→APR Garbage Output".to_string(),
            relevance: 0.88,
        });
    }

    // Keep only references at or above the configured threshold.
    refs.retain(|r| r.relevance >= self.min_relevance);
    refs
}
/// Suggests shell commands for investigating `evidence`.
///
/// The `apr rosetta` verification command is always included. Gate ids
/// containing `CONV` put two inspection commands in front of it, and gate
/// ids containing `G-A` append an `apr convert` command after it.
fn generate_investigation_commands(&self, evidence: &Evidence) -> Vec<String> {
    let gate = &evidence.gate_id;
    let mut cmds: Vec<String> = Vec::new();

    if gate.contains("CONV") {
        cmds.push(String::from(
            "apr inspect ~/.cache/apr-models/MODEL/apr/model.apr | grep layout",
        ));
        cmds.push(String::from(
            "grep -n 'transpose_q4k' ../aprender/src/format/converter/*.rs",
        ));
    }

    cmds.push(String::from(
        "apr rosetta MODEL.gguf -o /tmp/test.safetensors --verify",
    ));

    if gate.contains("G-A") {
        cmds.push(String::from("apr convert MODEL.gguf --to apr --verify"));
    }

    cmds
}
/// Builds the fallback checklist used when the oracle query fails.
///
/// Contains `F-LAYOUT-002` when the gate id starts with `F-CONV`, followed by
/// `F-PATH-EXT` when the reason contains `extension`. Both items are left
/// pending.
fn generate_static_checklist(&self, evidence: &Evidence) -> Vec<FalsificationCheckItem> {
    let layout_item = evidence
        .gate_id
        .starts_with("F-CONV")
        .then(|| FalsificationCheckItem {
            gate_id: "F-LAYOUT-002".to_string(),
            hypothesis: "Tensors in row-major layout".to_string(),
            test_procedure: "Check APR header layout flag".to_string(),
            falsified_if: "Garbage output or high diff".to_string(),
            status: CheckStatus::Pending,
            confidence: Confidence::Medium,
        });

    let extension_item = evidence
        .reason
        .contains("extension")
        .then(|| FalsificationCheckItem {
            gate_id: "F-PATH-EXT".to_string(),
            hypothesis: "File path has valid extension".to_string(),
            test_procedure: "Check path.extension().is_some()".to_string(),
            falsified_if: "No file extension found".to_string(),
            status: CheckStatus::Pending,
            confidence: Confidence::High,
        });

    layout_item.into_iter().chain(extension_item).collect()
}
/// Builds the fallback investigation commands used when the oracle query
/// fails: a comment line plus an `apr inspect` command for gate ids
/// containing `CONV`, nothing otherwise.
fn generate_static_commands(&self, evidence: &Evidence) -> Vec<String> {
    if !evidence.gate_id.contains("CONV") {
        return Vec::new();
    }
    vec![
        String::from("# Check layout flag"),
        String::from("apr inspect MODEL.apr | grep layout"),
    ]
}
}
/// Reasons an oracle query can fail.
#[derive(Debug)]
pub enum OracleError {
/// The `batuta` process could not be executed; the string carries the
/// underlying error text.
ExecutionFailed(String),
/// The `batuta` process ran but exited unsuccessfully; the string carries
/// its stderr output.
QueryFailed(String),
/// The oracle query did not finish in time.
Timeout,
}
/// Renders an [`OracleError`] as a one-line, human-readable message.
impl std::fmt::Display for OracleError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Variants with a payload share the "prefix + detail" shape.
        let (prefix, detail) = match self {
            Self::ExecutionFailed(detail) => ("Failed to execute batuta: ", detail),
            Self::QueryFailed(detail) => ("Oracle query failed: ", detail),
            Self::Timeout => return f.write_str("Oracle query timed out"),
        };
        f.write_str(prefix)?;
        f.write_str(detail)
    }
}
// Marks `OracleError` as a standard error type; the derived `Debug` and the
// `Display` impl provide everything the trait requires, so the body is empty.
impl std::error::Error for OracleError {}
/// Renders a falsification report for one model as Markdown.
///
/// The report consists of a header (model id, generation timestamp, MQS
/// score and grade, failure count), the checklist items, and — only when
/// non-empty — the root-cause hypotheses, the investigation commands (as a
/// `bash` code fence) and the cross-references.
///
/// * `model_id` — identifier shown in the title.
/// * `mqs_score` / `grade` — printed as `<score>/1000 (Grade <grade>)`.
/// * `total_scenarios` / `failed_scenarios` — printed as `<failed>/<total>`.
/// * `context` — the oracle context whose contents are rendered.
///
/// The timestamp is the current UTC time, so the output differs between
/// calls.
#[must_use]
pub fn generate_checklist_markdown(
    model_id: &str,
    mqs_score: u32,
    grade: &str,
    total_scenarios: usize,
    failed_scenarios: usize,
    context: &OracleContext,
) -> String {
    use std::fmt::Write;

    // Writing into a `String` cannot fail, so the `fmt::Result`s are discarded.
    let mut md = String::new();

    // Header.
    let _ = writeln!(md, "# Falsification Checklist: {model_id}\n");
    let _ = writeln!(md, "**Generated:** {}", chrono::Utc::now().to_rfc3339());
    let _ = writeln!(md, "**MQS Score:** {mqs_score}/1000 (Grade {grade})");
    let _ = writeln!(
        md,
        "**Failures:** {failed_scenarios}/{total_scenarios} scenarios\n"
    );
    md.push_str("---\n\n");

    // Checklist (the heading is emitted even when there are no items).
    md.push_str("## Checklist Items\n\n");
    for item in &context.checklist {
        let _ = writeln!(md, "- [ ] **{}**: {}", item.gate_id, item.hypothesis);
        let _ = writeln!(md, " - *Test:* {}", item.test_procedure);
        let _ = writeln!(md, " - *Falsified if:* {}", item.falsified_if);
        let _ = writeln!(md, " - *Status:* {}", item.status);
        let _ = writeln!(md, " - *Confidence:* {}\n", item.confidence);
    }

    // Hypotheses.
    if !context.hypotheses.is_empty() {
        md.push_str("## Root Cause Hypotheses\n\n");
        for h in &context.hypotheses {
            let _ = writeln!(md, "### {}: {} ({})\n", h.id, h.description, h.confidence);
            if !h.evidence_for.is_empty() {
                md.push_str("**Evidence For:**\n");
                for e in &h.evidence_for {
                    let _ = writeln!(md, "- {e}");
                }
                md.push('\n');
            }
            if !h.evidence_against.is_empty() {
                md.push_str("**Evidence Against:**\n");
                for e in &h.evidence_against {
                    let _ = writeln!(md, "- {e}");
                }
                md.push('\n');
            }
        }
    }

    // Investigation commands, one per line inside a bash fence.
    if !context.investigation_commands.is_empty() {
        md.push_str("## Investigation Commands\n\n");
        md.push_str("```bash\n");
        for cmd in &context.investigation_commands {
            let _ = writeln!(md, "{cmd}");
        }
        md.push_str("```\n\n");
    }

    // Cross-references.
    if !context.cross_references.is_empty() {
        md.push_str("## Cross-References\n\n");
        for r in &context.cross_references {
            let _ = writeln!(
                md,
                "- `{}` § {} (relevance: {:.2})",
                r.source, r.section, r.relevance
            );
        }
        md.push('\n');
    }

    md.push_str("---\n\n");
    md.push_str("*Generated by apr-qa with --oracle-enhance*\n");
    md
}
#[cfg(test)]
mod tests {
    use super::*;
    use apr_qa_gen::{Backend, Format, Modality, ModelId, QaScenario};

    /// Builds the minimal CPU/APR scenario shared by all tests.
    fn make_test_scenario() -> QaScenario {
        let model = ModelId {
            org: "test".to_string(),
            name: "model".to_string(),
            variant: None,
        };
        QaScenario {
            id: "test_scenario".to_string(),
            model,
            modality: Modality::Run,
            backend: Backend::Cpu,
            format: Format::Apr,
            prompt: "test".to_string(),
            temperature: 0.0,
            max_tokens: 32,
            seed: 0,
            trace_level: apr_qa_gen::TraceLevel::None,
            oracle_type: "garbage".to_string(),
        }
    }

    /// Shorthand for falsified evidence on the shared test scenario.
    fn falsified(gate_id: &'static str, reason: &'static str) -> Evidence {
        Evidence::falsified(gate_id, make_test_scenario(), reason, "output", 1000)
    }

    #[test]
    fn test_oracle_enhancer_default() {
        let subject = OracleEnhancer::new();
        assert_eq!(subject.timeout, Duration::from_millis(30_000));
        assert!((subject.min_relevance - 0.5).abs() < f32::EPSILON);
    }

    #[test]
    fn test_generate_static_checklist_for_conv_failure() {
        let failure = falsified("F-CONV-G-A", "Conversion diff: 7.61e-1");
        let items = OracleEnhancer::new().generate_static_checklist(&failure);
        let first = items.first().expect("checklist must not be empty");
        assert_eq!(first.gate_id, "F-LAYOUT-002");
    }

    #[test]
    fn test_generate_static_checklist_for_path_failure() {
        let failure = falsified("F-CONV-RT-001", "No file extension found");
        let items = OracleEnhancer::new().generate_static_checklist(&failure);
        let has_path_check = items.iter().any(|item| item.gate_id == "F-PATH-EXT");
        assert!(has_path_check);
    }

    #[test]
    fn test_check_status_display() {
        assert_eq!(CheckStatus::Pending.to_string(), "PENDING");
        assert_eq!(
            CheckStatus::Falsified("reason".to_string()).to_string(),
            "FALSIFIED: reason"
        );
        assert_eq!(CheckStatus::Corroborated.to_string(), "CORROBORATED");
    }

    #[test]
    fn test_confidence_display() {
        assert_eq!(Confidence::High.to_string(), "HIGH");
        assert_eq!(Confidence::Medium.to_string(), "MEDIUM");
        assert_eq!(Confidence::Low.to_string(), "LOW");
    }

    #[test]
    fn test_generate_checklist_markdown() {
        let check = FalsificationCheckItem {
            gate_id: "F-LAYOUT-002".to_string(),
            hypothesis: "Row-major layout".to_string(),
            test_procedure: "Check layout flag".to_string(),
            falsified_if: "Garbage output".to_string(),
            status: CheckStatus::Falsified("High diff".to_string()),
            confidence: Confidence::High,
        };
        let hypothesis = RankedHypothesis {
            id: "H1".to_string(),
            description: "Layout bug".to_string(),
            confidence: Confidence::High,
            evidence_for: vec!["High diff".to_string()],
            evidence_against: vec![],
        };
        let reference = CrossReference {
            source: "spec.md".to_string(),
            section: "LAYOUT-002".to_string(),
            relevance: 0.95,
        };
        let context = OracleContext {
            oracle_available: true,
            checklist: vec![check],
            hypotheses: vec![hypothesis],
            cross_references: vec![reference],
            investigation_commands: vec!["apr inspect model.apr".to_string()],
            query_latency_ms: 1000,
        };

        let report = generate_checklist_markdown("test-model", 320, "F", 24, 13, &context);

        let expected_fragments = [
            "# Falsification Checklist: test-model",
            "F-LAYOUT-002",
            "Row-major layout",
            "H1",
            "apr inspect",
        ];
        for fragment in expected_fragments.iter() {
            assert!(report.contains(fragment));
        }
    }

    #[test]
    fn test_enhance_failure_non_failure() {
        let passing = Evidence::corroborated("F-TEST-001", make_test_scenario(), "output", 1000);
        let context = OracleEnhancer::new().enhance_failure(&passing);
        assert!(!context.oracle_available);
        assert!(context.checklist.is_empty());
    }

    #[test]
    fn test_generate_hypotheses() {
        let failure = falsified("F-CONV-G-A", "No file extension found");
        let ranked = OracleEnhancer::new().generate_hypotheses_from_evidence(&failure);
        assert!(!ranked.is_empty());
        assert!(ranked.iter().any(|candidate| candidate.id == "H1"));
    }

    #[test]
    fn test_generate_cross_references() {
        let failure = falsified("F-CONV-G-A", "Conversion diff: 7.61e-1");
        let references = OracleEnhancer::new().generate_cross_references(&failure);
        assert!(!references.is_empty());
        assert!(references
            .iter()
            .any(|reference| reference.source.contains("spec")));
    }

    #[test]
    fn test_generate_investigation_commands() {
        let failure = falsified("F-CONV-G-A", "Conversion failed");
        let suggested = OracleEnhancer::new().generate_investigation_commands(&failure);
        assert!(!suggested.is_empty());
        assert!(suggested.iter().any(|command| command.contains("apr")));
    }
}