// apr-qa-runner 0.1.0

//! Batuta Oracle Enhancement for failure analysis (v1.5.0)
//!
//! This module provides integration with `batuta oracle --rag` to enhance
//! failure reports with historical context, generate falsification checklists,
//! and enrich metrics.
//!
//! See spec §12.1.1 for full specification.

use serde::{Deserialize, Serialize};
use std::process::Command;
use std::time::{Duration, Instant};
use tracing::{debug, info, warn};

use crate::evidence::Evidence;

/// Confidence level for hypotheses and checklist items.
///
/// With `rename_all = "UPPERCASE"` serde reads and writes the variants as
/// `"HIGH"`, `"MEDIUM"` and `"LOW"`, which are the same labels the `Display`
/// implementation prints.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "UPPERCASE")]
pub enum Confidence {
    /// High confidence based on strong evidence
    High,
    /// Medium confidence, requires investigation
    Medium,
    /// Low confidence, speculative
    Low,
}

impl std::fmt::Display for Confidence {
    /// Writes the upper-case label of the level: `HIGH`, `MEDIUM` or `LOW`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::High => "HIGH",
            Self::Medium => "MEDIUM",
            Self::Low => "LOW",
        };
        f.write_str(label)
    }
}

/// Status of a falsification check item.
///
/// Serialized by serde as an adjacently tagged enum: the variant name is stored
/// under the `status` key and, for [`CheckStatus::Falsified`], the payload string
/// is stored under the `reason` key.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "status", content = "reason")]
pub enum CheckStatus {
    /// Not yet tested
    Pending,
    /// Evidence suggests hypothesis is false; the string is the reason
    Falsified(String),
    /// Hypothesis survived refutation attempt
    Corroborated,
}

impl std::fmt::Display for CheckStatus {
    /// Writes `PENDING`, `CORROBORATED`, or `FALSIFIED: <reason>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Falsified(why) => write!(f, "FALSIFIED: {why}"),
            Self::Pending => f.write_str("PENDING"),
            Self::Corroborated => f.write_str("CORROBORATED"),
        }
    }
}

/// A falsification checklist item attached to a failure.
///
/// One item describes a single hypothesis, how to test it, what result would
/// falsify it, and the current verdict. Items are rendered as the
/// "Checklist Items" section by `generate_checklist_markdown`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FalsificationCheckItem {
    /// Gate ID this check relates to (e.g., "F-LAYOUT-002")
    pub gate_id: String,

    /// Hypothesis to falsify
    pub hypothesis: String,

    /// Test procedure to falsify the hypothesis
    pub test_procedure: String,

    /// What outcome would falsify the hypothesis
    pub falsified_if: String,

    /// Current status based on evidence
    pub status: CheckStatus,

    /// Confidence level (HIGH/MEDIUM/LOW)
    pub confidence: Confidence,
}

/// A ranked hypothesis for root cause analysis.
///
/// Rendered under "Root Cause Hypotheses" by `generate_checklist_markdown`,
/// with the supporting and contradicting evidence as bullet lists.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RankedHypothesis {
    /// Hypothesis ID (e.g., "H1", "H2")
    pub id: String,

    /// Description of the hypothesis
    pub description: String,

    /// Confidence level
    pub confidence: Confidence,

    /// Evidence supporting this hypothesis (may be empty)
    pub evidence_for: Vec<String>,

    /// Evidence against this hypothesis (may be empty)
    pub evidence_against: Vec<String>,
}

/// Cross-reference to related documentation.
///
/// `OracleEnhancer::generate_cross_references` drops entries whose `relevance`
/// is below the enhancer's minimum relevance threshold.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrossReference {
    /// Source file or issue (e.g., "aprender/CLAUDE.md")
    pub source: String,

    /// Section within the source (e.g., "LAYOUT-002")
    pub section: String,

    /// Relevance score 0.0 - 1.0
    pub relevance: f32,
}

/// Oracle-generated context for a failure.
///
/// The derived `Default` value (empty lists, `oracle_available == false`,
/// zero latency) is what `OracleEnhancer::enhance_failure` returns for
/// evidence that is not a failure.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct OracleContext {
    /// Generated falsification checklist
    pub checklist: Vec<FalsificationCheckItem>,

    /// Ranked hypotheses for root cause
    pub hypotheses: Vec<RankedHypothesis>,

    /// Cross-references to related documentation
    pub cross_references: Vec<CrossReference>,

    /// Investigation commands to run
    pub investigation_commands: Vec<String>,

    /// Whether oracle was available (`false` when the fallback path was used)
    pub oracle_available: bool,

    /// Query latency in milliseconds (0 when the oracle was not queried successfully)
    pub query_latency_ms: u64,
}

/// Oracle enhancer for failure analysis.
///
/// Holds the configuration used when asking `batuta oracle --rag` for context
/// about a failed gate. Build one with [`OracleEnhancer::new`] (or `Default`)
/// and adjust it with the `with_*` builder methods.
///
/// The type is `Debug` and `Clone` so it can be logged and shared between
/// callers; it only contains plain configuration values.
#[derive(Debug, Clone)]
pub struct OracleEnhancer {
    /// Timeout for oracle queries
    timeout: Duration,

    /// Minimum relevance threshold for cross-references
    min_relevance: f32,
}

impl Default for OracleEnhancer {
    /// Builds the default configuration, honouring two optional environment
    /// variables:
    ///
    /// - `APR_QA_ORACLE_TIMEOUT_MS`: query timeout in milliseconds (default `30000`).
    /// - `APR_QA_ORACLE_MIN_RELEVANCE`: minimum cross-reference relevance (default `0.5`).
    ///
    /// Surrounding whitespace in a value is ignored. A value that is unset,
    /// unparsable, or (for the relevance) not a finite number falls back to
    /// the default.
    fn default() -> Self {
        let timeout_ms = std::env::var("APR_QA_ORACLE_TIMEOUT_MS")
            .ok()
            .and_then(|raw| raw.trim().parse::<u64>().ok())
            .unwrap_or(30_000);

        let min_relevance = std::env::var("APR_QA_ORACLE_MIN_RELEVANCE")
            .ok()
            .and_then(|raw| raw.trim().parse::<f32>().ok())
            // "NaN" and "inf" parse successfully as f32, but comparing a relevance
            // against them would silently discard every cross-reference.
            .filter(|value| value.is_finite())
            .unwrap_or(0.5);

        Self {
            timeout: Duration::from_millis(timeout_ms),
            min_relevance,
        }
    }
}

impl OracleEnhancer {
    /// Create a new oracle enhancer with default settings
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Create a new oracle enhancer with custom timeout
    #[must_use]
    pub fn with_timeout(mut self, timeout: Duration) -> Self {
        self.timeout = timeout;
        self
    }

    /// Create a new oracle enhancer with custom minimum relevance
    #[must_use]
    pub fn with_min_relevance(mut self, min_relevance: f32) -> Self {
        self.min_relevance = min_relevance;
        self
    }

    /// Check if batuta is available.
    ///
    /// Runs `batuta --version` and reports `true` only when the process could
    /// be started and exited successfully.
    #[must_use]
    pub fn is_available() -> bool {
        match Command::new("batuta").arg("--version").output() {
            Ok(probe) => probe.status.success(),
            // The binary could not be launched at all.
            Err(_) => false,
        }
    }

    /// Enhance a failure with oracle context.
    ///
    /// - Evidence that is not a failure yields `OracleContext::default()`.
    /// - When the oracle query succeeds, its context is returned as-is.
    /// - When the query fails, a warning is logged and a fallback context is
    ///   returned: `oracle_available` is `false`, and only the static
    ///   checklist and static investigation commands are filled in.
    pub fn enhance_failure(&self, evidence: &Evidence) -> OracleContext {
        if evidence.outcome.is_fail() {
            self.query_oracle(evidence).unwrap_or_else(|e| {
                warn!(error = %e, "Oracle unavailable, using fallback");
                // Everything not listed keeps its default: no hypotheses, no
                // cross-references, `oracle_available == false`, zero latency.
                OracleContext {
                    checklist: self.generate_static_checklist(evidence),
                    investigation_commands: self.generate_static_commands(evidence),
                    ..OracleContext::default()
                }
            })
        } else {
            debug!("Skipping oracle enhancement for non-failure");
            OracleContext::default()
        }
    }

    /// Enhance multiple failures.
    ///
    /// Returns one `(evidence id, context)` pair per failing evidence, in input
    /// order; evidence that is not a failure is skipped.
    #[must_use]
    pub fn enhance_failures(&self, evidences: &[Evidence]) -> Vec<(String, OracleContext)> {
        let mut enhanced = Vec::new();
        for evidence in evidences {
            if evidence.outcome.is_fail() {
                enhanced.push((evidence.id.clone(), self.enhance_failure(evidence)));
            }
        }
        enhanced
    }

    /// Query batuta oracle for context
    fn query_oracle(&self, evidence: &Evidence) -> Result<OracleContext, OracleError> {
        let start = Instant::now();

        // Build query from evidence
        let query = self.build_query(evidence);
        debug!(query = %query, "Querying batuta oracle");

        // Run batuta oracle --rag
        let output = Command::new("batuta")
            .args(["oracle", "--rag", &query])
            .output()
            .map_err(|e| OracleError::ExecutionFailed(e.to_string()))?;

        let latency = start.elapsed().as_millis() as u64;

        if !output.status.success() {
            let stderr = String::from_utf8_lossy(&output.stderr);
            return Err(OracleError::QueryFailed(stderr.to_string()));
        }

        let stdout = String::from_utf8_lossy(&output.stdout);
        info!(latency_ms = latency, "Oracle query completed");

        // Parse oracle output and generate context
        Ok(self.parse_oracle_output(&stdout, evidence, latency))
    }

    /// Build the natural-language query sent to the oracle.
    ///
    /// The query interpolates the scenario format, the gate ID and the failure
    /// reason from `evidence` into a fixed prompt.
    fn build_query(&self, evidence: &Evidence) -> String {
        let model_format = &evidence.scenario.format;
        let gate = &evidence.gate_id;
        let reason = &evidence.reason;
        format!(
            "Generate Popperian falsification checklist for {model_format} failure. \
             Gate: {gate}. Reason: {reason}. \
             Check LAYOUT-002, tensor transpose, file extension handling, conversion fidelity."
        )
    }

    /// Assemble the structured context returned after a successful oracle query.
    ///
    /// The oracle's textual output (`_output`) is currently unused: every part
    /// of the context is derived from `evidence` by the `generate_*` helpers.
    /// The result is marked `oracle_available: true` and carries `latency_ms`.
    fn parse_oracle_output(
        &self,
        _output: &str,
        evidence: &Evidence,
        latency_ms: u64,
    ) -> OracleContext {
        OracleContext {
            checklist: self.generate_checklist_from_gate(evidence),
            hypotheses: self.generate_hypotheses_from_evidence(evidence),
            cross_references: self.generate_cross_references(evidence),
            investigation_commands: self.generate_investigation_commands(evidence),
            oracle_available: true,
            query_latency_ms: latency_ms,
        }
    }

    /// Generate checklist based on gate type
    fn generate_checklist_from_gate(&self, evidence: &Evidence) -> Vec<FalsificationCheckItem> {
        let mut items = vec![];

        // LAYOUT-002 check for all conversion failures
        if evidence.gate_id.starts_with("F-CONV") {
            items.push(FalsificationCheckItem {
                gate_id: "F-LAYOUT-002".to_string(),
                hypothesis: "All tensors are in row-major layout after conversion".to_string(),
                test_procedure: "Run inference on converted model, check for gibberish output"
                    .to_string(),
                falsified_if: "Output contains garbage or diff > 1e-6".to_string(),
                status: if evidence.reason.contains("diff") {
                    CheckStatus::Falsified("High diff observed".to_string())
                } else {
                    CheckStatus::Pending
                },
                confidence: Confidence::High,
            });
        }

        // Path extension check for "No file extension" errors
        if evidence.reason.contains("No file extension") {
            items.push(FalsificationCheckItem {
                gate_id: "F-PATH-EXT".to_string(),
                hypothesis: "ConversionTest receives file path, not directory".to_string(),
                test_procedure: "assert!(path.extension().is_some()) before conversion".to_string(),
                falsified_if: "Invalid model format: No file extension found".to_string(),
                status: CheckStatus::Falsified("Error message confirms".to_string()),
                confidence: Confidence::High,
            });
        }

        // Transpose check for conversion gates
        if evidence.gate_id.contains("CONV") && evidence.gate_id.contains("G-A") {
            items.push(FalsificationCheckItem {
                gate_id: "F-CONV-TRANSPOSE".to_string(),
                hypothesis: "Q4K tensor transpose applied during GGUF→APR".to_string(),
                test_procedure: "Check transpose_q4k called in converter".to_string(),
                falsified_if: "Transpose not applied, causing layout mismatch".to_string(),
                status: CheckStatus::Pending,
                confidence: Confidence::Medium,
            });
        }

        // Inference equivalence check
        if evidence.gate_id.contains("INF") {
            items.push(FalsificationCheckItem {
                gate_id: "F-CONV-INF-EQ".to_string(),
                hypothesis: "Inference output identical across formats".to_string(),
                test_procedure: "Compare token IDs from each format".to_string(),
                falsified_if: "Token IDs differ beyond numerical tolerance".to_string(),
                status: CheckStatus::Pending,
                confidence: Confidence::Medium,
            });
        }

        items
    }

    /// Generate root-cause hypotheses from the failure's gate ID and reason.
    ///
    /// - `H1` (high) when the reason contains `No file extension`.
    /// - `H2` (medium) when the reason contains `diff`.
    /// - `H3` (low) when the gate ID contains `CONV`.
    ///
    /// Hypotheses are returned in that order; the IDs are fixed, so gaps are
    /// possible (e.g. only `H3`).
    fn generate_hypotheses_from_evidence(&self, evidence: &Evidence) -> Vec<RankedHypothesis> {
        let gate = &evidence.gate_id;
        let reason = &evidence.reason;
        let mut ranked = Vec::new();

        // Path resolution hypothesis
        ranked.extend(
            reason
                .contains("No file extension")
                .then(|| RankedHypothesis {
                    id: String::from("H1"),
                    description: String::from(
                        "Path resolution bug - directory passed instead of file",
                    ),
                    confidence: Confidence::High,
                    evidence_for: vec![String::from(
                        "Error message confirms: 'No file extension found'",
                    )],
                    evidence_against: Vec::new(),
                }),
        );

        // LAYOUT-002 hypothesis for high diffs
        ranked.extend(reason.contains("diff").then(|| RankedHypothesis {
            id: String::from("H2"),
            description: String::from("LAYOUT-002 violation - transpose not applied"),
            confidence: Confidence::Medium,
            evidence_for: vec![String::from("58-90% diff across all conversions")],
            evidence_against: vec![String::from("SafeTensors arithmetic tests pass")],
        }));

        // Quantization mismatch hypothesis
        ranked.extend(gate.contains("CONV").then(|| RankedHypothesis {
            id: String::from("H3"),
            description: String::from("Quantization mismatch - Q4K block layout differs"),
            confidence: Confidence::Low,
            evidence_for: vec![String::from("Conversion involves quantized formats")],
            evidence_against: Vec::new(),
        }));

        ranked
    }

    /// Generate cross-references for the failure.
    ///
    /// Candidates are the playbook spec (always), `aprender/CLAUDE.md` when the
    /// gate ID contains `CONV`, and `GH-190` when the reason mentions `garbage`
    /// or `diff`. Candidates whose relevance is below `self.min_relevance` are
    /// then removed.
    fn generate_cross_references(&self, evidence: &Evidence) -> Vec<CrossReference> {
        let gate = &evidence.gate_id;
        let reason = &evidence.reason;

        // Always reference the spec
        let mut candidates = vec![CrossReference {
            source: String::from("apr-playbook-spec.md"),
            section: String::from("§4.1.1 LAYOUT-002"),
            relevance: 0.95,
        }];

        // Reference aprender CLAUDE.md for conversion issues
        if gate.contains("CONV") {
            candidates.push(CrossReference {
                source: String::from("aprender/CLAUDE.md"),
                section: String::from("LAYOUT-002"),
                relevance: 0.92,
            });
        }

        // Reference GH-190 for garbage output
        let mentions_bad_output = reason.contains("garbage") || reason.contains("diff");
        if mentions_bad_output {
            candidates.push(CrossReference {
                source: String::from("GH-190"),
                section: String::from("GGUF→APR Garbage Output"),
                relevance: 0.88,
            });
        }

        // Filter by minimum relevance
        candidates.retain(|candidate| candidate.relevance >= self.min_relevance);
        candidates
    }

    /// Generate shell commands an engineer can run to investigate the failure.
    ///
    /// Conversion gates (ID contains `CONV`) get model-inspection and
    /// `transpose_q4k` grep commands first, the rosetta verification command is
    /// always included, and gates whose ID contains `G-A` additionally get the
    /// direct `apr convert` command.
    fn generate_investigation_commands(&self, evidence: &Evidence) -> Vec<String> {
        let gate = &evidence.gate_id;
        let mut script: Vec<String> = Vec::new();

        // Model inspection
        if gate.contains("CONV") {
            script.extend([
                String::from("apr inspect ~/.cache/apr-models/MODEL/apr/model.apr | grep layout"),
                String::from("grep -n 'transpose_q4k' ../aprender/src/format/converter/*.rs"),
            ]);
        }

        // Rosetta verification
        script.push(String::from(
            "apr rosetta MODEL.gguf -o /tmp/test.safetensors --verify",
        ));

        // Test the specific conversion
        if gate.contains("G-A") {
            script.push(String::from("apr convert MODEL.gguf --to apr --verify"));
        }

        script
    }

    /// Generate the reduced checklist used when the oracle query fails (fallback).
    ///
    /// Produces a pending `F-LAYOUT-002` item when the gate ID starts with
    /// `F-CONV`, followed by a pending `F-PATH-EXT` item when the reason
    /// contains `extension`.
    fn generate_static_checklist(&self, evidence: &Evidence) -> Vec<FalsificationCheckItem> {
        let mut fallback = Vec::new();

        // Always check LAYOUT-002 for conversion failures
        let is_conversion_gate = evidence.gate_id.starts_with("F-CONV");
        fallback.extend(is_conversion_gate.then(|| FalsificationCheckItem {
            gate_id: String::from("F-LAYOUT-002"),
            hypothesis: String::from("Tensors in row-major layout"),
            test_procedure: String::from("Check APR header layout flag"),
            falsified_if: String::from("Garbage output or high diff"),
            status: CheckStatus::Pending,
            confidence: Confidence::Medium,
        }));

        // Path extension check
        let mentions_extension = evidence.reason.contains("extension");
        fallback.extend(mentions_extension.then(|| FalsificationCheckItem {
            gate_id: String::from("F-PATH-EXT"),
            hypothesis: String::from("File path has valid extension"),
            test_procedure: String::from("Check path.extension().is_some()"),
            falsified_if: String::from("No file extension found"),
            status: CheckStatus::Pending,
            confidence: Confidence::High,
        }));

        fallback
    }

    /// Generate the investigation commands used when the oracle query fails (fallback).
    ///
    /// Returns a comment line plus an `apr inspect` command when the gate ID
    /// contains `CONV`, and an empty list otherwise.
    fn generate_static_commands(&self, evidence: &Evidence) -> Vec<String> {
        if !evidence.gate_id.contains("CONV") {
            return Vec::new();
        }

        vec![
            String::from("# Check layout flag"),
            String::from("apr inspect MODEL.apr | grep layout"),
        ]
    }
}

/// Error type for oracle operations.
///
/// All payloads are plain strings, so the type is `Clone` and comparable with
/// `==`, which lets callers and tests match on specific failures.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum OracleError {
    /// Failed to execute batuta; carries the underlying error text
    ExecutionFailed(String),
    /// Oracle query returned error; carries the text reported by the query
    QueryFailed(String),
    /// Timeout waiting for oracle
    Timeout,
}

impl std::fmt::Display for OracleError {
    /// Writes a one-line, human-readable description of the error, including
    /// the carried message for the variants that have one.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Timeout => f.write_str("Oracle query timed out"),
            Self::ExecutionFailed(cause) => write!(f, "Failed to execute batuta: {cause}"),
            Self::QueryFailed(details) => write!(f, "Oracle query failed: {details}"),
        }
    }
}

// No underlying `source` is exposed: the variants only carry message strings.
impl std::error::Error for OracleError {}

/// Render an oracle context as a Markdown falsification checklist.
///
/// The document consists of a header (model ID, generation timestamp from
/// `chrono::Utc::now()`, MQS score out of 1000 with grade, failed/total
/// scenario counts), the checklist items, and then the hypotheses,
/// investigation commands and cross-references sections. Each of those last
/// three sections is omitted when its list in `context` is empty. The
/// "Checklist Items" heading is always written.
///
/// Returns the Markdown text; nothing is written to disk here.
#[must_use]
pub fn generate_checklist_markdown(
    model_id: &str,
    mqs_score: u32,
    grade: &str,
    total_scenarios: usize,
    failed_scenarios: usize,
    context: &OracleContext,
) -> String {
    use std::fmt::Write;

    /// Appends `title`, one `- ` bullet per entry and a trailing blank line;
    /// writes nothing when `entries` is empty.
    fn push_evidence(out: &mut String, title: &str, entries: &[String]) {
        if entries.is_empty() {
            return;
        }
        out.push_str(title);
        out.push('\n');
        for entry in entries {
            out.push_str("- ");
            out.push_str(entry);
            out.push('\n');
        }
        out.push('\n');
    }

    // Header, separator and the checklist heading in one go.
    let generated_at = chrono::Utc::now().to_rfc3339();
    let mut md = format!(
        "# Falsification Checklist: {model_id}\n\n\
         **Generated:** {generated_at}\n\
         **MQS Score:** {mqs_score}/1000 (Grade {grade})\n\
         **Failures:** {failed_scenarios}/{total_scenarios} scenarios\n\n\
         ---\n\n\
         ## Checklist Items\n\n"
    );

    // Checklist items. Formatting into a `String` cannot fail, so the
    // `fmt::Result` values below are deliberately discarded.
    for item in &context.checklist {
        let _ = writeln!(
            md,
            "- [ ] **{}**: {}\n  - *Test:* {}\n  - *Falsified if:* {}\n  - *Status:* {}\n  - \
             *Confidence:* {}\n",
            item.gate_id,
            item.hypothesis,
            item.test_procedure,
            item.falsified_if,
            item.status,
            item.confidence
        );
    }

    // Hypotheses
    if !context.hypotheses.is_empty() {
        md.push_str("## Root Cause Hypotheses\n\n");
        for hypothesis in &context.hypotheses {
            let _ = writeln!(
                md,
                "### {}: {} ({})\n",
                hypothesis.id, hypothesis.description, hypothesis.confidence
            );
            push_evidence(&mut md, "**Evidence For:**", &hypothesis.evidence_for);
            push_evidence(&mut md, "**Evidence Against:**", &hypothesis.evidence_against);
        }
    }

    // Investigation commands, as a single fenced bash block
    if !context.investigation_commands.is_empty() {
        md.push_str("## Investigation Commands\n\n```bash\n");
        for command in &context.investigation_commands {
            md.push_str(command);
            md.push('\n');
        }
        md.push_str("```\n\n");
    }

    // Cross-references
    if !context.cross_references.is_empty() {
        md.push_str("## Cross-References\n\n");
        for reference in &context.cross_references {
            let _ = writeln!(
                md,
                "- `{}` § {} (relevance: {:.2})",
                reference.source, reference.section, reference.relevance
            );
        }
        md.push('\n');
    }

    md.push_str("---\n\n*Generated by apr-qa with --oracle-enhance*\n");
    md
}

// Unit tests for the oracle enhancer. None of them invoke the `batuta` binary:
// they exercise the evidence-driven generators, the `Display` impls, the
// Markdown renderer, and the non-failure short-circuit of `enhance_failure`.
#[cfg(test)]
mod tests {
    use super::*;
    use apr_qa_gen::{Backend, Format, Modality, ModelId, QaScenario};

    /// Builds a minimal scenario used as the scenario of every test evidence.
    fn make_test_scenario() -> QaScenario {
        QaScenario {
            id: "test_scenario".to_string(),
            model: ModelId {
                org: "test".to_string(),
                name: "model".to_string(),
                variant: None,
            },
            modality: Modality::Run,
            backend: Backend::Cpu,
            format: Format::Apr,
            prompt: "test".to_string(),
            temperature: 0.0,
            max_tokens: 32,
            seed: 0,
            trace_level: apr_qa_gen::TraceLevel::None,
            oracle_type: "garbage".to_string(),
        }
    }

    // NOTE(review): `OracleEnhancer::new()` reads APR_QA_ORACLE_TIMEOUT_MS and
    // APR_QA_ORACLE_MIN_RELEVANCE, so this test only holds when neither
    // variable is set to a valid override in the test environment.
    #[test]
    fn test_oracle_enhancer_default() {
        let enhancer = OracleEnhancer::new();
        assert_eq!(enhancer.timeout, Duration::from_millis(30_000));
        assert!((enhancer.min_relevance - 0.5).abs() < f32::EPSILON);
    }

    // A conversion gate (`F-CONV*`) must produce the LAYOUT-002 item first.
    #[test]
    fn test_generate_static_checklist_for_conv_failure() {
        let enhancer = OracleEnhancer::new();
        let evidence = Evidence::falsified(
            "F-CONV-G-A",
            make_test_scenario(),
            "Conversion diff: 7.61e-1",
            "output",
            1000,
        );

        let checklist = enhancer.generate_static_checklist(&evidence);
        assert!(!checklist.is_empty());
        assert_eq!(checklist[0].gate_id, "F-LAYOUT-002");
    }

    // A reason mentioning "extension" must produce the path-extension item.
    #[test]
    fn test_generate_static_checklist_for_path_failure() {
        let enhancer = OracleEnhancer::new();
        let evidence = Evidence::falsified(
            "F-CONV-RT-001",
            make_test_scenario(),
            "No file extension found",
            "output",
            1000,
        );

        let checklist = enhancer.generate_static_checklist(&evidence);
        assert!(checklist.iter().any(|c| c.gate_id == "F-PATH-EXT"));
    }

    // `Display` output of every `CheckStatus` variant.
    #[test]
    fn test_check_status_display() {
        assert_eq!(format!("{}", CheckStatus::Pending), "PENDING");
        assert_eq!(
            format!("{}", CheckStatus::Falsified("reason".to_string())),
            "FALSIFIED: reason"
        );
        assert_eq!(format!("{}", CheckStatus::Corroborated), "CORROBORATED");
    }

    // `Display` output of every `Confidence` variant.
    #[test]
    fn test_confidence_display() {
        assert_eq!(format!("{}", Confidence::High), "HIGH");
        assert_eq!(format!("{}", Confidence::Medium), "MEDIUM");
        assert_eq!(format!("{}", Confidence::Low), "LOW");
    }

    // The rendered Markdown must contain the title, the checklist item, the
    // hypothesis ID and the investigation command of a fully populated context.
    #[test]
    fn test_generate_checklist_markdown() {
        let context = OracleContext {
            oracle_available: true,
            checklist: vec![FalsificationCheckItem {
                gate_id: "F-LAYOUT-002".to_string(),
                hypothesis: "Row-major layout".to_string(),
                test_procedure: "Check layout flag".to_string(),
                falsified_if: "Garbage output".to_string(),
                status: CheckStatus::Falsified("High diff".to_string()),
                confidence: Confidence::High,
            }],
            hypotheses: vec![RankedHypothesis {
                id: "H1".to_string(),
                description: "Layout bug".to_string(),
                confidence: Confidence::High,
                evidence_for: vec!["High diff".to_string()],
                evidence_against: vec![],
            }],
            cross_references: vec![CrossReference {
                source: "spec.md".to_string(),
                section: "LAYOUT-002".to_string(),
                relevance: 0.95,
            }],
            investigation_commands: vec!["apr inspect model.apr".to_string()],
            query_latency_ms: 1000,
        };

        let md = generate_checklist_markdown("test-model", 320, "F", 24, 13, &context);

        assert!(md.contains("# Falsification Checklist: test-model"));
        assert!(md.contains("F-LAYOUT-002"));
        assert!(md.contains("Row-major layout"));
        assert!(md.contains("H1"));
        assert!(md.contains("apr inspect"));
    }

    // Corroborated evidence is not a failure, so the default (empty) context
    // is returned without querying the oracle.
    #[test]
    fn test_enhance_failure_non_failure() {
        let enhancer = OracleEnhancer::new();
        let evidence = Evidence::corroborated("F-TEST-001", make_test_scenario(), "output", 1000);

        let context = enhancer.enhance_failure(&evidence);
        assert!(!context.oracle_available);
        assert!(context.checklist.is_empty());
    }

    // A "No file extension" reason must yield the path-resolution hypothesis H1.
    #[test]
    fn test_generate_hypotheses() {
        let enhancer = OracleEnhancer::new();
        let evidence = Evidence::falsified(
            "F-CONV-G-A",
            make_test_scenario(),
            "No file extension found",
            "output",
            1000,
        );

        let hypotheses = enhancer.generate_hypotheses_from_evidence(&evidence);
        assert!(!hypotheses.is_empty());
        assert!(hypotheses.iter().any(|h| h.id == "H1"));
    }

    // The spec cross-reference (relevance 0.95) passes the default threshold.
    #[test]
    fn test_generate_cross_references() {
        let enhancer = OracleEnhancer::new();
        let evidence = Evidence::falsified(
            "F-CONV-G-A",
            make_test_scenario(),
            "Conversion diff: 7.61e-1",
            "output",
            1000,
        );

        let refs = enhancer.generate_cross_references(&evidence);
        assert!(!refs.is_empty());
        assert!(refs.iter().any(|r| r.source.contains("spec")));
    }

    // Investigation commands are never empty and reference the `apr` tool.
    #[test]
    fn test_generate_investigation_commands() {
        let enhancer = OracleEnhancer::new();
        let evidence = Evidence::falsified(
            "F-CONV-G-A",
            make_test_scenario(),
            "Conversion failed",
            "output",
            1000,
        );

        let commands = enhancer.generate_investigation_commands(&evidence);
        assert!(!commands.is_empty());
        assert!(commands.iter().any(|c| c.contains("apr")));
    }
}