// car-inference 0.14.0

//! Outcome tracking — learn from inference results to improve routing.
//!
//! Two observation channels:
//! 1. **Conversation signals** — implicit feedback from what happens after an inference
//!    call (user moved on = accepted, user corrected = rejected, re-asked = rejected).
//! 2. **Git-diff tracking** — for code generation, compare suggestions to actual commits
//!    (ground truth, no classification model needed).
//!
//! Every inference call produces an `InferenceOutcome`. Outcomes accumulate into
//! `ModelProfile`s with per-task statistics. The adaptive router uses profiles
//! to make data-driven model selection.

use std::collections::{HashMap, HashSet};
use std::time::{SystemTime, UNIX_EPOCH};

use serde::{Deserialize, Serialize};

/// Task type for outcome tracking. Maps to ModelCapability but at the call level.
///
/// Serialized by serde in `snake_case`. The `Display` impl renders the same
/// lowercase names, and those rendered names are the keys used in
/// `ModelProfile::task_stats`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum InferenceTask {
    /// Displayed as `generate`.
    Generate,
    /// Displayed as `embed`.
    Embed,
    /// Displayed as `classify`.
    Classify,
    /// Displayed as `code`. Only this task is considered by git-diff outcome checks.
    Code,
    /// Displayed as `reasoning`.
    Reasoning,
}

/// Renders each task as its fixed lowercase name (`generate`, `embed`, ...).
/// The text is written verbatim; formatter width/padding flags are not applied.
impl std::fmt::Display for InferenceTask {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match *self {
            InferenceTask::Generate => "generate",
            InferenceTask::Embed => "embed",
            InferenceTask::Classify => "classify",
            InferenceTask::Code => "code",
            InferenceTask::Reasoning => "reasoning",
        };
        f.write_str(label)
    }
}

/// A single inference invocation record.
///
/// Created by `OutcomeTracker::record_start` with zeroed timing/token fields,
/// filled in by `record_complete`, and held in the tracker's pending map until
/// an outcome or failure is recorded for its `trace_id`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InferenceOutcome {
    /// Unique trace ID for this invocation (`t-<unix secs>-<counter>` as built by `record_start`).
    pub trace_id: String,
    /// Model that was used.
    pub model_id: String,
    /// Task type.
    pub task: InferenceTask,
    /// How the model was selected (free-form text supplied by the caller).
    pub routing_reason: String,
    /// Wall-clock latency in milliseconds. `0` until `record_complete` runs.
    pub latency_ms: u64,
    /// Input tokens (estimated). `0` until `record_complete` runs.
    pub input_tokens: usize,
    /// Output tokens (estimated). `0` until `record_complete` runs.
    pub output_tokens: usize,
    /// Outcome from conversation signal inference.
    /// NOTE(review): initialised to `None`; no code visible in this file assigns it afterwards.
    pub inferred_outcome: Option<InferredOutcome>,
    /// Outcome from git-diff tracking (code only).
    /// NOTE(review): initialised to `None`; no code visible in this file assigns it afterwards.
    pub code_outcome: Option<CodeOutcome>,
    /// Error message if inference failed.
    pub error: Option<String>,
    /// Unix timestamp in seconds, taken when the record was created.
    pub timestamp: u64,
}

/// Outcome inferred from conversation flow (implicit feedback).
///
/// Serialized as an internally tagged enum: the variant name goes into a
/// `"type"` field in `snake_case`. `confidence` is the weight the signal
/// detector assigns to its own verdict; it feeds `quality_score`.
/// NOTE(review): presumably expected in `0.0..=1.0` — not validated here.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum InferredOutcome {
    /// User moved on, built on the response.
    Accepted { confidence: f64 },
    /// User used the result but modified it.
    AcceptedWithEdits { confidence: f64 },
    /// User corrected, re-asked, or explicitly rejected.
    Rejected { confidence: f64 },
    /// No follow-up signal (session ended, inconclusive).
    Inconclusive,
}

impl InferredOutcome {
    /// Convert to a quality score (0.0 = bad, 1.0 = good).
    ///
    /// * `Accepted` scores its confidence.
    /// * `AcceptedWithEdits` scores `confidence * 0.7`.
    /// * `Rejected` scores `(1.0 - confidence) * 0.3`, so a confident rejection is near 0.
    /// * `Inconclusive` carries no signal and returns `None`.
    ///
    /// The confidence is clamped to `0.0..=1.0` first so a mis-calibrated signal
    /// can never produce a score outside the documented range. A NaN confidence
    /// is treated as "no signal" (`None`) because a NaN would otherwise poison
    /// every moving average it is folded into.
    pub fn quality_score(&self) -> Option<f64> {
        // Normalise a raw confidence into the unit interval; NaN yields None.
        fn unit(confidence: f64) -> Option<f64> {
            if confidence.is_nan() {
                None
            } else {
                Some(confidence.clamp(0.0, 1.0))
            }
        }

        match *self {
            InferredOutcome::Accepted { confidence } => unit(confidence),
            InferredOutcome::AcceptedWithEdits { confidence } => unit(confidence).map(|c| c * 0.7),
            InferredOutcome::Rejected { confidence } => {
                unit(confidence).map(|c| (1.0 - c) * 0.3)
            }
            InferredOutcome::Inconclusive => None,
        }
    }

    /// Success/failure verdict: both accepted variants count as success,
    /// `Rejected` as failure, and `Inconclusive` yields `None`.
    pub fn is_success(&self) -> Option<bool> {
        match self {
            InferredOutcome::Accepted { .. } | InferredOutcome::AcceptedWithEdits { .. } => {
                Some(true)
            }
            InferredOutcome::Rejected { .. } => Some(false),
            InferredOutcome::Inconclusive => None,
        }
    }
}

/// Outcome from git-diff comparison (code generation ground truth).
///
/// Serialized as an internally tagged enum (`"type"` field, `snake_case`
/// variant names). `Applied`/`Modified` are what the text-diff fallback in
/// `check_git_outcomes` produces; the last three come from the AST path.
/// NOTE(review): nothing visible in this file constructs `Ignored`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum CodeOutcome {
    /// Suggestion was applied as-is (exact or near-exact match in diff).
    Applied,
    /// User changed the same file but differently (partial adoption).
    Modified,
    /// File unchanged despite suggestion (rejected / not used).
    Ignored,
    /// AST structural diff: signature was changed (breaking change).
    SignatureChanged,
    /// AST structural diff: body was modified but signature preserved (non-breaking).
    BodyModified,
    /// AST structural diff: new symbol was added.
    SymbolAdded,
}

impl CodeOutcome {
    /// Fixed quality score for each outcome, from 1.0 (`Applied`) down to
    /// 0.1 (`Ignored`).
    pub fn quality_score(&self) -> f64 {
        match self {
            CodeOutcome::Ignored => 0.1,
            CodeOutcome::Modified => 0.6,
            CodeOutcome::BodyModified | CodeOutcome::SymbolAdded => 0.7,
            CodeOutcome::SignatureChanged => 0.8,
            CodeOutcome::Applied => 1.0,
        }
    }

    /// Every outcome except `Ignored` counts as a success.
    pub fn is_success(&self) -> bool {
        match self {
            CodeOutcome::Ignored => false,
            CodeOutcome::Applied
            | CodeOutcome::Modified
            | CodeOutcome::SignatureChanged
            | CodeOutcome::BodyModified
            | CodeOutcome::SymbolAdded => true,
        }
    }
}

/// Per-task statistics within a model profile.
///
/// `Default` is implemented by hand (not derived) so that a freshly created
/// entry starts `ema_quality` at the neutral 0.5 prior. A derived default of
/// 0.0 would make every task look like a proven failure until enough positive
/// samples pulled the moving average up.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TaskStats {
    /// Completed calls recorded for this task.
    pub calls: u64,
    /// Calls resolved as successful.
    pub successes: u64,
    /// Calls resolved as failed (explicit failures and rejected outcomes).
    pub failures: u64,
    /// Running average latency in ms.
    pub avg_latency_ms: f64,
    /// Exponential moving average of quality score (starts at the 0.5 neutral prior).
    pub ema_quality: f64,
}

impl Default for TaskStats {
    /// Zero counters and latency, neutral (0.5) quality prior.
    fn default() -> Self {
        Self {
            calls: 0,
            successes: 0,
            failures: 0,
            avg_latency_ms: 0.0,
            ema_quality: 0.5, // neutral prior
        }
    }
}

impl TaskStats {
    /// Fraction of resolved calls that succeeded.
    ///
    /// With no resolved calls yet (no successes and no failures) the neutral
    /// prior 0.5 is returned instead of dividing by zero.
    pub fn success_rate(&self) -> f64 {
        match self.successes + self.failures {
            0 => 0.5, // prior: assume neutral
            resolved => self.successes as f64 / resolved as f64,
        }
    }
}

/// Per-model performance profile, built from observed outcomes.
///
/// Counters are updated by `OutcomeTracker`: `record_complete` bumps the call,
/// latency and token totals, while outcome/failure recording bumps the
/// success/failure counts and the quality EMA. Fields marked
/// `#[serde(default)]` may be absent in serialized data and then load as zero.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelProfile {
    /// Identifier of the model this profile describes.
    pub model_id: String,
    /// Number of completed calls (incremented in `record_complete`).
    pub total_calls: u64,
    /// Number of calls resolved as successful.
    pub success_count: u64,
    /// Number of calls resolved as failed (explicit failures and rejected outcomes).
    pub fail_count: u64,
    /// Sum of latencies over all completed calls, in milliseconds.
    pub total_latency_ms: u64,
    /// Total estimated input tokens across all calls.
    #[serde(default)]
    pub total_input_tokens: u64,
    /// Total estimated output tokens across all calls.
    #[serde(default)]
    pub total_output_tokens: u64,
    /// Per-task statistics, keyed by the `Display` form of `InferenceTask`.
    pub task_stats: HashMap<String, TaskStats>,
    /// Overall EMA quality score (0.0 - 1.0). Starts at 0.5 in `ModelProfile::new`.
    pub ema_quality: f64,
    /// Derived metric: quality per 1K total tokens. Populated on export
    /// (not on every update) so it always reflects the latest snapshot.
    /// Inspired by Meta-Harness: context-token efficiency is a first-class
    /// optimization target, so it needs to be visible in model_stats.
    #[serde(default)]
    pub quality_per_1k_tokens: f64,
    /// Last updated (unix timestamp, seconds).
    pub updated_at: u64,
}

impl ModelProfile {
    /// Create an empty profile for `model_id`.
    ///
    /// All counters start at zero, `ema_quality` starts at the neutral prior
    /// 0.5, and `updated_at` is stamped with the current unix time.
    pub fn new(model_id: String) -> Self {
        Self {
            model_id,
            total_calls: 0,
            success_count: 0,
            fail_count: 0,
            total_latency_ms: 0,
            total_input_tokens: 0,
            total_output_tokens: 0,
            task_stats: HashMap::new(),
            ema_quality: 0.5, // neutral prior
            quality_per_1k_tokens: 0.0,
            updated_at: now_unix(),
        }
    }

    /// Fraction of resolved calls that succeeded; 0.5 (neutral prior) when
    /// nothing has been resolved yet.
    pub fn success_rate(&self) -> f64 {
        // Saturating: the sum is only a denominator, it must never wrap to 0.
        match self.success_count.saturating_add(self.fail_count) {
            0 => 0.5,
            resolved => self.success_count as f64 / resolved as f64,
        }
    }

    /// Mean latency per completed call in milliseconds; 0.0 before any call.
    pub fn avg_latency_ms(&self) -> f64 {
        if self.total_calls == 0 {
            0.0
        } else {
            self.total_latency_ms as f64 / self.total_calls as f64
        }
    }

    /// Same degradation pattern as SkillStats: fail_count > success_count + threshold.
    ///
    /// The sum saturates at `u64::MAX`, so a very large `threshold` means
    /// "never degrade" instead of overflowing (a panic in debug builds, a
    /// wrapped and therefore wrong comparison in release builds).
    pub fn should_degrade(&self, threshold: u64) -> bool {
        self.fail_count > self.success_count.saturating_add(threshold)
    }

    /// Get stats for a specific task type, if any call of that type was recorded.
    pub fn task_stats(&self, task: InferenceTask) -> Option<&TaskStats> {
        let key = task.to_string();
        self.task_stats.get(&key)
    }

    /// Total tokens observed across all calls (input + output), saturating at `u64::MAX`.
    pub fn total_tokens(&self) -> u64 {
        self.total_input_tokens
            .saturating_add(self.total_output_tokens)
    }

    /// Quality per 1000 tokens: `ema_quality * 1000 / total_tokens`.
    /// Returns 0.0 before any tokens have been observed.
    pub fn compute_quality_per_1k_tokens(&self) -> f64 {
        match self.total_tokens() {
            0 => 0.0,
            tokens => self.ema_quality * 1000.0 / tokens as f64,
        }
    }
}

/// EMA smoothing factor. Higher = more weight on recent observations.
///
/// Each update computes `new = old * (1 - EMA_ALPHA) + sample * EMA_ALPHA`,
/// so with 0.2 a single observation moves the average 20% of the way
/// towards the new sample.
const EMA_ALPHA: f64 = 0.2;

/// Tracks inference outcomes and builds performance profiles.
///
/// Lifecycle of one call: `record_start` creates a pending record and hands
/// back its trace ID; `record_complete` adds timing/token data; the record is
/// then resolved (and removed from `pending`) by `record_failure`,
/// `record_inferred_outcome` or `record_code_outcome`.
pub struct OutcomeTracker {
    /// In-memory profiles, keyed by model_id.
    profiles: HashMap<String, ModelProfile>,
    /// Pending outcomes: completed inference calls awaiting outcome signal.
    /// Keyed by trace_id.
    pending: HashMap<String, InferenceOutcome>,
    /// Counter for generating trace IDs (incremented once per `record_start`).
    trace_counter: u64,
    /// Models excluded for this session (429/rate-limited). Hard exclusion.
    /// Not part of the exported profiles, so it is not persisted by `save_to_file`.
    excluded: HashSet<String>,
}

impl OutcomeTracker {
    /// Create an empty tracker: no profiles, no pending traces, no excluded
    /// models, and a trace counter starting at zero.
    pub fn new() -> Self {
        let profiles = HashMap::new();
        let pending = HashMap::new();
        let excluded = HashSet::new();
        Self {
            profiles,
            pending,
            trace_counter: 0,
            excluded,
        }
    }

    /// Whether `model_id` has been hard-excluded for this session, which
    /// happens when a failure recorded for it looked like a rate limit.
    pub fn is_excluded(&self, model_id: &str) -> bool {
        let banned = &self.excluded;
        banned.contains(model_id)
    }

    /// Register the start of an inference call and return its trace ID.
    ///
    /// The ID has the form `t-<unix seconds>-<sequence>`, where the sequence
    /// is a per-tracker counter. A pending record with zeroed latency/token
    /// fields and no outcome is stored under that ID until it is resolved.
    pub fn record_start(
        &mut self,
        model_id: &str,
        task: InferenceTask,
        routing_reason: &str,
    ) -> String {
        self.trace_counter += 1;
        let started_at = now_unix();
        let sequence = self.trace_counter;
        let trace_id = format!("t-{}-{}", started_at, sequence);

        self.pending.insert(
            trace_id.clone(),
            InferenceOutcome {
                trace_id: trace_id.clone(),
                model_id: model_id.to_owned(),
                task,
                routing_reason: routing_reason.to_owned(),
                latency_ms: 0,
                input_tokens: 0,
                output_tokens: 0,
                inferred_outcome: None,
                code_outcome: None,
                error: None,
                timestamp: now_unix(),
            },
        );

        trace_id
    }

    /// Attach timing and token counts to a pending call and fold them into
    /// the model's profile.
    ///
    /// Unknown trace IDs are ignored. The record stays pending afterwards so
    /// a quality signal can still be attached later.
    pub fn record_complete(
        &mut self,
        trace_id: &str,
        latency_ms: u64,
        input_tokens: usize,
        output_tokens: usize,
    ) {
        let record = match self.pending.get_mut(trace_id) {
            Some(record) => record,
            None => return,
        };
        record.latency_ms = latency_ms;
        record.input_tokens = input_tokens;
        record.output_tokens = output_tokens;

        let model_id = record.model_id.clone();
        let task_key = record.task.to_string();

        // Model-wide totals.
        let profile = self
            .profiles
            .entry(model_id.clone())
            .or_insert_with(|| ModelProfile::new(model_id));
        profile.total_calls += 1;
        profile.total_latency_ms += latency_ms;
        profile.total_input_tokens += input_tokens as u64;
        profile.total_output_tokens += output_tokens as u64;

        // Per-task call count and incremental mean latency:
        // mean_n = mean_{n-1} + (x_n - mean_{n-1}) / n
        let per_task = profile.task_stats.entry(task_key).or_default();
        per_task.calls += 1;
        let previous_mean = per_task.avg_latency_ms;
        per_task.avg_latency_ms =
            previous_mean + (latency_ms as f64 - previous_mean) / per_task.calls as f64;

        profile.updated_at = now_unix();
    }

    /// Record a failed inference call and drop it from the pending set.
    ///
    /// Unknown trace IDs are ignored. For a known trace the model's failure
    /// counters (overall and per task) are incremented and its quality EMAs
    /// are decayed:
    ///
    /// * ordinary failures are folded in as a quality sample of 0.0;
    /// * rate-limit failures multiply the EMAs by 0.1 and hard-exclude the
    ///   model for the rest of the session (#13).
    ///
    /// A failure counts as rate-limited when the message contains
    /// `RESOURCE_EXHAUSTED` or the number 429 on its own. Digits embedded in
    /// a longer number (e.g. `timed out after 1429 ms`) do not count, so an
    /// unrelated error cannot exclude a healthy model.
    pub fn record_failure(&mut self, trace_id: &str, error: &str) {
        // True when "429" appears with no ASCII digit directly before or after it.
        fn mentions_status_429(msg: &str) -> bool {
            msg.match_indices("429").any(|(start, hit)| {
                let digit_before = msg[..start]
                    .chars()
                    .next_back()
                    .map_or(false, |c| c.is_ascii_digit());
                let digit_after = msg[start + hit.len()..]
                    .chars()
                    .next()
                    .map_or(false, |c| c.is_ascii_digit());
                !digit_before && !digit_after
            })
        }

        // Failed outcomes don't need further tracking, so take the record out
        // of `pending` up front instead of mutating it and removing it later.
        let failed = match self.pending.remove(trace_id) {
            Some(record) => record,
            None => return,
        };

        // Rate-limit errors (429) get a harsher penalty — the model is
        // guaranteed to fail again, so drop quality aggressively.
        let is_rate_limited = mentions_status_429(error) || error.contains("RESOURCE_EXHAUSTED");
        let decay = if is_rate_limited {
            0.1
        } else {
            // EMA update with a quality sample of 0.0.
            1.0 - EMA_ALPHA
        };

        if is_rate_limited {
            // Hard-exclude for the rest of this session (#13)
            self.excluded.insert(failed.model_id.clone());
        }

        let profile = self
            .profiles
            .entry(failed.model_id.clone())
            .or_insert_with(|| ModelProfile::new(failed.model_id.clone()));
        profile.fail_count += 1;
        profile.ema_quality *= decay;

        let per_task = profile.task_stats.entry(failed.task.to_string()).or_default();
        per_task.failures += 1;
        per_task.ema_quality *= decay;

        profile.updated_at = now_unix();
    }

    /// Resolve a pending call with an outcome inferred from conversation signals.
    ///
    /// The record is removed from the pending set (even for `Inconclusive`)
    /// and its quality/success signal is applied to the model profile.
    /// Unknown trace IDs are ignored.
    pub fn record_inferred_outcome(&mut self, trace_id: &str, outcome: InferredOutcome) {
        let resolved = match self.pending.remove(trace_id) {
            Some(record) => record,
            None => return,
        };
        let quality = outcome.quality_score();
        let success = outcome.is_success();
        self.apply_outcome(&resolved, quality, success);
    }

    /// Resolve a pending call with a git-diff outcome (code generation).
    ///
    /// The record is removed from the pending set and the outcome's fixed
    /// quality score and success flag are applied to the model profile.
    /// Unknown trace IDs are ignored.
    pub fn record_code_outcome(&mut self, trace_id: &str, outcome: CodeOutcome) {
        let resolved = match self.pending.remove(trace_id) {
            Some(record) => record,
            None => return,
        };
        let quality = Some(outcome.quality_score());
        let success = Some(outcome.is_success());
        self.apply_outcome(&resolved, quality, success);
    }

    /// Apply a batch of `(trace_id, outcome)` pairs produced by conversation
    /// signal analysis, in order. Each pair is handled exactly like a call to
    /// `record_inferred_outcome`; pairs with unknown trace IDs are skipped.
    pub fn resolve_pending_from_signals(&mut self, outcomes: Vec<(String, InferredOutcome)>) {
        outcomes
            .into_iter()
            .for_each(|(trace_id, inferred)| self.record_inferred_outcome(&trace_id, inferred));
    }

    /// Infer outcomes from a sequence of action results.
    ///
    /// Each entry is `(trace_id, success, confidence, output)`. Entries with
    /// an empty trace ID are skipped. For every other entry the verdict is:
    ///
    /// * the action itself failed: `Rejected` with the given confidence;
    /// * it succeeded with non-blank output and the next action also succeeded
    ///   (or there is no next action): `Accepted` with the given confidence;
    /// * it succeeded with non-blank output but the next action failed:
    ///   `AcceptedWithEdits` with `confidence * 0.7`, since the downstream
    ///   failure may not be this action's fault;
    /// * it succeeded but produced blank output: `Inconclusive`.
    ///
    /// Returns (trace_id, inferred_outcome) pairs ready for `resolve_pending_from_signals`.
    /// The tracker itself is neither read nor modified.
    pub fn infer_outcomes_from_action_sequence(
        &self,
        action_results: &[(String, bool, f64, String)], // (trace_id, success, confidence, output)
    ) -> Vec<(String, InferredOutcome)> {
        action_results
            .iter()
            .enumerate()
            // No trace (e.g., memgine-only action)
            .filter(|(_, step)| !step.0.is_empty())
            .map(|(idx, step)| {
                let (trace_id, succeeded, confidence, output) = step;

                let verdict = if !*succeeded {
                    InferredOutcome::Rejected {
                        confidence: *confidence,
                    }
                } else {
                    let produced_output = !output.trim().is_empty();
                    // Last action: assume accepted if successful
                    let next_succeeded = action_results
                        .get(idx + 1)
                        .map_or(true, |following| following.1);

                    match (produced_output, next_succeeded) {
                        (true, true) => InferredOutcome::Accepted {
                            confidence: *confidence,
                        },
                        // Output existed but downstream failed — may not be this action's fault
                        (true, false) => InferredOutcome::AcceptedWithEdits {
                            confidence: *confidence * 0.7,
                        },
                        (false, _) => InferredOutcome::Inconclusive,
                    }
                };

                (trace_id.clone(), verdict)
            })
            .collect()
    }

    /// Look up the profile recorded for `model_id`; `None` if the model has
    /// not been observed or imported yet.
    pub fn profile(&self, model_id: &str) -> Option<&ModelProfile> {
        let known = &self.profiles;
        known.get(model_id)
    }

    /// Get all profiles.
    pub fn all_profiles(&self) -> &HashMap<String, ModelProfile> {
        &self.profiles
    }

    /// Owned copies of the trace IDs still awaiting an outcome (for
    /// conversation signal analysis). The order is the hash map's iteration
    /// order and therefore unspecified.
    pub fn pending_trace_ids(&self) -> Vec<String> {
        self.pending.keys().map(|id| id.to_owned()).collect()
    }

    /// Borrow the pending record for `trace_id`, if that call has been
    /// started and not yet resolved.
    pub fn get_pending(&self, trace_id: &str) -> Option<&InferenceOutcome> {
        let awaiting = &self.pending;
        awaiting.get(trace_id)
    }

    /// Snapshot all profiles for serialization / persistence.
    ///
    /// Each profile is cloned and its derived `quality_per_1k_tokens` field is
    /// recomputed on the copy, so callers always see a consistent snapshot.
    /// The tracker's own profiles are left untouched.
    pub fn export_profiles(&self) -> Vec<ModelProfile> {
        let mut snapshot = Vec::with_capacity(self.profiles.len());
        for profile in self.profiles.values() {
            let mut copy = profile.clone();
            copy.quality_per_1k_tokens = copy.compute_quality_per_1k_tokens();
            snapshot.push(copy);
        }
        snapshot
    }

    /// Load profiles (from persistence) into the tracker, keyed by their
    /// `model_id`. An imported profile replaces any existing profile with the
    /// same ID; when the input has duplicates, the last one wins.
    pub fn import_profiles(&mut self, profiles: Vec<ModelProfile>) {
        self.profiles.extend(
            profiles
                .into_iter()
                .map(|profile| (profile.model_id.clone(), profile)),
        );
    }

    /// Save profiles to a JSON file for cross-session persistence (#13).
    ///
    /// The exported snapshot is written as pretty-printed JSON. Missing parent
    /// directories of `path` are created first.
    ///
    /// # Errors
    /// Serialization failures are returned as `io::Error` of kind `Other`;
    /// directory creation and file write errors are returned unchanged.
    pub fn save_to_file(&self, path: &std::path::Path) -> Result<(), std::io::Error> {
        let snapshot = self.export_profiles();
        let json = match serde_json::to_string_pretty(&snapshot) {
            Ok(text) => text,
            Err(e) => return Err(std::io::Error::new(std::io::ErrorKind::Other, e)),
        };

        match path.parent() {
            Some(dir) => std::fs::create_dir_all(dir)?,
            None => {}
        }

        std::fs::write(path, json)
    }

    /// Load profiles from a JSON file for cross-session persistence (#13).
    ///
    /// Returns the number of profiles read. A path that does not exist is not
    /// an error: nothing is imported and `Ok(0)` is returned.
    ///
    /// # Errors
    /// Read errors are returned unchanged; JSON that does not parse as a list
    /// of profiles is returned as `io::Error` of kind `InvalidData`.
    pub fn load_from_file(&mut self, path: &std::path::Path) -> Result<usize, std::io::Error> {
        if !path.exists() {
            return Ok(0);
        }

        let raw = std::fs::read_to_string(path)?;
        let loaded = serde_json::from_str::<Vec<ModelProfile>>(&raw)
            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;

        let loaded_count = loaded.len();
        self.import_profiles(loaded);
        Ok(loaded_count)
    }

    /// Fold one resolved observation into the profile of `pending.model_id`,
    /// creating the profile if needed.
    ///
    /// * `quality`: when present, blended into the overall and per-task EMAs.
    /// * `success`: when present, increments the matching success or failure
    ///   counter overall and per task.
    ///
    /// `updated_at` is refreshed even when both signals are absent; in that
    /// case no per-task entry is created.
    fn apply_outcome(
        &mut self,
        pending: &InferenceOutcome,
        quality: Option<f64>,
        success: Option<bool>,
    ) {
        let profile = self
            .profiles
            .entry(pending.model_id.clone())
            .or_insert_with(|| ModelProfile::new(pending.model_id.clone()));

        // Model-wide aggregates.
        if let Some(q) = quality {
            profile.ema_quality = profile.ema_quality * (1.0 - EMA_ALPHA) + q * EMA_ALPHA;
        }
        match success {
            Some(true) => profile.success_count += 1,
            Some(false) => profile.fail_count += 1,
            None => {}
        }

        // Per-task aggregates. The entry is only materialised when there is
        // something to record for it.
        if quality.is_some() || success.is_some() {
            let per_task = profile
                .task_stats
                .entry(pending.task.to_string())
                .or_default();
            if let Some(q) = quality {
                per_task.ema_quality = per_task.ema_quality * (1.0 - EMA_ALPHA) + q * EMA_ALPHA;
            }
            match success {
                Some(true) => per_task.successes += 1,
                Some(false) => per_task.failures += 1,
                None => {}
            }
        }

        profile.updated_at = now_unix();
    }

    /// Check git diff for pending code suggestions and resolve outcomes.
    ///
    /// Two strategies:
    /// 1. **AST structural diff** (when `ast` feature is enabled): parse the old
    ///    and new versions of changed files and compare at the symbol level.
    ///    This gives precise outcomes: SignatureChanged, BodyModified, SymbolAdded.
    /// 2. **Text diff fallback**: token matching against the combined git diff.
    pub fn check_git_outcomes(&mut self, repo_dir: &std::path::Path) {
        let diff = match std::process::Command::new("git")
            .args(["diff", "--no-color"])
            .current_dir(repo_dir)
            .output()
        {
            Ok(output) => String::from_utf8_lossy(&output.stdout).to_string(),
            Err(_) => return,
        };

        let staged_diff = match std::process::Command::new("git")
            .args(["diff", "--cached", "--no-color"])
            .current_dir(repo_dir)
            .output()
        {
            Ok(output) => String::from_utf8_lossy(&output.stdout).to_string(),
            Err(_) => String::new(),
        };

        let combined_diff = format!("{}\n{}", diff, staged_diff);

        if combined_diff.trim().is_empty() {
            return; // No changes at all
        }

        // Try AST structural diff on changed files
        #[cfg(feature = "ast")]
        let ast_outcome = Self::check_git_outcomes_ast(repo_dir);

        let code_traces: Vec<(String, String)> = self
            .pending
            .iter()
            .filter(|(_, o)| matches!(o.task, InferenceTask::Code))
            .map(|(id, o)| (id.clone(), o.model_id.clone()))
            .collect();

        for (trace_id, _model_id) in code_traces {
            if let Some(pending) = self.pending.get(&trace_id) {
                // Try AST-based outcome first
                #[cfg(feature = "ast")]
                if let Some(ref ast_out) = ast_outcome {
                    let pending_clone = pending.clone();
                    self.apply_outcome(
                        &pending_clone,
                        Some(ast_out.quality_score()),
                        Some(ast_out.is_success()),
                    );
                    continue;
                }

                // Fallback: text token matching
                let output_tokens: Vec<&str> = pending
                    .routing_reason
                    .split_whitespace()
                    .filter(|t| t.len() > 5)
                    .collect();

                let outcome = if output_tokens.iter().any(|t| combined_diff.contains(t)) {
                    CodeOutcome::Applied
                } else {
                    CodeOutcome::Modified
                };

                let pending_clone = pending.clone();
                self.apply_outcome(
                    &pending_clone,
                    Some(outcome.quality_score()),
                    Some(outcome.is_success()),
                );
            }
        }
    }

    /// AST-based git outcome: parse changed files before and after, diff symbols.
    ///
    /// For every file listed by `git diff --name-only` that `car_ast` can
    /// parse, the `HEAD` version is compared with the working-tree version at
    /// the symbol level. The most significant change across all files decides
    /// the result: signature change or removal, then body change, then
    /// addition. Returns `None` when git cannot be run, no files changed, or
    /// no structural change was detected.
    ///
    /// NOTE(review): `--name-only` is run without `--cached`, so files whose
    /// changes are fully staged are presumably not listed — confirm.
    #[cfg(feature = "ast")]
    fn check_git_outcomes_ast(repo_dir: &std::path::Path) -> Option<CodeOutcome> {
        // Get list of changed files
        let listing = std::process::Command::new("git")
            .args(["diff", "--name-only"])
            .current_dir(repo_dir)
            .output()
            .ok()?;
        let listing_text = std::str::from_utf8(&listing.stdout).ok()?;
        let changed_files: Vec<&str> = listing_text
            .lines()
            .filter(|name| !name.is_empty())
            .collect();

        if changed_files.is_empty() {
            return None;
        }

        let mut has_sig_change = false;
        let mut has_body_change = false;
        let mut has_addition = false;

        for file in &changed_files {
            // Only parse files tree-sitter supports
            if car_ast::Language::from_filename(file).is_none() {
                continue;
            }

            // Get the HEAD version (None when git fails or the blob is not UTF-8)
            let old_content = match std::process::Command::new("git")
                .args(["show", &format!("HEAD:{}", file)])
                .current_dir(repo_dir)
                .output()
            {
                Ok(shown) if shown.status.success() => String::from_utf8(shown.stdout).ok(),
                _ => None,
            };

            // Get the working tree version
            let new_content = std::fs::read_to_string(repo_dir.join(file)).ok();

            let (old, new) = match (old_content, new_content) {
                (Some(old), Some(new)) => (old, new),
                (None, Some(_)) => {
                    has_addition = true; // New file
                    continue;
                }
                _ => continue,
            };

            let old_parsed = car_ast::parse_file(&old, file);
            let new_parsed = car_ast::parse_file(&new, file);

            if let (Some(old_p), Some(new_p)) = (old_parsed, new_parsed) {
                let changes = car_ast::diff_symbols(&old_p, &new_p);
                for change in &changes {
                    match change {
                        car_ast::SymbolChange::Added(_) => has_addition = true,
                        car_ast::SymbolChange::Modified {
                            signature_changed, ..
                        } => {
                            if *signature_changed {
                                has_sig_change = true;
                            } else {
                                has_body_change = true;
                            }
                        }
                        // A removed symbol is treated like a signature change.
                        car_ast::SymbolChange::Removed(_) => has_sig_change = true,
                    }
                }
            }
        }

        // Return the most significant outcome
        if has_sig_change {
            return Some(CodeOutcome::SignatureChanged);
        }
        if has_body_change {
            return Some(CodeOutcome::BodyModified);
        }
        if has_addition {
            return Some(CodeOutcome::SymbolAdded);
        }
        None // No structural changes detected (maybe non-code files changed)
    }
}

impl Default for OutcomeTracker {
    fn default() -> Self {
        Self::new()
    }
}

/// Current wall-clock time as whole seconds since the Unix epoch.
///
/// If the system clock reports a time before the epoch, 0 is returned
/// instead of failing.
fn now_unix() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs(),
        Err(_) => 0,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Start, complete, then accept: the profile gains one call, one success,
    /// and its quality EMA rises above the neutral prior.
    #[test]
    fn lifecycle() {
        const MODEL: &str = "qwen/qwen3-4b:q4_k_m";
        let mut tracker = OutcomeTracker::new();

        // Start and complete one code call.
        let trace_id = tracker.record_start(MODEL, InferenceTask::Code, "Code task -> Qwen3-4B");
        tracker.record_complete(&trace_id, 1200, 100, 50);

        {
            // Completion alone is enough to create the profile with timing data.
            let after_complete = tracker.profile(MODEL).unwrap();
            assert_eq!(after_complete.total_calls, 1);
            assert_eq!(after_complete.avg_latency_ms(), 1200.0);
        }

        // A positive signal resolves the trace.
        let accepted = InferredOutcome::Accepted { confidence: 0.9 };
        tracker.record_inferred_outcome(&trace_id, accepted);

        let after_signal = tracker.profile(MODEL).unwrap();
        assert_eq!(after_signal.success_count, 1);
        assert!(after_signal.ema_quality > 0.5); // should have gone up from 0.5
    }

    /// Repeated failures accumulate, trip `should_degrade`, and decay the
    /// quality EMA towards zero. Failures for unknown trace IDs are ignored.
    #[test]
    fn failure_degrades() {
        let mut tracker = OutcomeTracker::new();

        // Five genuine failures. `record_failure` must be given the trace id
        // returned by `record_start`.
        for _ in 0..5 {
            let trace = tracker.record_start("bad-model", InferenceTask::Generate, "test");
            tracker.record_complete(&trace, 100, 10, 5);
            tracker.record_failure(&trace, "timeout");
        }

        // A failure reported for a trace id the tracker never issued is a no-op.
        tracker.record_failure("t-unknown", "timeout");

        // Failed traces are dropped from the pending set.
        assert!(tracker.pending_trace_ids().is_empty());

        let profile = tracker.profile("bad-model").unwrap();
        assert_eq!(profile.fail_count, 5);
        assert!(profile.should_degrade(2)); // 5 > 0 + 2
        assert!(profile.ema_quality < 0.3); // decayed toward 0
    }

    /// An `Applied` git outcome counts as a success and moves the EMA one
    /// step from the 0.5 prior towards 1.0.
    #[test]
    fn code_outcome_ground_truth() {
        const MODEL: &str = "qwen/qwen3-4b:q4_k_m";
        let mut tracker = OutcomeTracker::new();

        let trace_id = tracker.record_start(MODEL, InferenceTask::Code, "code");
        tracker.record_complete(&trace_id, 500, 200, 100);
        tracker.record_code_outcome(&trace_id, CodeOutcome::Applied);

        let profile = tracker.profile(MODEL).unwrap();
        assert_eq!(profile.success_count, 1);

        // EMA should reflect Applied quality (1.0): 0.5 * 0.8 + 1.0 * 0.2 = 0.6
        let drift = (profile.ema_quality - 0.6).abs();
        assert!(drift < 0.01);
    }

    /// Calls are bucketed per task: two accepted code calls and one rejected
    /// generate call land in separate `TaskStats` entries.
    #[test]
    fn per_task_stats() {
        let mut tracker = OutcomeTracker::new();

        // Two code calls, both accepted.
        for _ in 0..2 {
            let code_trace = tracker.record_start("m1", InferenceTask::Code, "code");
            tracker.record_complete(&code_trace, 1000, 100, 50);
            tracker.record_inferred_outcome(
                &code_trace,
                InferredOutcome::Accepted { confidence: 0.8 },
            );
        }

        // One generate call, rejected.
        let gen_trace = tracker.record_start("m1", InferenceTask::Generate, "gen");
        tracker.record_complete(&gen_trace, 500, 50, 25);
        tracker.record_inferred_outcome(&gen_trace, InferredOutcome::Rejected { confidence: 0.9 });

        let profile = tracker.profile("m1").unwrap();
        assert_eq!(profile.total_calls, 3);

        let for_code = profile.task_stats(InferenceTask::Code).unwrap();
        assert_eq!(for_code.calls, 2);
        assert_eq!(for_code.successes, 2);

        let for_generate = profile.task_stats(InferenceTask::Generate).unwrap();
        assert_eq!(for_generate.calls, 1);
        assert_eq!(for_generate.failures, 1);
    }

    /// Exporting recomputes the derived quality-per-1K-tokens metric.
    #[test]
    fn export_populates_quality_per_1k_tokens() {
        let mut tracker = OutcomeTracker::new();

        let trace_id = tracker.record_start("m1", InferenceTask::Generate, "test");
        tracker.record_complete(&trace_id, 100, 800, 200); // 1000 tokens total
        tracker.record_inferred_outcome(&trace_id, InferredOutcome::Accepted { confidence: 1.0 });

        let snapshot = tracker.export_profiles();
        assert_eq!(snapshot.len(), 1);

        // ema_quality after one Accepted{1.0}: 0.5 * 0.8 + 1.0 * 0.2 = 0.6
        // quality_per_1k = 0.6 * 1000 / 1000 = 0.6
        let observed = snapshot[0].quality_per_1k_tokens;
        assert!((observed - 0.6).abs() < 1e-6, "got {}", observed);
    }

    /// A fresh profile has seen no tokens, so the derived metric is exactly 0.
    #[test]
    fn quality_per_1k_tokens_zero_without_tokens() {
        let fresh = ModelProfile::new(String::from("x"));
        let metric = fresh.compute_quality_per_1k_tokens();
        assert_eq!(metric, 0.0);
    }

    /// Profiles exported from one tracker can be imported into another.
    #[test]
    fn export_import() {
        let mut source = OutcomeTracker::new();
        let trace_id = source.record_start("m1", InferenceTask::Generate, "test");
        source.record_complete(&trace_id, 100, 10, 5);
        source.record_inferred_outcome(&trace_id, InferredOutcome::Accepted { confidence: 0.9 });

        let snapshot = source.export_profiles();
        assert_eq!(snapshot.len(), 1);

        let mut target = OutcomeTracker::new();
        target.import_profiles(snapshot);
        assert!(target.profile("m1").is_some());
    }
}