car-inference 0.25.0

//! Outcome tracking — learn from inference results to improve routing.
//!
//! Two observation channels:
//! 1. **Conversation signals** — implicit feedback from what happens after an inference
//!    call (user moved on = accepted, user corrected = rejected, re-asked = rejected).
//! 2. **Git-diff tracking** — for code generation, compare suggestions to actual commits
//!    (ground truth, no classification model needed).
//!
//! Every inference call produces an `InferenceOutcome`. Outcomes accumulate into
//! `ModelProfile`s with per-task statistics. The adaptive router uses profiles
//! to make data-driven model selection.

use std::collections::{HashMap, HashSet};
use std::time::{SystemTime, UNIX_EPOCH};

use serde::{Deserialize, Serialize};

/// Task type for outcome tracking. Maps to ModelCapability but at the call level.
///
/// Serialized by serde in `snake_case`. The `Display` impl prints the same
/// lowercase names, and `ModelProfile::task_stats` uses that string form as
/// its map key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum InferenceTask {
    Generate,
    Embed,
    Classify,
    Code,
    Reasoning,
}

/// Renders a task as its lowercase name (`generate`, `embed`, `classify`,
/// `code`, `reasoning`).
impl std::fmt::Display for InferenceTask {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Pick the label first, then emit it with a single write.
        let label = match self {
            InferenceTask::Generate => "generate",
            InferenceTask::Embed => "embed",
            InferenceTask::Classify => "classify",
            InferenceTask::Code => "code",
            InferenceTask::Reasoning => "reasoning",
        };
        f.write_str(label)
    }
}

/// A single inference invocation record.
///
/// Created by [`OutcomeTracker::record_start`] with zeroed timing/token
/// fields and held in the tracker's `pending` map until it is resolved.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InferenceOutcome {
    /// Unique trace ID for this invocation.
    pub trace_id: String,
    /// Model that was used.
    pub model_id: String,
    /// Task type.
    pub task: InferenceTask,
    /// How the model was selected.
    pub routing_reason: String,
    /// Wall-clock latency in milliseconds (0 until `record_complete` runs).
    pub latency_ms: u64,
    /// Input tokens (estimated).
    pub input_tokens: usize,
    /// Output tokens (estimated).
    pub output_tokens: usize,
    /// Outcome from conversation signal inference.
    pub inferred_outcome: Option<InferredOutcome>,
    /// Outcome from git-diff tracking (code only).
    pub code_outcome: Option<CodeOutcome>,
    /// Error message if inference failed.
    pub error: Option<String>,
    /// Unix timestamp, taken when the call was started (`record_start`).
    pub timestamp: u64,
    /// Whether a *mechanical* success has already been credited to the
    /// model profile for this call (booked at completion when it produced
    /// output with no error — see [`OutcomeTracker::record_complete`]).
    /// Guards against the later pending-sweep or a downstream quality
    /// signal double-counting the same call. In-memory only (`pending` is
    /// never persisted), so it's skipped from (de)serialization.
    #[serde(skip)]
    pub success_credited: bool,
}

/// Outcome inferred from conversation flow (implicit feedback).
///
/// Serialized as an internally tagged enum: the variant name goes in a
/// `"type"` field in `snake_case`, alongside any `confidence` value.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum InferredOutcome {
    /// User moved on, built on the response.
    Accepted { confidence: f64 },
    /// User used the result but modified it.
    AcceptedWithEdits { confidence: f64 },
    /// User corrected, re-asked, or explicitly rejected.
    Rejected { confidence: f64 },
    /// No follow-up signal (session ended, inconclusive).
    Inconclusive,
}

impl InferredOutcome {
    /// Convert to a quality score (0.0 = bad, 1.0 = good).
    ///
    /// * `Accepted` → `confidence`
    /// * `AcceptedWithEdits` → `confidence * 0.7`
    /// * `Rejected` → `(1.0 - confidence) * 0.3`
    /// * `Inconclusive` → `None` (no signal to score)
    pub fn quality_score(&self) -> Option<f64> {
        let score = match *self {
            InferredOutcome::Inconclusive => return None,
            InferredOutcome::Accepted { confidence } => confidence,
            InferredOutcome::AcceptedWithEdits { confidence } => confidence * 0.7,
            InferredOutcome::Rejected { confidence } => (1.0 - confidence) * 0.3,
        };
        Some(score)
    }

    /// Collapse the outcome to success/failure; `None` when inconclusive.
    /// Both acceptance variants count as success.
    pub fn is_success(&self) -> Option<bool> {
        match self {
            InferredOutcome::Accepted { .. } | InferredOutcome::AcceptedWithEdits { .. } => {
                Some(true)
            }
            InferredOutcome::Rejected { .. } => Some(false),
            InferredOutcome::Inconclusive => None,
        }
    }
}

/// Outcome from git-diff comparison (code generation ground truth).
///
/// Serialized as an internally tagged enum: the variant name goes in a
/// `"type"` field in `snake_case`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum CodeOutcome {
    /// Suggestion was applied as-is (exact or near-exact match in diff).
    Applied,
    /// User changed the same file but differently (partial adoption).
    Modified,
    /// File unchanged despite suggestion (rejected / not used).
    Ignored,
    /// AST structural diff: signature was changed (breaking change).
    SignatureChanged,
    /// AST structural diff: body was modified but signature preserved (non-breaking).
    BodyModified,
    /// AST structural diff: new symbol was added.
    SymbolAdded,
}

impl CodeOutcome {
    /// Fixed quality score per outcome, from 1.0 (`Applied`) down to 0.1
    /// (`Ignored`).
    pub fn quality_score(&self) -> f64 {
        match self {
            CodeOutcome::Applied => 1.0,
            CodeOutcome::SignatureChanged => 0.8,
            CodeOutcome::BodyModified | CodeOutcome::SymbolAdded => 0.7,
            CodeOutcome::Modified => 0.6,
            CodeOutcome::Ignored => 0.1,
        }
    }

    /// Every outcome except `Ignored` counts as a success.
    pub fn is_success(&self) -> bool {
        match self {
            CodeOutcome::Ignored => false,
            CodeOutcome::Applied
            | CodeOutcome::Modified
            | CodeOutcome::SignatureChanged
            | CodeOutcome::BodyModified
            | CodeOutcome::SymbolAdded => true,
        }
    }
}

/// Per-task statistics within a model profile.
///
/// Stored in `ModelProfile::task_stats`, keyed by the task's `Display` name.
/// NOTE(review): the derived `Default` starts `ema_quality` at 0.0, whereas
/// `ModelProfile::new` seeds the model-level EMA at 0.5 — confirm the
/// differing priors are intended.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct TaskStats {
    /// Calls counted for this task (incremented in `OutcomeTracker::record_complete`).
    pub calls: u64,
    /// Calls booked as successes.
    pub successes: u64,
    /// Calls booked as failures.
    pub failures: u64,
    /// Running average latency in ms.
    pub avg_latency_ms: f64,
    /// Exponential moving average of quality score.
    pub ema_quality: f64,
}

impl TaskStats {
    /// Fraction of resolved calls (successes + failures) that succeeded.
    /// With nothing resolved yet, returns the neutral prior 0.5.
    pub fn success_rate(&self) -> f64 {
        match self.successes + self.failures {
            0 => 0.5, // prior: assume neutral
            resolved => self.successes as f64 / resolved as f64,
        }
    }
}

/// Per-model performance profile, built from observed outcomes.
///
/// This is the persisted unit: `OutcomeTracker::save_to_file` writes a JSON
/// array of these and `load_from_file` reads it back.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelProfile {
    /// Identifier of the model this profile describes (also the tracker's map key).
    pub model_id: String,
    /// Number of calls counted (completions and recorded failures).
    pub total_calls: u64,
    /// Calls booked as successes.
    pub success_count: u64,
    /// Calls booked as failures.
    pub fail_count: u64,
    /// Sum of recorded latencies in milliseconds.
    pub total_latency_ms: u64,
    /// Total estimated input tokens across all calls.
    #[serde(default)]
    pub total_input_tokens: u64,
    /// Total estimated output tokens across all calls.
    #[serde(default)]
    pub total_output_tokens: u64,
    /// Per-task statistics, keyed by the `InferenceTask` display name.
    pub task_stats: HashMap<String, TaskStats>,
    /// Overall EMA quality score (0.0 - 1.0).
    pub ema_quality: f64,
    /// Derived metric: quality per 1K total tokens. Populated on export
    /// (not on every update) so it always reflects the latest snapshot.
    /// Inspired by Meta-Harness: context-token efficiency is a first-class
    /// optimization target, so it needs to be visible in model_stats.
    #[serde(default)]
    pub quality_per_1k_tokens: f64,
    /// Last updated (unix timestamp).
    pub updated_at: u64,
}

impl ModelProfile {
    /// Create an empty profile for `model_id`: all counters zero, the
    /// quality EMA at the neutral prior 0.5, and `updated_at` set to now.
    pub fn new(model_id: String) -> Self {
        Self {
            model_id,
            total_calls: 0,
            success_count: 0,
            fail_count: 0,
            total_latency_ms: 0,
            total_input_tokens: 0,
            total_output_tokens: 0,
            task_stats: HashMap::new(),
            ema_quality: 0.5, // neutral prior
            quality_per_1k_tokens: 0.0,
            updated_at: now_unix(),
        }
    }

    /// Fraction of resolved calls (successes + failures) that succeeded.
    /// Returns the neutral prior 0.5 when nothing has been resolved yet.
    pub fn success_rate(&self) -> f64 {
        match self.success_count + self.fail_count {
            0 => 0.5,
            resolved => self.success_count as f64 / resolved as f64,
        }
    }

    /// Mean latency in milliseconds over `total_calls`; 0.0 with no calls.
    pub fn avg_latency_ms(&self) -> f64 {
        if self.total_calls == 0 {
            0.0
        } else {
            self.total_latency_ms as f64 / self.total_calls as f64
        }
    }

    /// Same degradation pattern as SkillStats: fail_count > success_count + threshold.
    ///
    /// The sum saturates instead of overflowing, so a very large threshold
    /// (e.g. `u64::MAX`, meaning "never degrade") cannot panic in debug
    /// builds or wrap around to a small value in release builds.
    pub fn should_degrade(&self, threshold: u64) -> bool {
        self.fail_count > self.success_count.saturating_add(threshold)
    }

    /// Get stats for a specific task type, looked up by the task's
    /// `Display` name. `None` if the task has never been recorded.
    pub fn task_stats(&self, task: InferenceTask) -> Option<&TaskStats> {
        let key = task.to_string();
        self.task_stats.get(&key)
    }

    /// Total tokens observed across all calls (input + output).
    pub fn total_tokens(&self) -> u64 {
        self.total_input_tokens + self.total_output_tokens
    }

    /// Quality per 1000 tokens: `ema_quality * 1000 / total_tokens`.
    /// Returns 0.0 before any tokens have been observed.
    pub fn compute_quality_per_1k_tokens(&self) -> f64 {
        match self.total_tokens() {
            0 => 0.0,
            tokens => self.ema_quality * 1000.0 / tokens as f64,
        }
    }
}

/// EMA smoothing factor. Higher = more weight on recent observations.
/// Each update computes `ema = ema * (1 - EMA_ALPHA) + sample * EMA_ALPHA`.
const EMA_ALPHA: f64 = 0.2;

/// One resolved inference outcome — the durable, attributable "receipt"
/// the concierge quotes ("routed to X at T because Y; latency 1.2s;
/// outcome success q=0.9"). Append-only; one JSON line per resolution.
/// Deliberately flat (success/quality/error pulled out of the richer
/// `InferenceOutcome`) so it's stable to read back and cheap to reason
/// over. Carries no prompt/output text — only routing-adjacent metadata.
///
/// Optional fields are omitted from the JSON when `None` and default to
/// `None` when absent on read.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OutcomeLedgerEntry {
    /// Trace ID of the call this receipt describes.
    pub trace_id: String,
    /// Model that served the call.
    pub model_id: String,
    /// Task type of the call.
    pub task: InferenceTask,
    /// Routing rationale copied from the pending outcome.
    pub routing_reason: String,
    /// Recorded latency in milliseconds.
    pub latency_ms: u64,
    /// Estimated input tokens.
    pub input_tokens: usize,
    /// Estimated output tokens.
    pub output_tokens: usize,
    /// Success/failure verdict, if one was determined.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub success: Option<bool>,
    /// Quality score, if one was determined.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub quality: Option<f64>,
    /// Length-capped error text for failed calls.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub error: Option<String>,
    /// Project/workspace the call belonged to — the key B1 groups paired
    /// comparisons on. `None` until the capture path threads it (B1); kept
    /// in the schema now so today's receipts are forward-compatible.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub project_id: Option<String>,
    /// Intent/use-case lane (finer than `task`), if the router computed
    /// one. `None` until threaded (B1).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub intent: Option<String>,
    /// Unix timestamp taken when the receipt was created.
    pub timestamp: u64,
}

/// Cap on the in-memory ledger buffer between flushes — backstop against
/// unbounded growth if the flush timer never runs. Oldest entries drop
/// (`OutcomeTracker::push_ledger` pops the front once this length is reached).
const MAX_LEDGER_BUFFER: usize = 5000;

/// Max characters of an error string kept in a ledger receipt. Errors are
/// useful signal ("429", "context overflow") but can echo provider text
/// or paths — cap the length so the privacy-bounded ledger never grows a
/// large/sensitive blob. The classification lives in the prefix.
const MAX_LEDGER_ERROR_CHARS: usize = 256;

/// Cap `error` at [`MAX_LEDGER_ERROR_CHARS`] characters, cutting on a char
/// boundary. Text within the limit is returned unchanged; longer text is
/// cut to the limit and suffixed with `…`.
fn redact_error(error: &str) -> String {
    // The byte offset of the first character *past* the limit, if any, is
    // exactly where the kept prefix ends.
    match error.char_indices().nth(MAX_LEDGER_ERROR_CHARS) {
        None => error.to_string(),
        Some((cut, _)) => {
            let mut capped = String::with_capacity(cut + '…'.len_utf8());
            capped.push_str(&error[..cut]);
            capped.push('…');
            capped
        }
    }
}

/// Rewrite the ledger keeping only the most recent `max_entries` receipts
/// (retention bound). No-op when under the cap, when `max_entries` is 0,
/// or when the file is absent.
///
/// Atomic (temp + rename) so a crash mid-prune can't corrupt the ledger.
/// The temp file is created owner read/write only on Unix — matching
/// [`append_ledger_entries`] — so the rename never replaces the
/// privacy-bounded ledger with a file under default (wider) permissions.
/// It is also flushed to disk before the rename.
///
/// Lines that `read_ledger` cannot parse are not carried over.
///
/// # Errors
/// Returns any I/O error from creating, writing, syncing or renaming the
/// temp file, and wraps serialization failures as `ErrorKind::Other`.
pub fn prune_ledger(path: &std::path::Path, max_entries: usize) -> std::io::Result<()> {
    use std::io::Write;

    if max_entries == 0 || !path.exists() {
        return Ok(());
    }
    let entries = read_ledger(path, 0);
    if entries.len() <= max_entries {
        return Ok(());
    }
    let keep = &entries[entries.len() - max_entries..];
    let mut body = String::new();
    for e in keep {
        let line = serde_json::to_string(e)
            .map_err(|err| std::io::Error::new(std::io::ErrorKind::Other, err))?;
        body.push_str(&line);
        body.push('\n');
    }

    let tmp = path.with_extension("jsonl.tmp");
    let mut opts = std::fs::OpenOptions::new();
    opts.write(true).create(true).truncate(true);
    #[cfg(unix)]
    {
        use std::os::unix::fs::OpenOptionsExt;
        opts.mode(0o600);
    }
    let mut f = opts.open(&tmp)?;
    // `mode` only applies when the file is created; tighten a stale temp
    // file left behind by an earlier interrupted prune as well.
    #[cfg(unix)]
    {
        use std::os::unix::fs::PermissionsExt;
        f.set_permissions(std::fs::Permissions::from_mode(0o600))?;
    }
    f.write_all(body.as_bytes())?;
    // Make the contents durable before the rename publishes them.
    f.sync_all()?;
    drop(f);
    std::fs::rename(&tmp, path)
}

/// Append resolved outcome receipts to the JSONL ledger (one line each).
/// Append-only: no full-file rewrite, so this never triggers the
/// write-storm the profiles file had. Creates the file/parent on first
/// write.
///
/// The whole batch is serialized up front and written with a single
/// `write_all` on the append handle. A serialization failure therefore
/// leaves the file untouched instead of half-written, and each line and
/// its newline are never issued as separate writes.
///
/// # Errors
/// Returns I/O errors from creating the parent directory, opening the file
/// or writing; serialization failures are wrapped as `ErrorKind::Other`.
pub fn append_ledger_entries(
    path: &std::path::Path,
    entries: &[OutcomeLedgerEntry],
) -> std::io::Result<()> {
    use std::io::Write;

    if entries.is_empty() {
        return Ok(());
    }

    let mut batch = String::new();
    for e in entries {
        let line = serde_json::to_string(e)
            .map_err(|err| std::io::Error::new(std::io::ErrorKind::Other, err))?;
        batch.push_str(&line);
        batch.push('\n');
    }

    if let Some(parent) = path.parent() {
        std::fs::create_dir_all(parent)?;
    }
    let mut opts = std::fs::OpenOptions::new();
    opts.create(true).append(true);
    // Local, privacy-bounded file (error text, soon project paths): owner
    // read/write only.
    #[cfg(unix)]
    {
        use std::os::unix::fs::OpenOptionsExt;
        opts.mode(0o600);
    }
    let mut f = opts.open(path)?;
    f.write_all(batch.as_bytes())
}

/// Read the most recent `limit` ledger entries (0 = all), oldest first.
///
/// Best-effort by design: an unreadable file yields an empty list, and
/// blank or unparseable lines are skipped, so a torn append never poisons
/// a read — the consumer (UsageProfile) gets whatever is well-formed.
pub fn read_ledger(path: &std::path::Path, limit: usize) -> Vec<OutcomeLedgerEntry> {
    let content = match std::fs::read_to_string(path) {
        Ok(text) => text,
        Err(_) => return Vec::new(),
    };

    let mut entries: Vec<OutcomeLedgerEntry> = Vec::new();
    for line in content.lines() {
        if line.trim().is_empty() {
            continue;
        }
        if let Ok(entry) = serde_json::from_str(line) {
            entries.push(entry);
        }
    }

    // Keep only the tail when a limit applies.
    if limit > 0 && entries.len() > limit {
        let surplus = entries.len() - limit;
        entries.drain(..surplus);
    }
    entries
}

/// Tracks inference outcomes and builds performance profiles.
///
/// Only `profiles` is written by `save_to_file`; the other fields are
/// session state.
pub struct OutcomeTracker {
    /// In-memory profiles, keyed by model_id.
    profiles: HashMap<String, ModelProfile>,
    /// Pending outcomes: completed inference calls awaiting outcome signal.
    /// Keyed by trace_id.
    pending: HashMap<String, InferenceOutcome>,
    /// Counter for generating trace IDs.
    trace_counter: u64,
    /// Models excluded for this session (429/rate-limited). Hard exclusion.
    excluded: HashSet<String>,
    /// Set whenever a persisted field (a `ModelProfile`) changes; cleared
    /// on save. Lets the engine debounce disk writes — persist only when
    /// there's something new, instead of serializing the whole file after
    /// every inference call. `excluded`/`pending` are session-only and do
    /// not flip this.
    dirty: bool,
    /// Resolved outcome receipts awaiting append to the JSONL ledger.
    /// Drained by the engine on flush. Bounded by [`MAX_LEDGER_BUFFER`].
    ledger_buffer: std::collections::VecDeque<OutcomeLedgerEntry>,
}

impl OutcomeTracker {
    /// Create an empty tracker: no profiles, no pending calls, no
    /// exclusions, a clean dirty flag and an empty ledger buffer.
    pub fn new() -> Self {
        Self {
            profiles: HashMap::default(),
            pending: HashMap::default(),
            trace_counter: 0,
            excluded: HashSet::default(),
            dirty: false,
            ledger_buffer: std::collections::VecDeque::default(),
        }
    }

    /// Queue a resolved receipt for the next ledger flush. When the buffer
    /// is already at [`MAX_LEDGER_BUFFER`], the oldest receipt is dropped
    /// to make room.
    fn push_ledger(&mut self, entry: OutcomeLedgerEntry) {
        let buffer = &mut self.ledger_buffer;
        let at_capacity = buffer.len() >= MAX_LEDGER_BUFFER;
        if at_capacity {
            buffer.pop_front();
        }
        buffer.push_back(entry);
    }

    /// Hand all buffered receipts (oldest first) to the caller for
    /// appending to the ledger file, leaving the buffer empty.
    pub fn drain_ledger(&mut self) -> Vec<OutcomeLedgerEntry> {
        let drained = std::mem::take(&mut self.ledger_buffer);
        Vec::from(drained)
    }

    /// Check if a model is excluded (rate-limited) for this session.
    /// Models enter the set when `record_failure` sees a rate-limit error.
    pub fn is_excluded(&self, model_id: &str) -> bool {
        self.excluded.contains(model_id)
    }

    /// Record that an inference call started. Returns a trace_id.
    ///
    /// The id has the form `t-<unix time>-<counter>`. A fresh
    /// `InferenceOutcome` with zeroed timing/token fields is stored in
    /// `pending` under that id until the call is resolved.
    pub fn record_start(
        &mut self,
        model_id: &str,
        task: InferenceTask,
        routing_reason: &str,
    ) -> String {
        self.trace_counter += 1;
        let sequence = self.trace_counter;
        let trace_id = format!("t-{}-{}", now_unix(), sequence);

        // Length-capped: `routing_reason` is the only free-text field that
        // reaches the persisted ledger. INVARIANT: it must NEVER embed
        // prompt or output text — only a routing rationale
        // ("Code task -> Qwen3-4B"). The cap bounds accidental growth.
        let capped_reason = redact_error(routing_reason);

        let started = InferenceOutcome {
            trace_id: trace_id.clone(),
            model_id: model_id.to_string(),
            task,
            routing_reason: capped_reason,
            latency_ms: 0,
            input_tokens: 0,
            output_tokens: 0,
            inferred_outcome: None,
            code_outcome: None,
            error: None,
            timestamp: now_unix(),
            success_credited: false,
        };

        self.pending.insert(started.trace_id.clone(), started);
        trace_id
    }

    /// Record completion of an inference call (timing + token counts).
    ///
    /// Unknown trace ids are ignored. For a known call this stores the
    /// measurements on the pending outcome and folds them into the model
    /// profile and its per-task stats, then marks the tracker dirty.
    pub fn record_complete(
        &mut self,
        trace_id: &str,
        latency_ms: u64,
        input_tokens: usize,
        output_tokens: usize,
    ) {
        let Some(call) = self.pending.get_mut(trace_id) else {
            return;
        };
        call.latency_ms = latency_ms;
        call.input_tokens = input_tokens;
        call.output_tokens = output_tokens;

        // A call that produced output with no error is credited as a
        // *mechanical* success right away, mirroring `record_failure`, which
        // books a failure the moment it happens. Waiting for the pending
        // sweep meant short-lived processes (the `car infer` CLI, eval
        // harnesses) exited without ever recording a success (#312).
        // `ema_quality` is deliberately left alone — only a real downstream
        // accept/edit signal moves quality. `success_credited` keeps the
        // later sweep or quality-signal resolution from counting the same
        // call twice.
        let credited = output_tokens > 0 && call.error.is_none();
        if credited {
            call.success_credited = true;
        }
        let success_increment = u64::from(credited);
        let task_key = call.task.to_string();

        // `self.pending` (borrowed through `call`) and `self.profiles` are
        // disjoint fields, so both borrows can coexist.
        let profile = self
            .profiles
            .entry(call.model_id.clone())
            .or_insert_with(|| ModelProfile::new(call.model_id.clone()));

        profile.total_calls += 1;
        profile.total_latency_ms += latency_ms;
        profile.total_input_tokens += input_tokens as u64;
        profile.total_output_tokens += output_tokens as u64;
        profile.success_count += success_increment;

        let ts = profile.task_stats.entry(task_key).or_default();
        ts.calls += 1;
        ts.successes += success_increment;
        // Incremental mean over the calls counted so far.
        ts.avg_latency_ms += (latency_ms as f64 - ts.avg_latency_ms) / ts.calls as f64;

        profile.updated_at = now_unix();
        self.dirty = true;
    }

    /// Record a failure.
    ///
    /// Unknown trace ids are ignored. For a known call this books the
    /// failure on the model profile and per-task stats, decays the quality
    /// EMAs, hard-excludes the model for the session on rate-limit errors,
    /// queues a failure receipt, and drops the call from `pending`.
    pub fn record_failure(&mut self, trace_id: &str, error: &str) {
        // Failed outcomes don't need further tracking, so take the entry
        // out of `pending` up front.
        let Some(failed) = self.pending.remove(trace_id) else {
            return;
        };

        let rate_limited = error.contains("429") || error.contains("RESOURCE_EXHAUSTED");
        let task_key = failed.task.to_string();

        let profile = self
            .profiles
            .entry(failed.model_id.clone())
            .or_insert_with(|| ModelProfile::new(failed.model_id.clone()));

        // A failed call is still a call: count it in `total_calls` so the
        // denominator includes failures. Otherwise `total_calls` could read
        // *smaller* than `fail_count` (seen live on parslee/* models:
        // 6 calls, 14 fails).
        profile.total_calls += 1;
        profile.fail_count += 1;

        let ts = profile.task_stats.entry(task_key).or_default();
        ts.failures += 1;

        if rate_limited {
            // Rate-limit errors (429) get a harsher penalty — the model is
            // guaranteed to fail again — plus a hard exclusion for the rest
            // of this session (#13).
            self.excluded.insert(failed.model_id.clone());
            profile.ema_quality *= 0.1;
            ts.ema_quality *= 0.1;
        } else {
            // Ordinary failure: one EMA step towards a quality sample of 0.
            profile.ema_quality = profile.ema_quality * (1.0 - EMA_ALPHA) + 0.0 * EMA_ALPHA;
            ts.ema_quality = ts.ema_quality * (1.0 - EMA_ALPHA);
        }

        profile.updated_at = now_unix();
        self.dirty = true;

        let receipt = OutcomeLedgerEntry {
            trace_id: failed.trace_id,
            model_id: failed.model_id,
            task: failed.task,
            routing_reason: failed.routing_reason,
            latency_ms: failed.latency_ms,
            input_tokens: failed.input_tokens,
            output_tokens: failed.output_tokens,
            success: Some(false),
            quality: None,
            error: Some(redact_error(error)),
            project_id: None,
            intent: None,
            timestamp: now_unix(),
        };
        self.push_ledger(receipt);
    }

    /// Record an inferred outcome from conversation signals.
    /// Resolves (removes) the pending call; unknown trace ids are ignored.
    pub fn record_inferred_outcome(&mut self, trace_id: &str, outcome: InferredOutcome) {
        let Some(resolved) = self.pending.remove(trace_id) else {
            return;
        };
        let quality = outcome.quality_score();
        let success = outcome.is_success();
        self.apply_outcome(&resolved, quality, success);
    }

    /// Record an outcome from git-diff comparison (code generation).
    /// Resolves (removes) the pending call; unknown trace ids are ignored.
    pub fn record_code_outcome(&mut self, trace_id: &str, outcome: CodeOutcome) {
        let Some(resolved) = self.pending.remove(trace_id) else {
            return;
        };
        let quality = Some(outcome.quality_score());
        let success = Some(outcome.is_success());
        self.apply_outcome(&resolved, quality, success);
    }

    /// Resolve all pending outcomes for a completed conversation turn.
    /// Each `(trace_id, outcome)` pair from conversation signal analysis is
    /// applied in order via `record_inferred_outcome`.
    pub fn resolve_pending_from_signals(&mut self, outcomes: Vec<(String, InferredOutcome)>) {
        outcomes
            .into_iter()
            .for_each(|(trace_id, inferred)| self.record_inferred_outcome(&trace_id, inferred));
    }

    /// Infer outcomes from a sequence of action results.
    ///
    /// In a reasoning session each action's output feeds the next, so the
    /// fate of action N+1 says something about action N:
    ///
    /// * N failed → `Rejected` at N's confidence.
    /// * N succeeded with empty (whitespace-only) output → `Inconclusive`.
    /// * N succeeded with output and N+1 succeeded (or N is last) → `Accepted`.
    /// * N succeeded with output but N+1 failed → `AcceptedWithEdits` at
    ///   `confidence * 0.7`, since the downstream failure may not be N's fault.
    ///
    /// Entries with an empty trace id (e.g. memgine-only actions) are
    /// skipped. Returns (trace_id, inferred_outcome) pairs ready for
    /// `resolve_pending_from_signals`.
    pub fn infer_outcomes_from_action_sequence(
        &self,
        action_results: &[(String, bool, f64, String)], // (trace_id, success, confidence, output)
    ) -> Vec<(String, InferredOutcome)> {
        action_results
            .iter()
            .enumerate() // index before filtering so "next action" stays positional
            .filter(|(_, (trace_id, ..))| !trace_id.is_empty())
            .map(|(idx, (trace_id, succeeded, confidence, output))| {
                let verdict = if !*succeeded {
                    InferredOutcome::Rejected {
                        confidence: *confidence,
                    }
                } else if output.trim().is_empty() {
                    InferredOutcome::Inconclusive
                } else {
                    // Last action: assume accepted if successful.
                    let downstream_ok = action_results
                        .get(idx + 1)
                        .map_or(true, |next| next.1);
                    if downstream_ok {
                        InferredOutcome::Accepted {
                            confidence: *confidence,
                        }
                    } else {
                        InferredOutcome::AcceptedWithEdits {
                            confidence: *confidence * 0.7,
                        }
                    }
                };
                (trace_id.clone(), verdict)
            })
            .collect()
    }

    /// Get the profile for a model, or `None` if nothing has been recorded
    /// or imported for `model_id`.
    pub fn profile(&self, model_id: &str) -> Option<&ModelProfile> {
        self.profiles.get(model_id)
    }

    /// Get all profiles, keyed by model id. Returned as-is: the derived
    /// `quality_per_1k_tokens` is only refreshed by `export_profiles`.
    pub fn all_profiles(&self) -> &HashMap<String, ModelProfile> {
        &self.profiles
    }

    /// Get pending trace IDs (for conversation signal analysis).
    /// Order follows the map's iteration order, i.e. it is unspecified.
    pub fn pending_trace_ids(&self) -> Vec<String> {
        let mut ids = Vec::with_capacity(self.pending.len());
        ids.extend(self.pending.keys().cloned());
        ids
    }

    /// Get a pending outcome by trace_id; `None` once the call has been
    /// resolved, failed, or was never started.
    pub fn get_pending(&self, trace_id: &str) -> Option<&InferenceOutcome> {
        self.pending.get(trace_id)
    }

    /// Export profiles for serialization / persistence. Each profile is
    /// cloned and its derived `quality_per_1k_tokens` recomputed on the way
    /// out, so callers always see a consistent snapshot.
    pub fn export_profiles(&self) -> Vec<ModelProfile> {
        let mut snapshot = Vec::with_capacity(self.profiles.len());
        for profile in self.profiles.values() {
            let mut copy = profile.clone();
            copy.quality_per_1k_tokens = copy.compute_quality_per_1k_tokens();
            snapshot.push(copy);
        }
        snapshot
    }

    /// Import profiles as a genuine mutation (benchmark priors, CLI import
    /// from the memgine fact graph, router merges). An imported profile
    /// replaces any existing one with the same model id. The tracker is
    /// marked dirty so the new profiles reach disk on the next flush.
    /// NOTE: hydration from disk does NOT go through here — see
    /// [`Self::load_from_file`], which inserts directly and stays clean
    /// (loading is not a change).
    pub fn import_profiles(&mut self, profiles: Vec<ModelProfile>) {
        let keyed = profiles
            .into_iter()
            .map(|profile| (profile.model_id.clone(), profile));
        self.profiles.extend(keyed);
        self.dirty = true;
    }

    /// Save profiles to a JSON file for cross-session persistence (#13).
    ///
    /// Atomic: the exported snapshot is written to a sibling temp file,
    /// which is then renamed over the target. This is the durable store —
    /// a torn `write()` (crash mid-write) would corrupt it, and
    /// `load_from_file` treats a parse failure as a hard error, which on
    /// next boot loses *all* history. Temp + rename rules out a partially
    /// written target. Does not touch the dirty flag.
    pub fn save_to_file(&self, path: &std::path::Path) -> Result<(), std::io::Error> {
        let snapshot = self.export_profiles();
        let json = match serde_json::to_string_pretty(&snapshot) {
            Ok(text) => text,
            Err(e) => return Err(std::io::Error::new(std::io::ErrorKind::Other, e)),
        };

        if let Some(dir) = path.parent() {
            std::fs::create_dir_all(dir)?;
        }

        let staging = path.with_extension("json.tmp");
        std::fs::write(&staging, json)?;
        std::fs::rename(&staging, path)
    }

    /// True if a persisted profile has changed since the last save
    /// (the flag is cleared by `save_if_dirty`).
    pub fn is_dirty(&self) -> bool {
        self.dirty
    }

    /// Save only if dirty, clearing the flag on success. Returns whether
    /// a write happened. Lets callers persist cheaply on a timer without
    /// rewriting the whole file when nothing changed. On a failed save the
    /// flag stays set, so the next attempt retries.
    pub fn save_if_dirty(&mut self, path: &std::path::Path) -> Result<bool, std::io::Error> {
        if self.dirty {
            self.save_to_file(path)?;
            self.dirty = false;
            Ok(true)
        } else {
            Ok(false)
        }
    }

    /// Load profiles from a JSON file for cross-session persistence (#13).
    ///
    /// Returns the number of profiles read; a missing file counts as zero.
    /// Loaded profiles replace in-memory ones with the same model id.
    ///
    /// # Errors
    /// Read failures are returned as-is; a file that does not parse as a
    /// profile list is reported as `ErrorKind::InvalidData`.
    pub fn load_from_file(&mut self, path: &std::path::Path) -> Result<usize, std::io::Error> {
        if !path.exists() {
            return Ok(0);
        }
        let raw = std::fs::read_to_string(path)?;
        let loaded: Vec<ModelProfile> = match serde_json::from_str(&raw) {
            Ok(list) => list,
            Err(e) => return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, e)),
        };
        let count = loaded.len();
        // Insert directly (NOT via import_profiles): hydration from disk is
        // not a change, so it must not flip the dirty flag — otherwise the
        // first idle flush would needlessly rewrite the file we just read.
        self.profiles
            .extend(loaded.into_iter().map(|p| (p.model_id.clone(), p)));
        Ok(count)
    }

    /// Apply a quality signal to the model's profile.
    ///
    /// * `quality`, when present, takes one EMA step on both the model-level
    ///   and the per-task quality.
    /// * `success`, when present, updates the success/failure counters,
    ///   taking into account whether `record_complete` already credited a
    ///   mechanical success for this call.
    ///
    /// Always stamps `updated_at`, marks the tracker dirty and queues a
    /// ledger receipt carrying the given `success`/`quality`.
    fn apply_outcome(
        &mut self,
        pending: &InferenceOutcome,
        quality: Option<f64>,
        success: Option<bool>,
    ) {
        let task_key = pending.task.to_string();
        let profile = self
            .profiles
            .entry(pending.model_id.clone())
            .or_insert_with(|| ModelProfile::new(pending.model_id.clone()));

        if let Some(q) = quality {
            let blend = |previous: f64| previous * (1.0 - EMA_ALPHA) + q * EMA_ALPHA;
            profile.ema_quality = blend(profile.ema_quality);
            let ts = profile.task_stats.entry(task_key.clone()).or_default();
            ts.ema_quality = blend(ts.ema_quality);
        }

        match (success, pending.success_credited) {
            // No verdict: counters stay as they are.
            (None, _) => {}
            // Already counted at completion; the quality EMA above is the
            // only new information.
            (Some(true), true) => {}
            (Some(true), false) => {
                profile.success_count += 1;
                profile.task_stats.entry(task_key).or_default().successes += 1;
            }
            (Some(false), credited) => {
                let ts = profile.task_stats.entry(task_key).or_default();
                // A real downstream signal says this call was bad. If it was
                // mechanically credited as a success at completion,
                // reclassify: undo that success before booking the failure.
                if credited {
                    profile.success_count = profile.success_count.saturating_sub(1);
                    ts.successes = ts.successes.saturating_sub(1);
                }
                profile.fail_count += 1;
                ts.failures += 1;
            }
        }

        profile.updated_at = now_unix();
        self.dirty = true;

        let receipt = OutcomeLedgerEntry {
            trace_id: pending.trace_id.clone(),
            model_id: pending.model_id.clone(),
            task: pending.task,
            routing_reason: pending.routing_reason.clone(),
            latency_ms: pending.latency_ms,
            input_tokens: pending.input_tokens,
            output_tokens: pending.output_tokens,
            success,
            quality,
            error: None,
            project_id: None,
            intent: None,
            timestamp: now_unix(),
        };
        self.push_ledger(receipt);
    }

    /// Time-based eviction for the `pending` map, measured against the
    /// current wall clock.
    ///
    /// Only a few resolution paths ever remove entries from `pending`, so a
    /// long-running daemon needs this sweep to keep the map bounded. Entries
    /// older than `ttl_secs` are dropped. Each dropped entry that actually
    /// completed (a latency was recorded) also leaves a terminal ledger
    /// receipt: marked successful when the call had already been credited or
    /// returned output without an error, inconclusive otherwise. Emitting
    /// these receipts keeps the ledger from containing only calls that
    /// happened to receive a follow-up signal, which would skew downstream
    /// statistics toward a non-representative sample.
    ///
    /// Returns how many entries were swept.
    pub fn sweep_pending(&mut self, ttl_secs: u64) -> usize {
        let now = now_unix();
        self.sweep_pending_at(ttl_secs, now)
    }

    /// Clock-injectable core of [`Self::sweep_pending`] for deterministic tests.
    ///
    /// `now` is the Unix timestamp (seconds) to treat as the present. Entries
    /// whose `timestamp` is strictly older than `now - ttl_secs` (saturating
    /// at zero) are removed from `pending`. An evicted entry with a non-zero
    /// `latency_ms` also produces exactly one ledger receipt. Profile updates
    /// and receipts written here are stamped with `now` rather than a fresh
    /// wall-clock read, so the injected clock fully determines the result.
    ///
    /// Returns the number of entries evicted, whether or not they produced a
    /// receipt.
    fn sweep_pending_at(&mut self, ttl_secs: u64, now: u64) -> usize {
        let cutoff = now.saturating_sub(ttl_secs);
        // Collect the keys first: entries cannot be removed from the map
        // while it is being iterated.
        let expired: Vec<String> = self
            .pending
            .iter()
            .filter(|(_, o)| o.timestamp < cutoff)
            .map(|(id, _)| id.clone())
            .collect();
        for id in &expired {
            if let Some(o) = self.pending.remove(id) {
                // Only completed-but-unresolved calls become receipts; a
                // never-completed trace (latency 0) is in-flight/crashed —
                // a zero-everything receipt would be noise.
                if o.latency_ms > 0 {
                    // A swept call completed but never received a downstream
                    // quality signal. Recording *every* such call Inconclusive
                    // (`success: None`) left signal-less models — notably ALL
                    // local inference, which has no accept/edit feedback loop —
                    // pinned at the 0.5 EMA prior with zero successes, so the
                    // health UI rendered working models as "50%"/"0%" (#312).
                    // Instead, credit a *mechanical* success when the call
                    // actually returned output with no error: it ran and
                    // produced tokens; we simply never learned whether the
                    // answer was *good*. `quality` stays None so `ema_quality`
                    // is only ever moved by a real quality signal. A completion
                    // that produced no output stays Inconclusive (ambiguous).
                    // Credit a mechanical success here ONLY if it wasn't
                    // already booked at completion (record_complete now does
                    // this immediately for the common case). A call credited
                    // at completion still gets a `Some(true)` receipt below,
                    // but its count is not incremented twice.
                    let credit_now =
                        o.output_tokens > 0 && o.error.is_none() && !o.success_credited;
                    let was_success = o.success_credited || credit_now;
                    if credit_now {
                        let profile = self
                            .profiles
                            .entry(o.model_id.clone())
                            .or_insert_with(|| ModelProfile::new(o.model_id.clone()));
                        profile.success_count += 1;
                        let ts = profile.task_stats.entry(o.task.to_string()).or_default();
                        ts.successes += 1;
                        // Use the injected clock, not the wall clock.
                        profile.updated_at = now;
                        self.dirty = true;
                    }
                    self.push_ledger(OutcomeLedgerEntry {
                        trace_id: o.trace_id,
                        model_id: o.model_id,
                        task: o.task,
                        routing_reason: o.routing_reason,
                        latency_ms: o.latency_ms,
                        input_tokens: o.input_tokens,
                        output_tokens: o.output_tokens,
                        success: if was_success { Some(true) } else { None },
                        quality: None,
                        error: None,
                        project_id: None,
                        intent: None,
                        // Receipt time is the sweep time supplied by the caller.
                        timestamp: now,
                    });
                }
            }
        }
        expired.len()
    }

    /// Check git diff for pending code suggestions and resolve outcomes.
    ///
    /// Runs `git diff` and `git diff --cached` in `repo_dir`. If `git` cannot
    /// be spawned for the first command, or both diffs are empty, nothing is
    /// recorded. Otherwise every pending entry whose task is
    /// [`InferenceTask::Code`] has an outcome applied via `apply_outcome`.
    ///
    /// Two strategies:
    /// 1. **AST structural diff** (when `ast` feature is enabled): parse the old
    ///    and new versions of changed files and compare at the symbol level.
    ///    This gives precise outcomes: SignatureChanged, BodyModified, SymbolAdded.
    ///    The result is computed once and applied to every pending code trace.
    /// 2. **Text diff fallback**: token matching against the combined git diff.
    ///    Whitespace-separated tokens longer than five bytes taken from the
    ///    entry's `routing_reason` are searched for in the diff; any hit means
    ///    `Applied`, otherwise `Modified`.
    ///
    /// NOTE(review): this method does not remove the entries it resolves from
    /// `pending`, so a later call will apply an outcome to the same traces
    /// again unless something else evicts them first — confirm this is intended.
    pub fn check_git_outcomes(&mut self, repo_dir: &std::path::Path) {
        // Shared runner for both diff invocations. `None` means the `git`
        // process could not be spawned; the exit status is not inspected.
        let run_git = |args: &[&str]| -> Option<String> {
            std::process::Command::new("git")
                .args(args)
                .current_dir(repo_dir)
                .output()
                .ok()
                .map(|out| String::from_utf8_lossy(&out.stdout).into_owned())
        };

        let diff = match run_git(&["diff", "--no-color"]) {
            Some(text) => text,
            None => return,
        };
        // A failure to obtain the staged diff is tolerated: treat it as empty.
        let staged_diff = run_git(&["diff", "--cached", "--no-color"]).unwrap_or_default();

        let combined_diff = format!("{}\n{}", diff, staged_diff);

        if combined_diff.trim().is_empty() {
            return; // No changes at all
        }

        // Try AST structural diff on changed files
        #[cfg(feature = "ast")]
        let ast_outcome = Self::check_git_outcomes_ast(repo_dir);

        // Snapshot the pending code entries up front. `apply_outcome` needs
        // `&mut self`, so the map cannot stay borrowed while outcomes are
        // applied; one clone per entry is the only copy required.
        let code_pending: Vec<_> = self
            .pending
            .values()
            .filter(|o| matches!(o.task, InferenceTask::Code))
            .cloned()
            .collect();

        for pending in &code_pending {
            // Try AST-based outcome first
            #[cfg(feature = "ast")]
            if let Some(ref ast_out) = ast_outcome {
                self.apply_outcome(
                    pending,
                    Some(ast_out.quality_score()),
                    Some(ast_out.is_success()),
                );
                continue;
            }

            // Fallback: text token matching.
            // NOTE(review): the tokens come from `routing_reason`, not from
            // the model's generated output — verify this is the intended
            // text to match against the diff.
            let found_in_diff = pending
                .routing_reason
                .split_whitespace()
                .filter(|t| t.len() > 5)
                .any(|t| combined_diff.contains(t));

            let outcome = if found_in_diff {
                CodeOutcome::Applied
            } else {
                CodeOutcome::Modified
            };

            self.apply_outcome(
                pending,
                Some(outcome.quality_score()),
                Some(outcome.is_success()),
            );
        }
    }

    /// Structural git outcome: compare the `HEAD` and working-tree versions of
    /// each changed file at the symbol level and reduce the result to one
    /// [`CodeOutcome`].
    ///
    /// The file list comes from `git diff --name-only` run in `repo_dir`.
    /// Files whose language `car_ast` does not recognise are skipped. A file
    /// readable in the working tree but absent from `HEAD` counts as an
    /// addition. Removed symbols are treated like signature changes.
    ///
    /// Returns `None` when `git` cannot be run, its output is not UTF-8, no
    /// files changed, or no structural change was found. Otherwise the most
    /// significant change wins: signature change, then body change, then
    /// addition.
    #[cfg(feature = "ast")]
    fn check_git_outcomes_ast(repo_dir: &std::path::Path) -> Option<CodeOutcome> {
        // Names of the files git reports as changed.
        let listing = std::process::Command::new("git")
            .args(["diff", "--name-only"])
            .current_dir(repo_dir)
            .output()
            .ok()?;
        let listing_text = std::str::from_utf8(&listing.stdout).ok()?;
        let changed_files: Vec<&str> = listing_text
            .lines()
            .filter(|name| !name.is_empty())
            .collect();

        if changed_files.is_empty() {
            return None;
        }

        let (mut sig_changed, mut body_changed, mut added) = (false, false, false);

        for file in &changed_files {
            // Skip anything car_ast has no language for.
            if car_ast::Language::from_filename(file).is_none() {
                continue;
            }

            // Committed version; `None` if `git show` fails or is not UTF-8.
            let head_spec = format!("HEAD:{}", file);
            let old_content = match std::process::Command::new("git")
                .args(["show", head_spec.as_str()])
                .current_dir(repo_dir)
                .output()
            {
                Ok(shown) if shown.status.success() => String::from_utf8(shown.stdout).ok(),
                _ => None,
            };

            // Working-tree version; `None` if the file cannot be read.
            let new_content = std::fs::read_to_string(repo_dir.join(file)).ok();

            // Nothing to compare against when the working copy is unreadable.
            let new = match new_content {
                Some(text) => text,
                None => continue,
            };
            // Present on disk but not in HEAD: a new file.
            let old = match old_content {
                Some(text) => text,
                None => {
                    added = true;
                    continue;
                }
            };

            let old_parsed = car_ast::parse_file(&old, file);
            let new_parsed = car_ast::parse_file(&new, file);

            if let (Some(old_p), Some(new_p)) = (old_parsed, new_parsed) {
                let changes = car_ast::diff_symbols(&old_p, &new_p);
                for change in &changes {
                    match change {
                        car_ast::SymbolChange::Added(_) => added = true,
                        car_ast::SymbolChange::Modified {
                            signature_changed, ..
                        } => {
                            // A modification is either a signature change or
                            // a body-only change, never both.
                            sig_changed |= *signature_changed;
                            body_changed |= !*signature_changed;
                        }
                        car_ast::SymbolChange::Removed(_) => sig_changed = true,
                    }
                }
            }
        }

        // Most significant change first; all-false means nothing structural
        // was detected (for example only non-code files changed).
        match (sig_changed, body_changed, added) {
            (true, _, _) => Some(CodeOutcome::SignatureChanged),
            (false, true, _) => Some(CodeOutcome::BodyModified),
            (false, false, true) => Some(CodeOutcome::SymbolAdded),
            (false, false, false) => None,
        }
    }
}

impl Default for OutcomeTracker {
    fn default() -> Self {
        Self::new()
    }
}

/// Current wall-clock time as whole seconds since the Unix epoch.
///
/// Yields `0` if the system clock reports a time before the epoch.
fn now_unix() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs(),
        Err(_) => 0,
    }
}

// Unit tests for `OutcomeTracker`: call lifecycle, success/failure
// accounting, per-task statistics, dirty-flag persistence, the outcome
// ledger, and pending-entry sweeping.
#[cfg(test)]
mod tests {
    use super::*;

    /// Start → complete → `Accepted` signal on a single trace: the profile
    /// shows one call with the recorded latency, one success, and an
    /// `ema_quality` above 0.5.
    #[test]
    fn lifecycle() {
        let mut tracker = OutcomeTracker::new();

        // Start an inference call
        let trace = tracker.record_start(
            "qwen/qwen3-4b:q4_k_m",
            InferenceTask::Code,
            "Code task -> Qwen3-4B",
        );

        // Complete it
        tracker.record_complete(&trace, 1200, 100, 50);

        // Profile should have 1 call
        let profile = tracker.profile("qwen/qwen3-4b:q4_k_m").unwrap();
        assert_eq!(profile.total_calls, 1);
        assert_eq!(profile.avg_latency_ms(), 1200.0);

        // Record positive outcome
        tracker.record_inferred_outcome(&trace, InferredOutcome::Accepted { confidence: 0.9 });

        let profile = tracker.profile("qwen/qwen3-4b:q4_k_m").unwrap();
        assert_eq!(profile.success_count, 1);
        assert!(profile.ema_quality > 0.5); // should have gone up from 0.5
    }

    /// Five `record_failure` calls on one model: five failures, no successes,
    /// `should_degrade(2)` holds, and `ema_quality` falls below 0.3.
    #[test]
    fn failure_degrades() {
        // A failed call goes through record_failure ONLY — it never also
        // hits record_complete (a call either completes or errors, not both).
        // record_complete now credits a mechanical success, so mixing the two
        // on one trace would model a flow that does not occur.
        let mut tracker = OutcomeTracker::new();
        for _ in 0..5 {
            let trace = tracker.record_start("bad-model", InferenceTask::Generate, "test");
            tracker.record_failure(&trace, "timeout");
        }

        let profile = tracker.profile("bad-model").unwrap();
        assert_eq!(profile.fail_count, 5);
        assert_eq!(profile.success_count, 0);
        assert!(profile.should_degrade(2)); // 5 > 0 + 2
        assert!(profile.ema_quality < 0.3); // decayed toward 0
    }

    /// A `CodeOutcome::Applied` on a completed code call leaves one success
    /// and an `ema_quality` of about 0.6.
    #[test]
    fn code_outcome_ground_truth() {
        let mut tracker = OutcomeTracker::new();

        let trace = tracker.record_start("qwen/qwen3-4b:q4_k_m", InferenceTask::Code, "code");
        tracker.record_complete(&trace, 500, 200, 100);
        tracker.record_code_outcome(&trace, CodeOutcome::Applied);

        let profile = tracker.profile("qwen/qwen3-4b:q4_k_m").unwrap();
        assert_eq!(profile.success_count, 1);
        // EMA should reflect Applied quality (1.0): 0.5 * 0.8 + 1.0 * 0.2 = 0.6
        assert!((profile.ema_quality - 0.6).abs() < 0.01);
    }

    /// Calls, successes and failures are bucketed per task: two accepted
    /// `Code` calls and one rejected `Generate` call on the same model.
    #[test]
    fn per_task_stats() {
        let mut tracker = OutcomeTracker::new();

        // Two code calls, one generate call
        for _ in 0..2 {
            let trace = tracker.record_start("m1", InferenceTask::Code, "code");
            tracker.record_complete(&trace, 1000, 100, 50);
            tracker.record_inferred_outcome(&trace, InferredOutcome::Accepted { confidence: 0.8 });
        }
        let trace = tracker.record_start("m1", InferenceTask::Generate, "gen");
        tracker.record_complete(&trace, 500, 50, 25);
        tracker.record_inferred_outcome(&trace, InferredOutcome::Rejected { confidence: 0.9 });

        let profile = tracker.profile("m1").unwrap();
        assert_eq!(profile.total_calls, 3);

        let code_stats = profile.task_stats(InferenceTask::Code).unwrap();
        assert_eq!(code_stats.calls, 2);
        assert_eq!(code_stats.successes, 2);

        let gen_stats = profile.task_stats(InferenceTask::Generate).unwrap();
        assert_eq!(gen_stats.calls, 1);
        assert_eq!(gen_stats.failures, 1);
    }

    /// An exported profile carries `quality_per_1k_tokens`; with 1000 total
    /// tokens and one `Accepted { 1.0 }` signal the value is 0.6.
    #[test]
    fn export_populates_quality_per_1k_tokens() {
        let mut tracker = OutcomeTracker::new();
        let trace = tracker.record_start("m1", InferenceTask::Generate, "test");
        tracker.record_complete(&trace, 100, 800, 200); // 1000 tokens total
        tracker.record_inferred_outcome(&trace, InferredOutcome::Accepted { confidence: 1.0 });

        let exported = tracker.export_profiles();
        assert_eq!(exported.len(), 1);
        let p = &exported[0];
        // ema_quality after one Accepted{1.0}: 0.5 * 0.8 + 1.0 * 0.2 = 0.6
        // quality_per_1k = 0.6 * 1000 / 1000 = 0.6
        assert!(
            (p.quality_per_1k_tokens - 0.6).abs() < 1e-6,
            "got {}",
            p.quality_per_1k_tokens
        );
    }

    /// A freshly constructed profile reports a quality-per-1k-tokens of 0.0.
    #[test]
    fn quality_per_1k_tokens_zero_without_tokens() {
        let profile = ModelProfile::new("x".into());
        assert_eq!(profile.compute_quality_per_1k_tokens(), 0.0);
    }

    /// Dirty-flag contract: a fresh tracker is clean, a recorded call dirties
    /// it, `save_if_dirty` writes once and clears the flag, loading from file
    /// stays clean, and `import_profiles` dirties.
    ///
    /// NOTE(review): uses a fixed directory name under the system temp dir,
    /// so two concurrent runs of this test would share it.
    #[test]
    fn dirty_flag_and_save_if_dirty() {
        let dir = std::env::temp_dir().join("car-outcome-dirty-test");
        let _ = std::fs::remove_dir_all(&dir);
        let path = dir.join("outcome_profiles.json");

        let mut tracker = OutcomeTracker::new();
        // Fresh tracker is clean → save_if_dirty is a no-op.
        assert!(!tracker.is_dirty());
        assert!(!tracker.save_if_dirty(&path).unwrap());
        assert!(!path.exists());

        // A recorded outcome dirties it.
        let trace = tracker.record_start("m1", InferenceTask::Generate, "router");
        tracker.record_complete(&trace, 100, 10, 20);
        assert!(tracker.is_dirty());

        // save_if_dirty writes once and clears the flag.
        assert!(tracker.save_if_dirty(&path).unwrap());
        assert!(path.exists());
        assert!(!tracker.is_dirty());

        // Second call with no new changes does not rewrite.
        assert!(!tracker.save_if_dirty(&path).unwrap());

        // Loading profiles must NOT mark the tracker dirty.
        let mut fresh = OutcomeTracker::new();
        fresh.load_from_file(&path).unwrap();
        assert!(!fresh.is_dirty());

        // But a genuine import (benchmark priors / CLI) MUST dirty, so the
        // imported profiles actually persist on the next flush.
        fresh.import_profiles(vec![ModelProfile::new("seeded".into())]);
        assert!(fresh.is_dirty());

        let _ = std::fs::remove_dir_all(&dir);
    }

    /// One accepted call and one failed call yield two ledger receipts that
    /// survive a drain → append → read round trip with their fields intact;
    /// `read_ledger(path, 1)` returns a single entry.
    ///
    /// NOTE(review): uses a fixed directory name under the system temp dir,
    /// so two concurrent runs of this test would share it.
    #[test]
    fn ledger_captures_resolved_outcomes() {
        let dir = std::env::temp_dir().join("car-outcome-ledger-test");
        let _ = std::fs::remove_dir_all(&dir);
        let path = dir.join("outcome_ledger.jsonl");

        let mut tracker = OutcomeTracker::new();

        // A success with a quality signal → one resolved receipt.
        let t1 = tracker.record_start("good-model", InferenceTask::Generate, "router:test");
        tracker.record_complete(&t1, 1200, 50, 100);
        tracker.record_inferred_outcome(&t1, InferredOutcome::Accepted { confidence: 0.9 });

        // A failure → one resolved receipt with the error.
        let t2 = tracker.record_start("bad-model", InferenceTask::Code, "router:test");
        tracker.record_failure(&t2, "boom: 500");

        let drained = tracker.drain_ledger();
        assert_eq!(drained.len(), 2);
        assert!(tracker.drain_ledger().is_empty(), "drain clears the buffer");

        append_ledger_entries(&path, &drained).unwrap();
        let read = read_ledger(&path, 0);
        assert_eq!(read.len(), 2);

        let good = read.iter().find(|e| e.model_id == "good-model").unwrap();
        assert_eq!(good.success, Some(true));
        assert!(good.quality.is_some());
        assert_eq!(good.routing_reason, "router:test");
        assert_eq!(good.latency_ms, 1200);

        let bad = read.iter().find(|e| e.model_id == "bad-model").unwrap();
        assert_eq!(bad.success, Some(false));
        assert_eq!(bad.error.as_deref(), Some("boom: 500"));

        // Read back limited to most recent 1.
        assert_eq!(read_ledger(&path, 1).len(), 1);

        let _ = std::fs::remove_dir_all(&dir);
    }

    /// A 5000-character error is truncated when captured in the ledger, and
    /// `prune_ledger(path, 3)` keeps only the last three of ten entries.
    ///
    /// NOTE(review): uses a fixed directory name under the system temp dir,
    /// so two concurrent runs of this test would share it.
    #[test]
    fn ledger_redacts_long_errors_and_prunes() {
        let dir = std::env::temp_dir().join("car-outcome-privacy-test");
        let _ = std::fs::remove_dir_all(&dir);
        let path = dir.join("outcome_ledger.jsonl");

        // Long error is truncated on capture.
        let mut tracker = OutcomeTracker::new();
        let t = tracker.record_start("m", InferenceTask::Generate, "r");
        let huge = "x".repeat(5000);
        tracker.record_failure(&t, &huge);
        let drained = tracker.drain_ledger();
        let err = drained[0].error.as_ref().unwrap();
        assert!(err.chars().count() <= MAX_LEDGER_ERROR_CHARS + 1, "error truncated");

        // Pruning keeps only the most recent N.
        let entries: Vec<OutcomeLedgerEntry> = (0..10)
            .map(|i| OutcomeLedgerEntry {
                trace_id: format!("t{i}"),
                model_id: "m".into(),
                task: InferenceTask::Generate,
                routing_reason: "r".into(),
                latency_ms: 1,
                input_tokens: 1,
                output_tokens: 1,
                success: Some(true),
                quality: Some(1.0),
                error: None,
                project_id: None,
                intent: None,
                timestamp: i,
            })
            .collect();
        append_ledger_entries(&path, &entries).unwrap();
        prune_ledger(&path, 3).unwrap();
        let kept = read_ledger(&path, 0);
        assert_eq!(kept.len(), 3);
        assert_eq!(kept[0].trace_id, "t7"); // most recent 3: t7,t8,t9
        assert_eq!(kept[2].trace_id, "t9");

        let _ = std::fs::remove_dir_all(&dir);
    }

    /// Sweeping three pending entries (completed with output, completed
    /// without output, never completed) evicts all three but emits only two
    /// receipts: `Some(true)` for the call with output and `None` for the
    /// one without. `ema_quality` stays at 0.5.
    #[test]
    fn sweep_pending_credits_mechanical_success_or_inconclusive() {
        let mut tracker = OutcomeTracker::new();

        // Completed with output, no signal -> mechanical success (#312).
        let t1 = tracker.record_start("m", InferenceTask::Generate, "r");
        tracker.record_complete(&t1, 500, 10, 20);
        // Completed but produced NO output -> stays Inconclusive.
        let t2 = tracker.record_start("m", InferenceTask::Generate, "r");
        tracker.record_complete(&t2, 300, 5, 0);
        // A never-completed call (in-flight / crashed) -> no receipt.
        let _t3 = tracker.record_start("m", InferenceTask::Generate, "r");

        // Evaluate the sweep against a clock 10s in the future so the
        // same-second pending entries are unambiguously past the cutoff.
        let swept = tracker.sweep_pending_at(0, now_unix() + 10);
        assert_eq!(swept, 3, "all pending entries evicted");

        // Only the two completed calls become receipts.
        let mut receipts = tracker.drain_ledger();
        receipts.sort_by_key(|r| r.latency_ms);
        assert_eq!(receipts.len(), 2);
        // 300ms, no output -> Inconclusive
        assert_eq!(receipts[0].latency_ms, 300);
        assert_eq!(receipts[0].success, None);
        // 500ms, produced output -> mechanical success, quality untouched
        assert_eq!(receipts[1].latency_ms, 500);
        assert_eq!(receipts[1].success, Some(true));
        assert_eq!(receipts[1].quality, None);

        // success_count is credited; ema_quality stays the neutral prior
        // (no real quality signal ever arrived).
        let p = tracker.profile("m").expect("profile exists");
        assert_eq!(p.success_count, 1);
        assert_eq!(p.ema_quality, 0.5);
    }

    /// `record_complete` with output credits the success right away, and a
    /// subsequent sweep of the same trace does not credit it a second time.
    #[test]
    fn record_complete_credits_success_immediately() {
        // The headline fix: a call that completes with output is credited a
        // success at completion, not deferred to the 300s sweep — so a
        // short-lived process (CLI) that exits before any sweep still records
        // the success. Previously success_count stayed 0 and health read 0.5.
        let mut tracker = OutcomeTracker::new();
        let t = tracker.record_start("m", InferenceTask::Generate, "r");
        tracker.record_complete(&t, 500, 12, 20);

        let p = tracker.profile("m").expect("profile exists");
        assert_eq!(p.success_count, 1, "success credited at completion");
        assert_eq!(p.total_calls, 1);
        assert_eq!(p.total_input_tokens, 12, "input tokens recorded, not 0");
        assert_eq!(p.fail_count, 0);

        // A later sweep of the same (still-pending) call must NOT double-count.
        tracker.sweep_pending_at(0, now_unix() + 10);
        let p = tracker.profile("m").unwrap();
        assert_eq!(p.success_count, 1, "sweep does not re-credit");
    }

    /// A completion with zero output tokens counts as a call but not as a
    /// success.
    #[test]
    fn record_complete_no_output_is_not_a_success() {
        let mut tracker = OutcomeTracker::new();
        let t = tracker.record_start("m", InferenceTask::Generate, "r");
        tracker.record_complete(&t, 300, 5, 0); // no output
        let p = tracker.profile("m").unwrap();
        assert_eq!(p.success_count, 0, "no output -> no mechanical success");
        assert_eq!(p.total_calls, 1);
    }

    /// Failed calls are included in `total_calls`, so `fail_count` can never
    /// exceed it.
    #[test]
    fn record_failure_counts_total_calls() {
        // A failed call is still a call. total_calls must include it so it can
        // never read smaller than fail_count (the parslee/* 6-calls/14-fails
        // symptom).
        let mut tracker = OutcomeTracker::new();
        for _ in 0..3 {
            let t = tracker.record_start("m", InferenceTask::Generate, "r");
            tracker.record_failure(&t, "boom 500");
        }
        let p = tracker.profile("m").unwrap();
        assert_eq!(p.fail_count, 3);
        assert_eq!(p.total_calls, 3, "failures counted in total_calls");
        assert!(p.fail_count <= p.total_calls);
    }

    /// A `Rejected` signal after a mechanically credited completion removes
    /// the success and records a failure in its place.
    #[test]
    fn real_failure_signal_reclassifies_mechanical_success() {
        // Completed with output -> mechanical success. A later Rejected signal
        // overrides it: undo the success, book a failure. No phantom success.
        let mut tracker = OutcomeTracker::new();
        let t = tracker.record_start("m", InferenceTask::Generate, "r");
        tracker.record_complete(&t, 100, 8, 15);
        assert_eq!(tracker.profile("m").unwrap().success_count, 1);

        tracker.record_inferred_outcome(&t, InferredOutcome::Rejected { confidence: 0.9 });
        let p = tracker.profile("m").unwrap();
        assert_eq!(p.success_count, 0, "mechanical success undone");
        assert_eq!(p.fail_count, 1, "failure booked");
    }

    /// An `Accepted` signal on a call already credited at completion leaves
    /// `success_count` at 1.
    #[test]
    fn real_success_signal_does_not_double_count() {
        let mut tracker = OutcomeTracker::new();
        let t = tracker.record_start("m", InferenceTask::Generate, "r");
        tracker.record_complete(&t, 100, 8, 15);
        tracker.record_inferred_outcome(&t, InferredOutcome::Accepted { confidence: 0.9 });
        let p = tracker.profile("m").unwrap();
        assert_eq!(p.success_count, 1, "Accepted on an already-credited call is not +2");
    }

    /// Profiles exported from one tracker can be imported into another and
    /// are then visible through `profile`.
    #[test]
    fn export_import() {
        let mut tracker = OutcomeTracker::new();
        let trace = tracker.record_start("m1", InferenceTask::Generate, "test");
        tracker.record_complete(&trace, 100, 10, 5);
        tracker.record_inferred_outcome(&trace, InferredOutcome::Accepted { confidence: 0.9 });

        let exported = tracker.export_profiles();
        assert_eq!(exported.len(), 1);

        let mut tracker2 = OutcomeTracker::new();
        tracker2.import_profiles(exported);
        assert!(tracker2.profile("m1").is_some());
    }
}