car-inference 0.13.0

Local model inference for CAR — Candle backend with Qwen3 models
//! Adaptive model routing — three-phase routing with learned performance profiles.
//!
//! Phase 1: **Filter** — hard constraints (capability, availability, memory, cost).
//! Phase 2: **Score** — blend quality, latency, and cost using observed profiles
//!          or schema defaults on cold start.
//! Phase 3: **Select** — Thompson Sampling (Beta distribution per model) for
//!          natural exploration-exploitation balance. Models with fewer observations
//!          have wider distributions, giving them chances to prove themselves.
//!
//! Replaces the hardcoded `ModelRouter` from `router.rs`.
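//!
//! A minimal usage sketch (hypothetical setup: `hw`, `registry`, and `tracker`
//! are assumed to be built elsewhere):
//!
//! ```ignore
//! let router = AdaptiveRouter::with_default_config(hw);
//! let decision = router.route("Compare these two designs", &registry, &tracker);
//! println!("{} via {:?}", decision.model_name, decision.strategy);
//! ```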

use rand::Rng;
use serde::{Deserialize, Serialize};

use std::sync::{Arc, Mutex};

use crate::hardware::HardwareInfo;
use crate::outcome::{InferenceTask, OutcomeTracker};
use crate::registry::UnifiedRegistry;
use crate::routing_ext::CircuitBreakerRegistry;
use crate::schema::{ModelCapability, ModelSchema};
use crate::tasks::RoutingWorkload;

/// Prompt complexity assessment (migrated from router.rs).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum TaskComplexity {
    Simple,
    Medium,
    Code,
    Complex,
}

impl TaskComplexity {
    /// Assess complexity of a prompt string.
    ///
    /// Uses tree-sitter AST parsing (when the `ast` feature is enabled) for
    /// accurate code detection: if a code block parses successfully as any
    /// supported language, it's definitively code. Falls back to keyword
    /// heuristics for prompts that mention code without containing code blocks.
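    ///
    /// A quick sketch of the buckets (illustrative, not a doctest):
    /// ```ignore
    /// assert_eq!(TaskComplexity::assess("What is a mutex?"), TaskComplexity::Simple);
    /// assert_eq!(TaskComplexity::assess("fix the failing test in parser.rs"), TaskComplexity::Code);
    /// ```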
    pub fn assess(prompt: &str) -> Self {
        let lower = prompt.to_lowercase();
        let word_count = prompt.split_whitespace().count();
        let estimated_tokens = (word_count as f64 * 1.3) as usize;

        let has_code = Self::detect_code(prompt);

        let repair_markers = [
            "fix", "repair", "debug", "refactor", "broken", "failing", "error", "bug",
        ];
        let has_repair = repair_markers.iter().any(|m| lower.contains(m));

        let reasoning_markers = [
            "analyze",
            "compare",
            "explain why",
            "step by step",
            "think through",
            "evaluate",
            "trade-off",
            "tradeoff",
            "pros and cons",
            "architecture",
            "design",
            "strategy",
            "optimize",
            "comprehensive",
        ];
        let has_reasoning = reasoning_markers.iter().any(|m| lower.contains(m));

        let simple_patterns = [
            "what is",
            "who is",
            "when did",
            "where is",
            "how many",
            "yes or no",
            "true or false",
            "name the",
            "list the",
            "define ",
        ];
        let is_simple = simple_patterns.iter().any(|p| lower.contains(p));

        if has_code || has_repair {
            TaskComplexity::Code
        } else if has_reasoning || estimated_tokens > 500 {
            TaskComplexity::Complex
        } else if is_simple || estimated_tokens < 30 {
            TaskComplexity::Simple
        } else {
            TaskComplexity::Medium
        }
    }

    /// Detect whether a prompt contains code.
    ///
    /// With the `ast` feature: extracts triple-backtick-fenced code blocks and
    /// attempts to parse each with tree-sitter. If any parses into symbols,
    /// it's real code. Without the `ast` feature: falls back to keyword heuristics.
    fn detect_code(prompt: &str) -> bool {
        // First try AST-based detection on code blocks
        #[cfg(feature = "ast")]
        {
            if let Some(is_code) = Self::detect_code_ast(prompt) {
                return is_code;
            }
        }

        // Fallback: keyword heuristics
        let code_markers = [
            "```",
            "fn ",
            "def ",
            "class ",
            "import ",
            "require(",
            "async fn",
            "pub fn",
            "function ",
            "const ",
            "let ",
            "var ",
            "#include",
            "package ",
            "impl ",
        ];
        code_markers.iter().any(|m| prompt.contains(m))
    }

    /// AST-based code detection: parse code blocks with tree-sitter.
    /// Returns Some(true) if code found, Some(false) if blocks exist but
    /// don't parse, None if no code blocks found (fall through to heuristics).
    #[cfg(feature = "ast")]
    fn detect_code_ast(prompt: &str) -> Option<bool> {
        // Extract code blocks between ``` markers
        let mut blocks = Vec::new();
        let mut rest = prompt;
        while let Some(start) = rest.find("```") {
            let after_fence = &rest[start + 3..];
            // Skip optional language tag on the opening fence
            let code_start = after_fence.find('\n').map(|i| i + 1).unwrap_or(0);
            if let Some(end) = after_fence[code_start..].find("```") {
                blocks.push(&after_fence[code_start..code_start + end]);
                rest = &after_fence[code_start + end + 3..];
            } else {
                break;
            }
        }

        if blocks.is_empty() {
            return None; // No code blocks — let heuristics decide
        }

        // Try to parse each block with tree-sitter
        let languages = [
            car_ast::Language::Rust,
            car_ast::Language::Python,
            car_ast::Language::TypeScript,
            car_ast::Language::JavaScript,
            car_ast::Language::Go,
        ];

        for block in &blocks {
            let trimmed = block.trim();
            if trimmed.is_empty() {
                continue;
            }

            for lang in &languages {
                if let Some(parsed) = car_ast::parse(trimmed, *lang) {
                    // If it parsed into any symbols, it's definitely code
                    if !parsed.symbols.is_empty() {
                        return Some(true);
                    }
                }
            }
        }

        // Had code blocks but none parsed into symbols — could be
        // pseudocode, output, or unsupported language
        Some(false)
    }

    /// Map complexity to required capabilities.
    pub fn required_capabilities(&self) -> Vec<ModelCapability> {
        match self {
            TaskComplexity::Simple => vec![ModelCapability::Generate],
            TaskComplexity::Medium => vec![ModelCapability::Generate],
            TaskComplexity::Code => vec![ModelCapability::Code],
            TaskComplexity::Complex => vec![ModelCapability::Reasoning],
        }
    }

    /// Map complexity to the InferenceTask type.
    pub fn inference_task(&self) -> InferenceTask {
        match self {
            TaskComplexity::Simple | TaskComplexity::Medium => InferenceTask::Generate,
            TaskComplexity::Code => InferenceTask::Code,
            TaskComplexity::Complex => InferenceTask::Reasoning,
        }
    }
}

/// Configuration for routing behavior.
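///
/// A sketch of overriding a single knob via struct update (all field names as
/// defined below):
/// ```ignore
/// let cfg = RoutingConfig { max_latency_ms: Some(2_000), ..RoutingConfig::default() };
/// ```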
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RoutingConfig {
    /// Minimum observations before trusting a model's profile over schema defaults.
    pub min_observations: u64,
    /// Scoring weights (must sum to 1.0).
    pub quality_weight: f64,
    pub latency_weight: f64,
    pub cost_weight: f64,
    /// Hard constraint: maximum latency budget in ms.
    pub max_latency_ms: Option<u64>,
    /// Hard constraint: maximum cost per call in USD.
    pub max_cost_usd: Option<f64>,
    /// Prefer local models over remote (all else being equal).
    pub prefer_local: bool,
    /// Thompson Sampling prior strength. Higher = more weight on the Phase 2 score
    /// as a prior, lower = more influenced by observed outcomes.
    /// Equivalent to the number of "virtual" observations from the prior.
    pub prior_strength: f64,
    /// Prefer trusted remote models for quality-critical tasks until local models
    /// have enough task-specific evidence to be promoted.
    pub quality_first_cold_start: bool,
    /// Minimum task-specific observations required before a local model can
    /// compete with trusted remote models during cold start.
    pub bootstrap_min_task_observations: u64,
    /// Minimum task-specific EMA quality required before a local model can
    /// displace trusted remote models during cold start.
    pub bootstrap_quality_floor: f64,
}

impl Default for RoutingConfig {
    fn default() -> Self {
        Self {
            min_observations: 2,
            quality_weight: 0.45,
            latency_weight: 0.4,
            cost_weight: 0.15,
            max_latency_ms: None,
            max_cost_usd: None,
            prefer_local: true,
            prior_strength: 2.0,
            quality_first_cold_start: true,
            bootstrap_min_task_observations: 8,
            bootstrap_quality_floor: 0.8,
        }
    }
}

/// How a model was selected.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum RoutingStrategy {
    /// Using declared schema capabilities (no observed data).
    SchemaBased,
    /// Using observed performance profiles (exploitation).
    ProfileBased,
    /// Deliberately trying an under-tested model (exploration).
    Exploration,
    /// User explicitly specified the model.
    Explicit,
}

/// The result of adaptive routing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AdaptiveRoutingDecision {
    /// Selected model id.
    pub model_id: String,
    /// Selected model name (display).
    pub model_name: String,
    /// Task type.
    pub task: InferenceTask,
    /// Assessed complexity.
    pub complexity: TaskComplexity,
    /// Human-readable reason.
    pub reason: String,
    /// How the model was selected.
    pub strategy: RoutingStrategy,
    /// Predicted quality (0.0-1.0).
    pub predicted_quality: f64,
    /// Fallback chain (ordered list of alternative model ids).
    pub fallbacks: Vec<String>,
    /// Context window of the selected model (tokens). 0 = unknown.
    pub context_length: usize,
    /// Whether the prompt needs compaction to fit the selected model's context window.
    pub needs_compaction: bool,
}

/// Adaptive router with three-phase model selection.
pub struct AdaptiveRouter {
    hw: HardwareInfo,
    config: RoutingConfig,
    /// Circuit breaker registry — blocks models after consecutive failures (#25).
    pub circuit_breakers: Arc<Mutex<CircuitBreakerRegistry>>,
}

/// All inputs to a routing decision packed into one struct so adding
/// a new combinator (per-tenant routing, streaming awareness, …)
/// only adds a field rather than another `route_*` sibling method.
/// See [`AdaptiveRouter::route_with`].
///
/// Use [`RouteRequest::new`] for the common defaults, then mutate
/// just the fields the caller cares about:
///
/// ```ignore
/// let decision = router.route_with(RouteRequest {
///     has_tools: true,
///     intent: Some(&hint),
///     ..RouteRequest::new(prompt, &registry, &tracker)
/// });
/// ```
pub struct RouteRequest<'a> {
    pub prompt: &'a str,
    pub registry: &'a UnifiedRegistry,
    pub tracker: &'a OutcomeTracker,
    /// Estimated prompt-side token count for context-window-aware
    /// scoring. `0` skips the compaction-headroom check.
    pub estimated_total_tokens: usize,
    pub has_tools: bool,
    pub has_vision: bool,
    pub workload: RoutingWorkload,
    /// Caller-supplied intent hint. `prefer_fast: true` overrides `workload`
    /// to [`RoutingWorkload::Fastest`]; failing that, `prefer_local: true`
    /// overrides it to [`RoutingWorkload::LocalPreferred`].
    pub intent: Option<&'a crate::intent::IntentHint>,
}

impl<'a> RouteRequest<'a> {
    /// Build a request with the same defaults the bare
    /// [`AdaptiveRouter::route`] uses: interactive workload, no tools,
    /// no vision, no context-aware sizing, no intent.
    pub fn new(
        prompt: &'a str,
        registry: &'a UnifiedRegistry,
        tracker: &'a OutcomeTracker,
    ) -> Self {
        Self {
            prompt,
            registry,
            tracker,
            estimated_total_tokens: 0,
            has_tools: false,
            has_vision: false,
            workload: RoutingWorkload::Interactive,
            intent: None,
        }
    }
}

impl AdaptiveRouter {
    pub fn new(hw: HardwareInfo, config: RoutingConfig) -> Self {
        let circuit_breakers = Arc::new(Mutex::new(
            CircuitBreakerRegistry::new(3, 300), // 3 failures, 5 min cooldown
        ));
        Self {
            hw,
            config,
            circuit_breakers,
        }
    }

    pub fn with_default_config(hw: HardwareInfo) -> Self {
        Self::new(hw, RoutingConfig::default())
    }

    pub fn config(&self) -> &RoutingConfig {
        &self.config
    }

    pub fn set_config(&mut self, config: RoutingConfig) {
        self.config = config;
    }

    /// Canonical entry point. The seven `route_*` sibling methods
    /// each build a `RouteRequest` with their fixed defaults and
    /// call this — adding a new combinator (per-tenant routing,
    /// streaming awareness, …) only adds a field here, not another
    /// public method. Closes #108.
    pub fn route_with(&self, req: RouteRequest<'_>) -> AdaptiveRoutingDecision {
        // Intent-hint precedence over the caller-supplied workload:
        // `prefer_fast` wins outright (voice fast track is the most
        // latency-sensitive path we have); `prefer_local` is the
        // long-standing override; absent either, the caller's
        // workload stands.
        let workload = match req.intent {
            Some(h) if h.prefer_fast => RoutingWorkload::Fastest,
            Some(h) if h.prefer_local => RoutingWorkload::LocalPreferred,
            _ => req.workload,
        };
        self.route_inner_with_intent(
            req.prompt,
            req.registry,
            req.tracker,
            req.has_tools,
            req.has_vision,
            req.estimated_total_tokens,
            workload,
            req.intent,
        )
    }

    /// Route a generation request to the best model.
    /// If `has_tools` is true, requires ToolUse capability (#13).
    pub fn route(
        &self,
        prompt: &str,
        registry: &UnifiedRegistry,
        tracker: &OutcomeTracker,
    ) -> AdaptiveRoutingDecision {
        self.route_with(RouteRequest::new(prompt, registry, tracker))
    }

    /// Route an "editor" request — cheap mechanical work (context compaction,
    /// edit materialization, title generation, replanning). Uses
    /// [`RoutingWorkload::Background`] so cost/quality weights favour cheaper
    /// models, and lets hosting layers express the Aider-style architect/editor
    /// split without plumbing a raw model-name override through every layer.
    pub fn route_editor(
        &self,
        prompt: &str,
        registry: &UnifiedRegistry,
        tracker: &OutcomeTracker,
    ) -> AdaptiveRoutingDecision {
        self.route_with(RouteRequest {
            workload: RoutingWorkload::Background,
            ..RouteRequest::new(prompt, registry, tracker)
        })
    }

    /// Route with tool_use requirement — filters to models that support structured tool calls.
    pub fn route_with_tools(
        &self,
        prompt: &str,
        registry: &UnifiedRegistry,
        tracker: &OutcomeTracker,
    ) -> AdaptiveRoutingDecision {
        self.route_with(RouteRequest {
            has_tools: true,
            ..RouteRequest::new(prompt, registry, tracker)
        })
    }

    /// Route with image input requirement — filters to models that support vision.
    pub fn route_with_vision(
        &self,
        prompt: &str,
        registry: &UnifiedRegistry,
        tracker: &OutcomeTracker,
        has_tools: bool,
    ) -> AdaptiveRoutingDecision {
        self.route_with(RouteRequest {
            has_tools,
            has_vision: true,
            ..RouteRequest::new(prompt, registry, tracker)
        })
    }

    /// Route with caller-supplied intent — see [`crate::IntentHint`].
    /// The hint can override the auto-detected `InferenceTask`, add hard
    /// `require` capability filters on top of the prompt-derived ones,
    /// and bias the score profile toward local models when
    /// `prefer_local` is set.
    pub fn route_with_intent<'a>(
        &self,
        prompt: &'a str,
        registry: &'a UnifiedRegistry,
        tracker: &'a OutcomeTracker,
        intent: &'a crate::intent::IntentHint,
    ) -> AdaptiveRoutingDecision {
        self.route_with(RouteRequest {
            intent: Some(intent),
            ..RouteRequest::new(prompt, registry, tracker)
        })
    }

    /// Route with context awareness — takes the caller's estimated prompt token
    /// count and prefers models whose context window fits the full prompt
    /// without compaction.
    pub fn route_context_aware(
        &self,
        prompt: &str,
        estimated_total_tokens: usize,
        registry: &UnifiedRegistry,
        tracker: &OutcomeTracker,
        has_tools: bool,
        has_vision: bool,
        workload: RoutingWorkload,
    ) -> AdaptiveRoutingDecision {
        self.route_with(RouteRequest {
            estimated_total_tokens,
            has_tools,
            has_vision,
            workload,
            ..RouteRequest::new(prompt, registry, tracker)
        })
    }

    /// Context-aware routing with caller-supplied intent. Same context
    /// math as [`Self::route_context_aware`]; the intent layers on
    /// top — task override, additional `require` filters,
    /// `prefer_local` workload override.
    pub fn route_context_aware_with_intent<'a>(
        &self,
        prompt: &'a str,
        estimated_total_tokens: usize,
        registry: &'a UnifiedRegistry,
        tracker: &'a OutcomeTracker,
        has_tools: bool,
        has_vision: bool,
        workload: RoutingWorkload,
        intent: &'a crate::intent::IntentHint,
    ) -> AdaptiveRoutingDecision {
        self.route_with(RouteRequest {
            estimated_total_tokens,
            has_tools,
            has_vision,
            workload,
            intent: Some(intent),
            ..RouteRequest::new(prompt, registry, tracker)
        })
    }

    fn route_inner_with_intent(
        &self,
        prompt: &str,
        registry: &UnifiedRegistry,
        tracker: &OutcomeTracker,
        has_tools: bool,
        has_vision: bool,
        estimated_total_tokens: usize,
        workload: RoutingWorkload,
        intent: Option<&crate::intent::IntentHint>,
    ) -> AdaptiveRoutingDecision {
        let complexity = TaskComplexity::assess(prompt);
        // Caller intent overrides the prompt-derived task when supplied.
        let task = intent
            .and_then(|h| h.task)
            .map(task_hint_to_inference_task)
            .unwrap_or_else(|| complexity.inference_task());
        let mut required_caps = complexity.required_capabilities();
        if let Some(hint) = intent {
            for cap in &hint.require {
                if !required_caps.contains(cap) {
                    required_caps.push(*cap);
                }
            }
        }
        if has_vision {
            required_caps.push(ModelCapability::Vision);
        }
        if has_tools {
            required_caps.push(ModelCapability::ToolUse);
            // Detect multi-step prompts that need multiple tool calls in one response.
            // Patterns: numbered lists ("1) ... 2) ..."), multiple instructions, explicit multi-edit.
            if Self::needs_multi_tool_call(prompt) {
                required_caps.push(ModelCapability::MultiToolCall);
            }
        }

        // Phase 1: Filter candidates
        let mut candidates = self.filter_candidates(&required_caps, registry, tracker);

        // Fallback: if requiring MultiToolCall eliminates all candidates, drop the
        // requirement and let the best ToolUse model handle it with multiple round-trips.
        if candidates.is_empty() && required_caps.contains(&ModelCapability::MultiToolCall) {
            required_caps.retain(|c| *c != ModelCapability::MultiToolCall);
            candidates = self.filter_candidates(&required_caps, registry, tracker);
        }

        if candidates.is_empty() {
            // Nothing available — return the schema-based default
            return self.cold_start_decision(complexity, task, registry, has_vision);
        }

        candidates = self.apply_quality_first_bootstrap_policy(
            candidates, task, tracker, has_vision, has_tools, workload,
        );

        // Context-aware filtering: if we know the prompt size, prefer models that fit.
        // Phase 1b: separate candidates into "fits" and "needs compaction" groups.
        let (fits, needs_compaction_candidates) = if estimated_total_tokens > 0 {
            let mut fits = Vec::new();
            let mut tight = Vec::new();
            for m in &candidates {
                if m.context_length == 0 || m.context_length >= estimated_total_tokens {
                    fits.push(m.clone());
                } else {
                    tight.push(m.clone());
                }
            }
            (fits, tight)
        } else {
            (candidates.clone(), Vec::new())
        };

        // Prefer models that fit; fall back to compaction-required models if none fit
        let (scoring_candidates, compaction_needed) = if !fits.is_empty() {
            (fits, false)
        } else if !needs_compaction_candidates.is_empty() {
            tracing::info!(
                prompt_tokens = estimated_total_tokens,
                candidates = needs_compaction_candidates.len(),
                "no model fits full prompt — compaction will be needed"
            );
            (needs_compaction_candidates.clone(), true)
        } else {
            (candidates.clone(), false)
        };

        // Phase 2: Score candidates (with context headroom bonus)
        let scored = self.score_candidates_context_aware(
            &scoring_candidates,
            task,
            tracker,
            estimated_total_tokens,
            workload,
        );

        // Phase 3: Thompson Sampling selection
        let (selected_id, strategy) = self.select_with_thompson_sampling(&scored, tracker);

        // Build fallback chain: prefer models that fit, then compaction candidates
        let mut fallbacks: Vec<String> = scored
            .iter()
            .filter(|(id, _)| *id != selected_id)
            .map(|(id, _)| id.clone())
            .collect();
        // Add compaction candidates to the end of the fallback chain
        if !compaction_needed {
            for m in &needs_compaction_candidates {
                if m.id != selected_id && !fallbacks.contains(&m.id) {
                    fallbacks.push(m.id.clone());
                }
            }
        }

        let predicted_quality = scored
            .iter()
            .find(|(id, _)| *id == selected_id)
            .map(|(_, score)| *score)
            .unwrap_or(0.5);

        let selected_schema = registry
            .get(&selected_id)
            .or_else(|| registry.find_by_name(&selected_id));
        let model_name = selected_schema
            .map(|m| m.name.clone())
            .unwrap_or_else(|| selected_id.clone());
        let context_length = selected_schema.map(|m| m.context_length).unwrap_or(0);

        let needs_compact = compaction_needed
            || (estimated_total_tokens > 0
                && context_length > 0
                && estimated_total_tokens > context_length);

        let compaction_note = if needs_compact {
            format!(
                " [compaction needed: {}{}tok]",
                estimated_total_tokens, context_length
            )
        } else {
            String::new()
        };

        let reason = format!(
            "{:?} task → {} via {:?} (quality: {:.2}, {} candidates){}",
            complexity,
            model_name,
            strategy,
            predicted_quality,
            scoring_candidates.len(),
            compaction_note,
        );

        AdaptiveRoutingDecision {
            model_id: selected_id,
            model_name,
            task,
            complexity,
            reason,
            strategy,
            predicted_quality,
            fallbacks,
            context_length,
            needs_compaction: needs_compact,
        }
    }

    /// Route to the best embedding model.
    pub fn route_embedding(&self, registry: &UnifiedRegistry) -> String {
        let embed_models = registry.query_by_capability(ModelCapability::Embed);
        embed_models
            .first()
            .map(|m| m.name.clone())
            .unwrap_or_else(|| "Qwen3-Embedding-0.6B".to_string())
    }

    /// Route to the smallest available model (for classification).
    pub fn route_small(&self, registry: &UnifiedRegistry) -> String {
        let gen_models = registry.query_by_capability(ModelCapability::Generate);
        // Pick smallest by size
        gen_models
            .iter()
            .filter(|m| m.is_local())
            .min_by_key(|m| m.size_mb())
            .map(|m| m.name.clone())
            .unwrap_or_else(|| "Qwen3-0.6B".to_string())
    }

    // --- Scoring constants ---

    /// Latency ceiling: requests taking longer than this score 0.0.
    const LATENCY_CEILING_MS: f64 = 10000.0;
    /// TPS ceiling: models faster than this score 1.0.
    const _TPS_CEILING: f64 = 150.0;
    /// MoE throughput penalty for Candle: naive expert routing runs at ~10% of declared TPS.
    const MOE_TPS_MULTIPLIER: f64 = 0.10;
    /// MoE throughput multiplier for MLX: fused Metal kernels run at ~50% of declared TPS.
    const MLX_MOE_TPS_MULTIPLIER: f64 = 0.50;
    /// Cost ceiling: models costing more than this per 1K output tokens score 0.0.
    const COST_CEILING_PER_1K: f64 = 0.1;
    /// Local preference bonus added to the weighted score (before normalization).
    const LOCAL_BONUS: f64 = 0.15;
    /// True on platforms where local inference has a GPU/NPU backend
    /// (Apple Silicon with MLX). On Intel Macs and on hosts built with
    /// `--cfg=car_skip_mlx`, local inference falls through to candle/
    /// CPU which is far slower than cloud for non-trivial models. The
    /// router suppresses LOCAL_BONUS when this is false so cloud
    /// models rank fairly instead of losing to a CPU-bound 4B that
    /// will take 30s to first-token.
    #[cfg(all(target_os = "macos", target_arch = "aarch64", not(car_skip_mlx)))]
    const HAS_GPU_BACKEND: bool = true;
    #[cfg(not(all(target_os = "macos", target_arch = "aarch64", not(car_skip_mlx))))]
    const HAS_GPU_BACKEND: bool = false;
    /// Extra bonus for MLX models on Apple Silicon (stacks with LOCAL_BONUS).
    /// MLX gets fused Metal kernels and better memory layout vs Candle on Mac.
    #[cfg(all(target_os = "macos", target_arch = "aarch64", not(car_skip_mlx)))]
    const MLX_BONUS: f64 = 0.10;
    /// Extra bonus for system-owned models — no model file on disk, no
    /// download to provision, framework-managed memory. Currently only
    /// `apple/foundation:default` qualifies. Tag-driven so future
    /// system-LLM integrations (e.g. Android AICore) inherit the
    /// scoring without a code change. Stacks with LOCAL_BONUS but
    /// **excludes** MLX_BONUS (system models aren't MLX); the net
    /// effect is FoundationModels ranks roughly even with a
    /// well-warmed MLX 4B for short fast-turn tasks instead of
    /// strictly losing because its catalog `tokens_per_second` /
    /// `size_mb` are null.
    const SYSTEM_LLM_BONUS: f64 = 0.12;

    // --- Internal phases ---

    /// Phase 1: Filter by hard constraints.
    fn filter_candidates(
        &self,
        required_caps: &[ModelCapability],
        registry: &UnifiedRegistry,
        tracker: &OutcomeTracker,
    ) -> Vec<ModelSchema> {
        registry
            .list()
            .into_iter()
            .filter(|m| {
                // Must have all required capabilities
                if !required_caps.iter().all(|c| m.has_capability(*c)) {
                    return false;
                }
                // Must be available (downloaded for local, API key set for remote)
                if !m.available {
                    return false;
                }
                // Local models must fit in memory (strict: >= excludes models at the limit)
                if m.is_local() && m.size_mb() >= self.hw.max_model_mb {
                    return false;
                }
                // Hard latency constraint
                if let Some(max) = self.config.max_latency_ms {
                    if let Some(p50) = m.performance.latency_p50_ms {
                        if p50 > max {
                            return false;
                        }
                    }
                }
                // Hard cost constraint
                if let Some(max) = self.config.max_cost_usd {
                    if m.cost_per_1k_output() > max {
                        return false;
                    }
                }
                // Hard exclusion: prefer_local=false excludes all local models (#12)
                if !self.config.prefer_local && m.is_local() {
                    return false;
                }
                // Hard exclusion: rate-limited models excluded for this session (#13)
                if tracker.is_excluded(&m.id) {
                    return false;
                }
                // Circuit breaker: skip models with consecutive failures (#25)
                if let Ok(mut cb) = self.circuit_breakers.lock() {
                    if !cb.allow_request(&m.id) {
                        tracing::debug!(model = %m.id, "skipped by circuit breaker");
                        return false;
                    }
                }
                true
            })
            .cloned()
            .collect()
    }

    fn apply_quality_first_bootstrap_policy(
        &self,
        candidates: Vec<ModelSchema>,
        task: InferenceTask,
        tracker: &OutcomeTracker,
        has_vision: bool,
        has_tools: bool,
        workload: RoutingWorkload,
    ) -> Vec<ModelSchema> {
        if !self.config.quality_first_cold_start
            || !workload.is_latency_sensitive()
            || !self.is_quality_critical_bootstrap_task(task, has_vision, has_tools)
        {
            return candidates;
        }

        let trusted_remote: Vec<ModelSchema> = candidates
            .iter()
            .filter(|model| self.is_trusted_quality_remote(model))
            .cloned()
            .collect();

        if trusted_remote.is_empty() {
            return candidates;
        }

        let proven_local: Vec<ModelSchema> = candidates
            .iter()
            .filter(|model| {
                model.is_local() && self.is_local_model_proven_for_task(model, task, tracker)
            })
            .cloned()
            .collect();

        if !proven_local.is_empty() {
            return proven_local;
        }

        trusted_remote
    }

    /// Phase 2: Score candidates with context awareness.
    /// Applies a headroom bonus to models with more context window for the prompt.
    /// When estimated_total_tokens is 0, no context bonus/penalty is applied.
    fn score_candidates_context_aware(
        &self,
        candidates: &[ModelSchema],
        task: InferenceTask,
        tracker: &OutcomeTracker,
        estimated_total_tokens: usize,
        workload: RoutingWorkload,
    ) -> Vec<(String, f64)> {
        let mut scored: Vec<(String, f64)> = candidates
            .iter()
            .map(|m| {
                let base_score = self.score_model(m, task, tracker, workload);
                // Context headroom bonus: prefer models with more room to spare.
                // Max bonus: 0.10 (at 4x headroom or more). No bonus if unknown.
                let headroom_bonus = if estimated_total_tokens > 0 && m.context_length > 0 {
                    let ratio = m.context_length as f64 / estimated_total_tokens as f64;
                    if ratio >= 1.0 {
                        (ratio.min(4.0) - 1.0) / 3.0 * 0.10 // 0.0 at exact fit, 0.10 at 4x
                    } else {
                        -0.15 // Penalty for models that require compaction
                    }
                } else {
                    0.0
                };
                (m.id.clone(), base_score + headroom_bonus)
            })
            .collect();

        scored.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
        scored
    }

    /// Score a single model. All sub-scores are in [0.0, 1.0].
    /// Final score = weighted sum plus stacked bonuses (local, workload, MLX,
    /// vLLM-MLX, system-LLM), so it can exceed 1.0; Phase 3 clamps it before
    /// using it as a prior.
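    ///
    /// Worked example with the `RoutingConfig` default weights (0.45, 0.40, 0.15)
    /// and hypothetical sub-scores quality 0.7, latency 0.8, cost 1.0:
    /// 0.45*0.7 + 0.40*0.8 + 0.15*1.0 = 0.785 before bonuses.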
    fn score_model(
        &self,
        model: &ModelSchema,
        task: InferenceTask,
        tracker: &OutcomeTracker,
        workload: RoutingWorkload,
    ) -> f64 {
        let profile = tracker.profile(&model.id);
        let schema_quality = self.schema_quality_estimate(model);
        let schema_latency = self.schema_latency_estimate(model);
        let (quality_weight, latency_weight, cost_weight) = workload.weights();

        // Quality: blend schema estimate with observed data based on observation count.
        // Both cold start and warm start use consistent blending.
        let quality = match profile {
            Some(p) if p.total_calls >= self.config.min_observations => p
                .task_stats(task)
                .map(|ts| ts.ema_quality)
                .unwrap_or(p.ema_quality),
            Some(p) if p.total_calls == 0 => p
                .task_stats(task)
                .map(|ts| ts.ema_quality)
                .unwrap_or(p.ema_quality),
            Some(p) if p.total_calls > 0 => {
                let w = p.total_calls as f64 / self.config.min_observations as f64;
                schema_quality * (1.0 - w) + p.ema_quality * w
            }
            _ => schema_quality,
        };

        // Latency: same blending as quality — don't trust a single observation more
        // than schema estimates. This prevents routing oscillation on first few calls.
        let latency = match profile {
            Some(p) if p.total_calls >= self.config.min_observations => {
                let avg = p
                    .task_stats(task)
                    .filter(|ts| ts.calls > 0 || ts.avg_latency_ms > 0.0)
                    .map(|ts| ts.avg_latency_ms)
                    .unwrap_or_else(|| p.avg_latency_ms());
                self.latency_ms_to_score(avg)
            }
            Some(p) if p.total_calls == 0 => p
                .task_stats(task)
                .filter(|ts| ts.avg_latency_ms > 0.0)
                .map(|ts| self.latency_ms_to_score(ts.avg_latency_ms))
                .unwrap_or(schema_latency),
            Some(p) if p.total_calls > 0 => {
                let observed = self.latency_ms_to_score(
                    p.task_stats(task)
                        .filter(|ts| ts.calls > 0 || ts.avg_latency_ms > 0.0)
                        .map(|ts| ts.avg_latency_ms)
                        .unwrap_or_else(|| p.avg_latency_ms()),
                );
                let w = p.total_calls as f64 / self.config.min_observations as f64;
                schema_latency * (1.0 - w) + observed * w
            }
            _ => schema_latency,
        };

        // Cost score (lower is better → invert)
        let cost = if model.is_local() {
            1.0
        } else {
            (1.0 - (model.cost_per_1k_output() / Self::COST_CEILING_PER_1K)).clamp(0.0, 1.0)
        };

        // Suppress LOCAL_BONUS on hosts without a GPU/NPU backend.
        // Without it, an Intel Mac or a `car_skip_mlx` build would
        // pick a CPU-bound local 4B over a comparable cloud model
        // every time, then surprise the user with 30s+ first-token
        // latency. Cloud loses on cost/privacy in normal scoring;
        // dropping the local bonus lets it win on the latency that
        // actually matters when the GPU isn't there. Tracing emits
        // when this fires so the silent degradation becomes visible.
        let local_bonus = if self.config.prefer_local && model.is_local() && Self::HAS_GPU_BACKEND {
            Self::LOCAL_BONUS
        } else {
            if self.config.prefer_local && model.is_local() && !Self::HAS_GPU_BACKEND {
                tracing::debug!(
                    model = %model.id,
                    "LOCAL_BONUS suppressed: no GPU backend on this host (Intel Mac or car_skip_mlx); cloud models will rank higher"
                );
            }
            0.0
        };
        let workload_local_bonus = if model.is_local() {
            workload.local_bonus()
        } else {
            0.0
        };

        // On Apple Silicon, prefer MLX models over Candle equivalents
        #[cfg(all(target_os = "macos", target_arch = "aarch64", not(car_skip_mlx)))]
        let mlx_bonus = if model.is_mlx() { Self::MLX_BONUS } else { 0.0 };
        #[cfg(not(all(target_os = "macos", target_arch = "aarch64", not(car_skip_mlx))))]
        let mlx_bonus = 0.0;

        // vLLM-MLX bonus: continuous batching gives better multi-agent throughput
        let vllm_mlx_bonus = if model.is_vllm_mlx() {
            Self::LOCAL_BONUS + 0.05
        } else {
            0.0
        };

        // System-LLM bonus: catalog tag-driven so it generalizes beyond
        // FoundationModels. Models tagged `low_latency` AND `private`
        // are zero-cost system-owned LLMs (apple/foundation:default
        // today; AICore-on-Android etc. in the future). They don't
        // appear in `is_mlx()` but deserve to compete with MLX 4B on
        // routing — the catalog's tags carry that intent and the
        // router now honors it.
        let system_llm_bonus = if model.tags.iter().any(|t| t == "low_latency")
            && model.tags.iter().any(|t| t == "private")
        {
            Self::SYSTEM_LLM_BONUS
        } else {
            0.0
        };

        quality_weight * quality
            + latency_weight * latency
            + cost_weight * cost
            + local_bonus
            + workload_local_bonus
            + mlx_bonus
            + vllm_mlx_bonus
            + system_llm_bonus
    }

    /// Convert latency in ms to a [0, 1] score. Used by both schema and observed paths
    /// so the scales are consistent (fixes Linus review issue #2).
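    /// For example, 2500 ms scores 1.0 - 2500/10000 = 0.75; anything at or
    /// above 10_000 ms clamps to 0.0.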
    fn latency_ms_to_score(&self, ms: f64) -> f64 {
        (1.0 - (ms / Self::LATENCY_CEILING_MS)).clamp(0.0, 1.0)
    }

    /// Convert TPS to estimated latency in ms (for a typical 200-token response).
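    /// e.g. 50 tps gives (200 / 50) * 1000 = 4000 ms.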
    fn tps_to_latency_ms(tps: f64) -> f64 {
        if tps <= 0.0 {
            return Self::LATENCY_CEILING_MS;
        }
        // Assume ~200 tokens per response as baseline
        (200.0 / tps) * 1000.0
    }

    /// Detect whether a prompt likely needs multiple tool calls in a single response.
    /// Looks for numbered lists, multiple explicit instructions, multi-edit patterns.
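    /// e.g. "1) add a test 2) update the README" trips the numbered-list check.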
    fn needs_multi_tool_call(prompt: &str) -> bool {
        let lower = prompt.to_lowercase();

        // Numbered list patterns: "1) ... 2) ..." or "1. ... 2. ..."
        let has_numbered_list = {
            let mut count = 0u32;
            for i in 1..=5u32 {
                if lower.contains(&format!("{}) ", i)) || lower.contains(&format!("{}. ", i)) {
                    count += 1;
                }
            }
            count >= 2
        };

        // Explicit multi-action keywords
        let multi_keywords = [
            "multiple edits",
            "several changes",
            "three changes",
            "two changes",
            "all of the following",
            "each of these",
            "do both",
            "do all",
            "and also",
            "additionally",
            "as well as",
            "then also",
        ];
        let has_multi_keywords = multi_keywords.iter().any(|kw| lower.contains(kw));

        // Bullet point lists with action verbs
        let bullet_actions = lower.matches("- add ").count()
            + lower.matches("- update ").count()
            + lower.matches("- change ").count()
            + lower.matches("- remove ").count()
            + lower.matches("- fix ").count()
            + lower.matches("- edit ").count()
            + lower.matches("- implement ").count()
            + lower.matches("- create ").count();
        let has_bullet_list = bullet_actions >= 2;

        has_numbered_list || has_multi_keywords || has_bullet_list
    }

    /// Schema-based quality estimate (cold start).
    ///
    /// Diminishing returns on model size — the jump from 4B to 8B matters,
    /// but 8B to 30B is marginal. Remote models get a conservative estimate.
    fn schema_quality_estimate(&self, model: &ModelSchema) -> f64 {
        match model.size_mb() {
            0 => 0.5,             // remote: unknown, conservative
            s if s < 1000 => 0.4, // 0.6B
            s if s < 2000 => 0.5, // 1.7B
            s if s < 3000 => 0.6, // 4B
            s if s < 6000 => 0.7, // 8B
            _ => 0.75,            // 30B+: diminishing returns
        }
    }

    /// Schema-based latency estimate (cold start).
    ///
    /// Converts declared TPS/p50 to the same ms-based score used by observed data,
    /// so there's no discontinuity when the first observation arrives.
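    ///
    /// e.g. a Candle MoE declaring 100 tps is derated to 10 tps, i.e. roughly
    /// 20_000 ms for a 200-token response, which clamps to a 0.0 latency score.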
    fn schema_latency_estimate(&self, model: &ModelSchema) -> f64 {
        let is_moe = model.tags.contains(&"moe".to_string());

        if model.is_local() {
            if let Some(tps) = model.performance.tokens_per_second {
                let effective_tps = if is_moe {
                    let multiplier = if model.is_mlx() {
                        Self::MLX_MOE_TPS_MULTIPLIER
                    } else {
                        Self::MOE_TPS_MULTIPLIER
                    };
                    tps * multiplier
                } else {
                    tps
                };
                let estimated_ms = Self::tps_to_latency_ms(effective_tps);
                return self.latency_ms_to_score(estimated_ms);
            }
            return 0.5; // local, no declared TPS
        }

        // Remote: use declared p50 latency
        if let Some(p50) = model.performance.latency_p50_ms {
            return self.latency_ms_to_score(p50 as f64);
        }
        0.3 // remote, no declared latency
    }

    fn is_quality_critical_bootstrap_task(
        &self,
        task: InferenceTask,
        has_vision: bool,
        has_tools: bool,
    ) -> bool {
        has_vision
            || has_tools
            || matches!(
                task,
                InferenceTask::Generate | InferenceTask::Code | InferenceTask::Reasoning
            )
    }

    fn is_trusted_quality_remote(&self, model: &ModelSchema) -> bool {
        model.is_remote()
            && matches!(model.provider.as_str(), "openai" | "anthropic" | "google")
            && !model.has_capability(ModelCapability::SpeechToText)
            && !model.has_capability(ModelCapability::TextToSpeech)
    }

    fn is_local_model_proven_for_task(
        &self,
        model: &ModelSchema,
        task: InferenceTask,
        tracker: &OutcomeTracker,
    ) -> bool {
        let Some(profile) = tracker.profile(&model.id) else {
            return false;
        };
        if let Some(task_stats) = profile.task_stats(task) {
            if task_stats.calls >= self.config.bootstrap_min_task_observations
                && task_stats.ema_quality >= self.config.bootstrap_quality_floor
            {
                return true;
            }
        }

        profile.total_calls >= self.config.bootstrap_min_task_observations
            && profile.ema_quality >= self.config.bootstrap_quality_floor
    }

    /// Phase 3: Thompson Sampling selection.
    ///
    /// Each model gets a Beta(alpha, beta) distribution where:
    /// - alpha = prior_successes + observed_successes
    /// - beta = prior_failures + observed_failures
    ///
    /// The Phase 2 score serves as the prior mean, scaled by `prior_strength`.
    /// Models with few observations have wide distributions (natural exploration).
    /// Models with many observations have tight distributions (exploitation).
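    ///
    /// Worked example with `prior_strength = 2.0`: a Phase 2 score of 0.8 plus
    /// 3 observed successes and 1 failure gives alpha = 2.0*0.8 + 3 = 4.6 and
    /// beta = 2.0*0.2 + 1 = 1.4, a posterior mean of about 0.77.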
    fn select_with_thompson_sampling(
        &self,
        scored: &[(String, f64)],
        tracker: &OutcomeTracker,
    ) -> (String, RoutingStrategy) {
        if scored.is_empty() {
            return (String::new(), RoutingStrategy::SchemaBased);
        }

        let mut rng = rand::rng();
        let mut best_sample = f64::NEG_INFINITY;
        let mut best_id = scored[0].0.clone();
        let mut best_strategy = RoutingStrategy::SchemaBased;

        for (id, phase2_score) in scored {
            let profile = tracker.profile(id);
            let prior = self.config.prior_strength;

            // Clamp the Phase 2 score (which can exceed 1.0 after bonuses)
            // to a prior mean in [0, 1]
            let prior_mean = phase2_score.clamp(0.0, 1.0);

            // Prior pseudo-counts from the Phase 2 score
            let prior_alpha = prior * prior_mean;
            let prior_beta = prior * (1.0 - prior_mean);

            // Observed counts
            let (obs_alpha, obs_beta) = match profile {
                Some(p) => (p.success_count as f64, p.fail_count as f64),
                None => (0.0, 0.0),
            };

            // Posterior Beta parameters
            let alpha = (prior_alpha + obs_alpha).max(0.01);
            let beta = (prior_beta + obs_beta).max(0.01);

            // Sample from Beta(alpha, beta); see `sample_beta` for the
            // Gamma-based method used
            let sample = sample_beta(&mut rng, alpha, beta);

            if sample > best_sample {
                best_sample = sample;
                best_id = id.clone();
                best_strategy = match profile {
                    Some(p) if p.total_calls >= self.config.min_observations => {
                        RoutingStrategy::ProfileBased
                    }
                    Some(p) if p.total_calls > 0 => {
                        // Under-tested but has some data — exploration
                        RoutingStrategy::Exploration
                    }
                    _ => RoutingStrategy::SchemaBased,
                };
            }
        }

        (best_id, best_strategy)
    }

    /// Fallback decision when no candidates pass filtering.
    fn cold_start_decision(
        &self,
        complexity: TaskComplexity,
        task: InferenceTask,
        registry: &UnifiedRegistry,
        has_vision: bool,
    ) -> AdaptiveRoutingDecision {
        if has_vision {
            if let Some(model) = registry
                .query_by_capability(ModelCapability::Vision)
                .into_iter()
                .find(|model| model.available && self.is_trusted_quality_remote(model))
                .or_else(|| {
                    registry
                        .query_by_capability(ModelCapability::Vision)
                        .first()
                        .copied()
                })
            {
                return AdaptiveRoutingDecision {
                    model_id: model.id.clone(),
                    model_name: model.name.clone(),
                    task,
                    complexity,
                    reason: format!(
                        "{:?} task → {} (cold start, vision fallback)",
                        complexity, model.name
                    ),
                    strategy: RoutingStrategy::SchemaBased,
                    predicted_quality: 0.5,
                    fallbacks: vec![],
                    context_length: model.context_length,
                    needs_compaction: false,
                };
            }
        }

        if self.config.quality_first_cold_start {
            let required_caps = complexity.required_capabilities();
            if let Some(model) = registry
                .list()
                .into_iter()
                .filter(|model| {
                    model.available
                        && required_caps.iter().all(|cap| model.has_capability(*cap))
                        && self.is_trusted_quality_remote(model)
                })
                .max_by(|a, b| {
                    self.schema_quality_estimate(a)
                        .partial_cmp(&self.schema_quality_estimate(b))
                        .unwrap_or(std::cmp::Ordering::Equal)
                })
            {
                return AdaptiveRoutingDecision {
                    model_id: model.id.clone(),
                    model_name: model.name.clone(),
                    task,
                    complexity,
                    reason: format!(
                        "{:?} task → {} (quality-first cold start)",
                        complexity, model.name
                    ),
                    strategy: RoutingStrategy::SchemaBased,
                    predicted_quality: self.schema_quality_estimate(model),
                    fallbacks: vec![],
                    context_length: model.context_length,
                    needs_compaction: false,
                };
            }
        }

        // Fall back to the old complexity-based defaults
        let model_name = match complexity {
            TaskComplexity::Simple => "Qwen3-0.6B",
            TaskComplexity::Medium => "Qwen3-1.7B",
            TaskComplexity::Code => "Qwen3-4B",
            TaskComplexity::Complex => &self.hw.recommended_model,
        };

        let model_id = registry
            .find_by_name(model_name)
            .map(|m| m.id.clone())
            .unwrap_or_else(|| model_name.to_string());

        let context_length = registry
            .find_by_name(model_name)
            .map(|m| m.context_length)
            .unwrap_or(0);

        AdaptiveRoutingDecision {
            model_id,
            model_name: model_name.to_string(),
            task,
            complexity,
            reason: format!(
                "{:?} task → {} (cold start, no candidates)",
                complexity, model_name
            ),
            strategy: RoutingStrategy::SchemaBased,
            predicted_quality: 0.5,
            fallbacks: vec![],
            context_length,
            needs_compaction: false,
        }
    }
}

/// Map a caller-supplied [`crate::TaskHint`] to the engine's
/// [`InferenceTask`] enum. The intent surface uses the higher-level
/// hint vocabulary; the router operates on InferenceTask. Every
/// TaskHint variant maps to a distinct InferenceTask — variants that
/// would have silently collapsed to `Generate` were cut from the MVP.
fn task_hint_to_inference_task(hint: crate::intent::TaskHint) -> InferenceTask {
    use crate::intent::TaskHint;
    match hint {
        TaskHint::Chat => InferenceTask::Generate,
        TaskHint::Classify => InferenceTask::Classify,
        TaskHint::Reasoning => InferenceTask::Reasoning,
        TaskHint::Code => InferenceTask::Code,
    }
}

/// Sample from a Beta(alpha, beta) distribution.
///
/// Uses the Gamma distribution method: if X ~ Gamma(alpha, 1) and Y ~ Gamma(beta, 1),
/// then X / (X + Y) ~ Beta(alpha, beta).
///
/// For Gamma sampling, uses Marsaglia and Tsang's method for alpha >= 1,
/// and Ahrens-Dieter for alpha < 1.
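///
/// Both parameters are expected to be strictly positive; the caller in
/// select_with_thompson_sampling clamps its posterior parameters to at
/// least 0.01.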
fn sample_beta(rng: &mut impl Rng, alpha: f64, beta: f64) -> f64 {
    let x = sample_gamma(rng, alpha);
    let y = sample_gamma(rng, beta);
    if x + y == 0.0 {
        0.5 // degenerate case
    } else {
        x / (x + y)
    }
}

/// Sample from Gamma(shape, 1) using Marsaglia-Tsang for shape >= 1,
/// with the Ahrens-Dieter boost for shape < 1.
fn sample_gamma(rng: &mut impl Rng, shape: f64) -> f64 {
    if shape < 1.0 {
        // Ahrens-Dieter: Gamma(a) = Gamma(a+1) * U^(1/a)
        let u: f64 = rng.random();
        return sample_gamma(rng, shape + 1.0) * u.powf(1.0 / shape);
    }

    // Marsaglia-Tsang method for shape >= 1
    let d = shape - 1.0 / 3.0;
    let c = 1.0 / (9.0 * d).sqrt();
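    // d and c are the Marsaglia-Tsang constants; candidates of the form
    // d * (1 + c*N)^3 (N standard normal) go through a cheap squeeze
    // test first, then the exact log-acceptance test.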

    loop {
        let x: f64 = loop {
            let n = sample_standard_normal(rng);
            if 1.0 + c * n > 0.0 {
                break n;
            }
        };

        let v = (1.0 + c * x).powi(3);
        let u: f64 = rng.random();

        if u < 1.0 - 0.0331 * x.powi(4) {
            return d * v;
        }
        if u.ln() < 0.5 * x * x + d * (1.0 - v + v.ln()) {
            return d * v;
        }
    }
}

/// Sample from the standard normal N(0,1) using the Box-Muller transform.
fn sample_standard_normal(rng: &mut impl Rng) -> f64 {
    let u1: f64 = rng.random();
    let u2: f64 = rng.random();
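    // Clamp u1 away from 0.0 so the log below cannot hit ln(0) = -inf.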
    (-2.0 * u1.max(1e-300).ln()).sqrt() * (2.0 * std::f64::consts::PI * u2).cos()
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::outcome::InferredOutcome;

    fn test_hw() -> HardwareInfo {
        HardwareInfo {
            os: "macos".into(),
            arch: "aarch64".into(),
            cpu_cores: 10,
            total_ram_mb: 32768,
            gpu_backend: crate::hardware::GpuBackend::Metal,
            gpu_memory_mb: Some(28672),
            gpu_devices: Vec::new(),
            recommended_model: "Qwen3-8B".into(),
            recommended_context: 8192,
            max_model_mb: 18000, // headroom above 30B-A3B's 17000MB
        }
    }

    fn test_registry() -> UnifiedRegistry {
        let tmp = std::path::PathBuf::from("/tmp/car-test-adaptive-router");
        unsafe {
            std::env::set_var("OPENAI_API_KEY", "test-openai-key");
        }
        // Create fake model dirs so the registry marks them as available
        for name in &[
            "Qwen3-0.6B",
            "Qwen3-1.7B",
            "Qwen3-4B",
            "Qwen3-8B",
            "Qwen3-Embedding-0.6B",
        ] {
            let dir = tmp.join(name);
            let _ = std::fs::create_dir_all(&dir);
            let _ = std::fs::write(dir.join("model.gguf"), b"fake");
            let _ = std::fs::write(dir.join("tokenizer.json"), b"{}");
        }
        let mut reg = UnifiedRegistry::new(tmp);
        reg.register(ModelSchema {
            id: "openai/gpt-5.4-mini:latest".into(),
            name: "gpt-5.4-mini".into(),
            provider: "openai".into(),
            family: "gpt-5.4".into(),
            version: "latest".into(),
            capabilities: vec![
                ModelCapability::Generate,
                ModelCapability::Code,
                ModelCapability::Reasoning,
                ModelCapability::ToolUse,
                ModelCapability::MultiToolCall,
                ModelCapability::Vision,
            ],
            context_length: 128_000,
            param_count: "api".into(),
            quantization: None,
            performance: Default::default(),
            cost: Default::default(),
            source: crate::schema::ModelSource::RemoteApi {
                endpoint: "https://api.openai.com/v1".into(),
                api_key_env: "OPENAI_API_KEY".into(),
                api_key_envs: vec![],
                api_version: None,
                protocol: crate::schema::ApiProtocol::OpenAiCompat,
            },
            tags: vec!["trusted-remote".into()],
            supported_params: vec![],
            public_benchmarks: vec![],
            available: true,
        });
        reg
    }

    #[test]
    fn routes_simple_to_trusted_remote_during_cold_start() {
        let router = AdaptiveRouter::new(
            test_hw(),
            RoutingConfig {
                prior_strength: 100.0, // strong prior = exploit Phase 2 scores (deterministic-ish)
                ..Default::default()
            },
        );
        let reg = test_registry();
        let tracker = OutcomeTracker::new();

        let decision = router.route("What is 2+2?", &reg, &tracker);
        assert_eq!(decision.complexity, TaskComplexity::Simple);
        assert_eq!(decision.strategy, RoutingStrategy::SchemaBased);
        // On cold start, quality-critical tasks should stay on trusted remote models.
        let schema = reg
            .find_by_name(&decision.model_name)
            .expect("selected model should exist in registry");
        assert!(
            !schema.is_local(),
            "simple task should route to trusted remote model during cold start"
        );
        assert!(matches!(
            schema.provider.as_str(),
            "openai" | "anthropic" | "google"
        ));
    }

    #[test]
    fn routes_code_to_code_capable_remote_during_cold_start() {
        let router = AdaptiveRouter::new(
            test_hw(),
            RoutingConfig {
                prior_strength: 100.0, // strong prior = exploit Phase 2 scores (deterministic-ish)
                ..Default::default()
            },
        );
        let reg = test_registry();
        let tracker = OutcomeTracker::new();

        let decision = router.route(
            "Fix this function:\n```rust\nfn main() {}\n```",
            &reg,
            &tracker,
        );
        assert_eq!(decision.complexity, TaskComplexity::Code);
        assert_eq!(decision.task, InferenceTask::Code);
        // Must select a code-capable model (not 0.6B, which lacks Code)
        let schema = reg
            .find_by_name(&decision.model_name)
            .expect("model should exist");
        assert!(
            schema.has_capability(ModelCapability::Code),
            "selected model must support Code"
        );
        assert!(!schema.is_local(), "should route to trusted remote model");
    }

    #[test]
    fn routes_images_to_vision_capable_model() {
        let router = AdaptiveRouter::new(
            test_hw(),
            RoutingConfig {
                prior_strength: 100.0,
                ..Default::default()
            },
        );
        let mut reg = test_registry();
        let tracker = OutcomeTracker::new();

        reg.register(ModelSchema {
            id: "mlx-vlm/qwen3-vl-2b:bf16".into(),
            name: "Qwen3-VL-2B-mlx-vlm".into(),
            provider: "qwen".into(),
            family: "qwen3-vl".into(),
            version: "bf16".into(),
            capabilities: vec![
                ModelCapability::Generate,
                ModelCapability::Vision,
                ModelCapability::Grounding,
            ],
            context_length: 262_144,
            param_count: "2B".into(),
            quantization: None,
            performance: Default::default(),
            cost: Default::default(),
            source: crate::schema::ModelSource::Mlx {
                hf_repo: "Qwen/Qwen3-VL-2B-Instruct".into(),
                hf_weight_file: None,
            },
            tags: vec!["vision".into(), "mlx-vlm-cli".into()],
            supported_params: vec![],
            public_benchmarks: vec![],
            available: true,
        });

        let decision = router.route_with_vision("What is in this image?", &reg, &tracker, false);
        let schema = reg
            .find_by_name(&decision.model_name)
            .expect("model should exist");
        assert!(
            schema.has_capability(ModelCapability::Vision),
            "selected model must support Vision"
        );
    }

    #[test]
    fn profile_based_routing_favors_proven_model() {
        let router = AdaptiveRouter::new(
            test_hw(),
            RoutingConfig {
                prior_strength: 0.5, // weak prior, let observed data dominate
                min_observations: 3,
                ..Default::default()
            },
        );
        let reg = test_registry();
        let mut tracker = OutcomeTracker::new();

        // Build a strong profile for Qwen3-8B on code tasks (fast + high quality)
        let qwen_8b_id = "qwen/qwen3-8b:q4_k_m";
        for _ in 0..20 {
            let trace = tracker.record_start(qwen_8b_id, InferenceTask::Code, "test");
            tracker.record_complete(&trace, 500, 100, 50);
            tracker.record_inferred_outcome(&trace, InferredOutcome::Accepted { confidence: 0.95 });
        }

        // Thompson Sampling is stochastic, so route repeatedly; the 8B model should win the majority
        let mut wins = 0;
        for _ in 0..20 {
            let decision = router.route("Fix this bug in the parser", &reg, &tracker);
            assert_eq!(decision.complexity, TaskComplexity::Code);
            if decision.model_id == qwen_8b_id {
                wins += 1;
            }
        }
        assert!(
            wins >= 12,
            "proven model won only {wins}/20 times (expected >= 12)"
        );
    }

    #[test]
    fn proven_local_model_can_displace_bootstrap_remote() {
        let router = AdaptiveRouter::new(
            test_hw(),
            RoutingConfig {
                prior_strength: 100.0,
                bootstrap_min_task_observations: 6,
                bootstrap_quality_floor: 0.8,
                ..Default::default()
            },
        );
        let reg = test_registry();
        let mut tracker = OutcomeTracker::new();

        let qwen_8b_id = "qwen/qwen3-8b:q4_k_m";
        for _ in 0..12 {
            let trace = tracker.record_start(qwen_8b_id, InferenceTask::Generate, "test");
            tracker.record_complete(&trace, 300, 50, 20);
            tracker.record_inferred_outcome(&trace, InferredOutcome::Accepted { confidence: 0.95 });
        }

        let mut local_wins = 0;
        for _ in 0..20 {
            let decision = router.route("Summarize this design decision.", &reg, &tracker);
            let schema = reg
                .get(&decision.model_id)
                .expect("selected model should exist");
            if schema.is_local() {
                local_wins += 1;
            }
        }

        assert!(
            local_wins >= 12,
            "proven local model won only {local_wins}/20 times (expected >= 12)"
        );
    }

    #[test]
    fn benchmark_prior_informs_background_routing() {
        let router = AdaptiveRouter::new(
            test_hw(),
            RoutingConfig {
                prior_strength: 100.0,
                ..Default::default()
            },
        );
        let reg = test_registry();
        let mut tracker = OutcomeTracker::new();
        let mut profile = crate::outcome::ModelProfile::new("qwen/qwen3-8b:q4_k_m".into());
        profile.ema_quality = 0.95;
        tracker.import_profiles(vec![profile]);

        let decision = router.route_context_aware(
            "Write a Python fibonacci function.",
            128,
            &reg,
            &tracker,
            false,
            false,
            RoutingWorkload::Background,
        );

        let schema = reg
            .get(&decision.model_id)
            .expect("selected model should exist");
        assert!(
            schema.is_local(),
            "background routing should allow strong local benchmark priors to win"
        );
    }

    #[test]
    fn task_specific_benchmark_prior_informs_cold_start_routing() {
        let router = AdaptiveRouter::new(
            test_hw(),
            RoutingConfig {
                prior_strength: 100.0,
                ..Default::default()
            },
        );
        let reg = test_registry();
        let mut tracker = OutcomeTracker::new();
        let mut profile = crate::outcome::ModelProfile::new("qwen/qwen3-8b:q4_k_m".into());
        profile.task_stats.insert(
            crate::outcome::InferenceTask::Code.to_string(),
            crate::outcome::TaskStats {
                ema_quality: 0.95,
                ..Default::default()
            },
        );
        tracker.import_profiles(vec![profile]);

        let decision = router.route_context_aware(
            "Write a Python fibonacci function.",
            128,
            &reg,
            &tracker,
            false,
            false,
            RoutingWorkload::Background,
        );

        let schema = reg
            .get(&decision.model_id)
            .expect("selected model should exist");
        assert!(
            schema.is_local(),
            "background routing should use task-specific cold-start priors for local code models"
        );
    }

    #[test]
    fn task_specific_latency_prior_affects_cold_start_score() {
        let router = AdaptiveRouter::new(test_hw(), RoutingConfig::default());
        let reg = test_registry();
        let model = reg
            .get("qwen/qwen3-8b:q4_k_m")
            .expect("local test model should exist");

        let mut fast_tracker = OutcomeTracker::new();
        let mut fast_profile = crate::outcome::ModelProfile::new(model.id.clone());
        fast_profile.task_stats.insert(
            crate::outcome::InferenceTask::Generate.to_string(),
            crate::outcome::TaskStats {
                ema_quality: 0.95,
                avg_latency_ms: 1200.0,
                ..Default::default()
            },
        );
        fast_tracker.import_profiles(vec![fast_profile]);

        let mut slow_tracker = OutcomeTracker::new();
        let mut slow_profile = crate::outcome::ModelProfile::new(model.id.clone());
        slow_profile.task_stats.insert(
            crate::outcome::InferenceTask::Generate.to_string(),
            crate::outcome::TaskStats {
                ema_quality: 0.95,
                avg_latency_ms: 120_000.0,
                ..Default::default()
            },
        );
        slow_tracker.import_profiles(vec![slow_profile]);

        let fast_score = router.score_model(
            model,
            InferenceTask::Generate,
            &fast_tracker,
            RoutingWorkload::Interactive,
        );
        let slow_score = router.score_model(
            model,
            InferenceTask::Generate,
            &slow_tracker,
            RoutingWorkload::Interactive,
        );

        assert!(
            fast_score > slow_score,
            "faster task latency prior should improve cold-start score ({fast_score} <= {slow_score})"
        );
    }

    #[test]
    fn interactive_workload_keeps_remote_bootstrap_bias() {
        let router = AdaptiveRouter::new(
            test_hw(),
            RoutingConfig {
                prior_strength: 100.0,
                ..Default::default()
            },
        );
        let reg = test_registry();
        let mut tracker = OutcomeTracker::new();
        let mut profile = crate::outcome::ModelProfile::new("qwen/qwen3-8b:q4_k_m".into());
        profile.ema_quality = 0.95;
        tracker.import_profiles(vec![profile]);

        let decision = router.route_context_aware(
            "Write a Python fibonacci function.",
            128,
            &reg,
            &tracker,
            false,
            false,
            RoutingWorkload::Interactive,
        );

        let schema = reg
            .get(&decision.model_id)
            .expect("selected model should exist");
        assert!(
            !schema.is_local(),
            "interactive routing should still prefer trusted remote models during cold start"
        );
    }

    #[test]
    fn fallback_chain_has_alternatives() {
        let router = AdaptiveRouter::new(
            test_hw(),
            RoutingConfig {
                prior_strength: 100.0, // strong prior = exploit Phase 2 scores (deterministic-ish)
                ..Default::default()
            },
        );
        let reg = test_registry();
        let tracker = OutcomeTracker::new();

        let decision = router.route("Analyze the architecture trade-offs", &reg, &tracker);
        assert!(!decision.fallbacks.is_empty());
        // Primary should not appear in fallbacks
        assert!(!decision.fallbacks.contains(&decision.model_id));
    }

    #[test]
    fn latency_scoring_is_consistent() {
        // Verify that schema and observed latency produce comparable scores
        let router = AdaptiveRouter::with_default_config(test_hw());

        // A model at 25 TPS → ~200/25*1000 = 8000ms → score = 1 - 8000/10000 = 0.2
        let schema_score = router.latency_ms_to_score(AdaptiveRouter::tps_to_latency_ms(25.0));
        // Same model observed at 8000ms → same formula
        let observed_score = router.latency_ms_to_score(8000.0);
        assert!(
            (schema_score - observed_score).abs() < 0.01,
            "schema ({schema_score}) and observed ({observed_score}) should match"
        );
    }

    #[test]
    fn complexity_assessment() {
        assert_eq!(
            TaskComplexity::assess("What is the capital of France?"),
            TaskComplexity::Simple
        );
        assert_eq!(
            TaskComplexity::assess("Fix this broken test"),
            TaskComplexity::Code
        );
        assert_eq!(
            TaskComplexity::assess("Analyze the trade-offs between A and B"),
            TaskComplexity::Complex
        );
    }

    #[test]
    fn beta_sampling_produces_valid_values() {
        let mut rng = rand::rng();
        // Sample 100 times from Beta(2, 5) — should be in [0, 1]
        for _ in 0..100 {
            let s = sample_beta(&mut rng, 2.0, 5.0);
            assert!((0.0..=1.0).contains(&s), "sample {s} out of [0,1] range");
        }
        // Beta(1, 1) = Uniform(0, 1) — mean should be ~0.5
        let samples: Vec<f64> = (0..1000).map(|_| sample_beta(&mut rng, 1.0, 1.0)).collect();
        let mean = samples.iter().sum::<f64>() / samples.len() as f64;
        assert!(
            (mean - 0.5).abs() < 0.05,
            "Beta(1,1) mean {mean} should be ~0.5"
        );
    }
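
    // Companion sanity checks for the samplers above, relying only on
    // textbook moments: Gamma(k, 1) has mean k, and N(0, 1) has mean 0
    // and variance 1. Tolerances are deliberately loose so these
    // stochastic tests stay stable across runs.
    #[test]
    fn gamma_sampling_mean_tracks_shape() {
        let mut rng = rand::rng();
        for &shape in &[0.5, 1.0, 4.0] {
            let n = 10_000;
            let mean = (0..n).map(|_| sample_gamma(&mut rng, shape)).sum::<f64>() / n as f64;
            assert!(
                (mean - shape).abs() < 0.2,
                "Gamma({shape},1) sample mean {mean} should be ~{shape}"
            );
        }
    }

    #[test]
    fn standard_normal_has_unit_moments() {
        let mut rng = rand::rng();
        let samples: Vec<f64> = (0..10_000)
            .map(|_| sample_standard_normal(&mut rng))
            .collect();
        let mean = samples.iter().sum::<f64>() / samples.len() as f64;
        let var = samples.iter().map(|s| (s - mean) * (s - mean)).sum::<f64>()
            / samples.len() as f64;
        assert!(mean.abs() < 0.1, "N(0,1) mean {mean} should be ~0");
        assert!((var - 1.0).abs() < 0.1, "N(0,1) variance {var} should be ~1");
    }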

    #[test]
    fn thompson_sampling_converges_to_best() {
        // A model with strong observed success should win most of the time
        let router = AdaptiveRouter::new(
            test_hw(),
            RoutingConfig {
                prior_strength: 1.0, // weak prior, let observations dominate
                ..Default::default()
            },
        );
        let reg = test_registry();
        let mut tracker = OutcomeTracker::new();

        // Give Qwen3-4B 20 successes (strong signal)
        let qwen_4b_id = "qwen/qwen3-4b:q4_k_m";
        for _ in 0..20 {
            let trace = tracker.record_start(qwen_4b_id, InferenceTask::Code, "test");
            tracker.record_complete(&trace, 500, 100, 50);
            tracker.record_inferred_outcome(&trace, InferredOutcome::Accepted { confidence: 0.95 });
        }

        // Route 20 code tasks — strong model should win the majority
        let mut wins = 0;
        for _ in 0..20 {
            let decision = router.route("Fix this parser bug", &reg, &tracker);
            if decision.model_id == qwen_4b_id {
                wins += 1;
            }
        }
        assert!(
            wins >= 14,
            "strong model won only {wins}/20 times (expected >= 14)"
        );
    }
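
    // Directly exercises the empty-candidate guard in
    // select_with_thompson_sampling: no candidates must yield an empty
    // id and the schema-based fallback strategy rather than a panic.
    #[test]
    fn thompson_sampling_handles_empty_candidates() {
        let router = AdaptiveRouter::new(test_hw(), RoutingConfig::default());
        let tracker = OutcomeTracker::new();
        let (id, strategy) = router.select_with_thompson_sampling(&[], &tracker);
        assert!(id.is_empty());
        assert_eq!(strategy, RoutingStrategy::SchemaBased);
    }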

    // ----- Intent surface (parslee-ai/car-releases#18) -----

    #[test]
    fn intent_require_filters_out_models_lacking_capability() {
        // Asking for vision when no candidate has it should fall to
        // the cold-start decision rather than scoring incompatible
        // candidates. The fixture registry has no vision-capable
        // local models.
        let router = AdaptiveRouter::new(test_hw(), RoutingConfig::default());
        let reg = test_registry();
        let tracker = OutcomeTracker::new();

        let intent = crate::intent::IntentHint {
            require: vec![ModelCapability::Vision],
            ..Default::default()
        };
        let decision = router.route_with_intent("hello", &reg, &tracker, &intent);

        // When require filters out every candidate, the router falls
        // to the schema-based cold-start path. Asserting the strategy
        // (not just non-empty model_id) catches a regression where
        // future code might silently include filtered candidates.
        assert_eq!(
            decision.strategy,
            RoutingStrategy::SchemaBased,
            "require=[vision] with no vision-capable candidates must drop to schema cold-start"
        );
    }

    #[test]
    fn intent_default_does_not_override_task_or_caps() {
        // Thompson sampling makes per-call model selection
        // non-deterministic, so we can't compare model_ids directly.
        // What we can assert deterministically: a default IntentHint
        // must not change the task selection or the capability
        // requirements — those are functions of the prompt only when
        // no hint is supplied.
        let router = AdaptiveRouter::new(test_hw(), RoutingConfig::default());
        let reg = test_registry();
        let tracker = OutcomeTracker::new();

        let baseline = router.route("write a haiku", &reg, &tracker);
        let with_default = router.route_with_intent(
            "write a haiku",
            &reg,
            &tracker,
            &crate::intent::IntentHint::default(),
        );

        assert_eq!(
            baseline.task, with_default.task,
            "default IntentHint must not change the prompt-derived task"
        );
        assert_eq!(
            baseline.complexity, with_default.complexity,
            "default IntentHint must not change the prompt-derived complexity"
        );
    }

    #[test]
    fn intent_task_hint_overrides_prompt_complexity() {
        // A short prompt whose prompt-derived task would otherwise be
        // Generate should land on Reasoning when the intent says so.
        let router = AdaptiveRouter::new(test_hw(), RoutingConfig::default());
        let reg = test_registry();
        let tracker = OutcomeTracker::new();

        let hint = crate::intent::IntentHint {
            task: Some(crate::intent::TaskHint::Reasoning),
            ..Default::default()
        };
        let decision = router.route_with_intent("hi", &reg, &tracker, &hint);

        assert_eq!(
            decision.task,
            InferenceTask::Reasoning,
            "TaskHint::Reasoning should override the prompt-derived task"
        );
    }
}