// car-inference 0.22.0

//! Caller-facing routing intent — express requirements, not model IDs.
//!
//! Tracks Parslee-ai/car-releases#18. The motivation is that callers
//! today choose between two extremes:
//!
//! - `model = None` → the adaptive router picks. Quality on average is
//!   good but per-request variability surfaces as UX inconsistency.
//! - `model = Some("claude-sonnet-4-7")` → the caller pins. Provider
//!   awareness leaks up the stack — exactly what CAR is supposed to
//!   prevent.
//!
//! `IntentHint` is the middle ground. The caller expresses *what* they
//! need; the router resolves intent → model. Existing `model = None`
//! and `model = Some(...)` paths are unchanged when no intent is
//! supplied.
//!
//! ## MVP scope
//!
//! Just `task`, `require`, `prefer_local`, `prefer_fast`. Cost/latency ceilings wait
//! for clean registry numbers; `prefer_family` was cut as a soft
//! routing knob that accumulates tweaks without clear semantics
//! (Linus design review, 2026-05-04).
//!
//! ## Routing semantics
//!
//! `prefer_local: true` maps to a dedicated
//! [`crate::RoutingWorkload::LocalPreferred`] variant. Distinct from
//! `Background` (which is "this is a background job, latency barely
//! matters") — `LocalPreferred` keeps a quality-aware weight profile
//! and a strong local_bonus so the hint wins ties decisively.

use serde::{Deserialize, Serialize};

use crate::schema::ModelCapability;

/// What the caller is doing — coarse-grained categories the adaptive
/// router maps to `InferenceTask`. A closed enum so adding a new task
/// type is a deliberate FFI-visible change rather than a silent
/// fallback when the router doesn't recognize a string.
///
/// The MVP intentionally ships only the variants that map to a
/// distinct `InferenceTask` today. `Summarize` / `Extract` were cut
/// because both would have collapsed to `Generate` with no observable
/// behavior change — shipping enum variants that are accepted, parsed,
/// and silently discarded is exactly the routing variability the
/// intent surface is designed to remove. Add them back when the
/// registry actually distinguishes summarize-tuned or extract-tuned
/// models.
///
/// On the wire (serde) each variant is its `snake_case` name:
/// `"chat"`, `"classify"`, `"reasoning"`, `"code"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum TaskHint {
    /// Conversational chat — maps to `InferenceTask::Generate`.
    Chat,
    /// Label assignment / categorization. Maps to
    /// `InferenceTask::Classify`.
    Classify,
    /// Chain-of-thought, planning, multi-step analysis. Maps to
    /// `InferenceTask::Reasoning` and tends to favor frontier
    /// reasoning models.
    Reasoning,
    /// Code generation, repair, refactoring. Maps to
    /// `InferenceTask::Code`.
    Code,
}

/// Caller-supplied routing intent. All fields are optional / additive.
/// An `IntentHint` with default values matches the no-intent path
/// exactly, so threading `Option<IntentHint>` through is safe.
///
/// Every field is `#[serde(default)]` and skipped when it holds its
/// default value, so a default hint serializes to `{}` and `{}`
/// deserializes back to a default hint.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct IntentHint {
    /// What the caller is doing. None = let the router infer from the
    /// prompt as today.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub task: Option<TaskHint>,

    /// Hard filter — every required capability must be present on the
    /// candidate. Empty = no extra filter.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub require: Vec<ModelCapability>,

    /// Bias the score profile toward local models (cost over quality).
    /// Internally this maps to `RoutingWorkload::Background` until the
    /// follow-up split lands (parslee-ai/car#106).
    ///
    /// NOTE(review): the module-level docs of this file state that
    /// `prefer_local` maps to a dedicated
    /// `RoutingWorkload::LocalPreferred` variant that is distinct from
    /// `Background` and keeps a quality-aware weight profile. That
    /// contradicts the paragraph above; one of the two is stale —
    /// confirm against the router and reconcile.
    ///
    /// Ignored in favour of `prefer_fast` when both flags are set (see
    /// the `prefer_fast` field).
    #[serde(default, skip_serializing_if = "is_false")]
    pub prefer_local: bool,

    /// Bias the score profile aggressively toward latency. Maps to
    /// [`crate::tasks::RoutingWorkload::Fastest`] — a weight profile
    /// that downweights quality and cost in favour of time-to-first-token.
    /// Designed for voice turns where a sub-500ms first-audio target
    /// beats a richer-but-slower answer. Takes precedence over
    /// `prefer_local`; if both are set, the request is routed by
    /// `Fastest` rules.
    #[serde(default, skip_serializing_if = "is_false")]
    pub prefer_fast: bool,
}

/// Serde `skip_serializing_if` predicate: reports `true` when the flag
/// is unset, so `false` booleans are left out of the serialized output.
///
/// Takes `&bool` (not `bool`) because serde passes the field by
/// reference to `skip_serializing_if` functions.
fn is_false(b: &bool) -> bool {
    let flag_is_set = *b;
    !flag_is_set
}

// ---------------------------------------------------------------------------
// Acquisition intent — which model to *recommend/install* for this machine.
//
// Distinct layer from `TaskHint`/`IntentHint` above, which are *inference-time*
// routing intent (which already-installed model should serve this request).
// The types below answer the earlier question: "given this hardware and what
// the user wants to do, which model should they acquire?" They are consumed by
// `ModelRecommender` (see docs/solutions/first-class-model-ux.md) and never
// expose model IDs, quantization, or HF repos to the caller.
// ---------------------------------------------------------------------------

/// The kind of model a use case needs. The recommender ranks only *within*
/// a role's lane — an embedding model and a chat model are not comparable,
/// so a retrieval pick never competes with a generative one. A use case that
/// spans roles resolves to a bundle (one recommendation per role).
///
/// Each [`UseCase`] resolves to exactly one role via [`UseCase::role`].
/// On the wire (serde) the variants are `"generative"`, `"retrieval"`,
/// `"audio"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum UseCaseRole {
    /// Produces text/tokens (chat, code, vision-to-text, summarize).
    Generative,
    /// Produces vectors / relevance scores (embeddings, rerank).
    Retrieval,
    /// Consumes audio (transcription).
    Audio,
}

/// What the user wants to do, in their terms — not a model ID. Closed enum:
/// adding a use case is a deliberate FFI-visible change, never a silent
/// string fallback. Each variant maps to a [`UseCaseRole`] and a required /
/// preferred [`ModelCapability`] set (see [`UseCase::required_capabilities`]).
///
/// `Default` is [`UseCase::Assistant`]. On the wire (serde) each variant is
/// its `snake_case` name, e.g. `"coding"` or `"search"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum UseCase {
    /// General chat / Q&A. The default.
    Assistant,
    /// Code generation, repair, refactoring.
    Coding,
    /// Text condensation.
    Summarize,
    /// Image understanding (a generative model that also sees).
    Vision,
    /// Audio → text.
    Transcription,
    /// Semantic search — an embedding model for retrieval. NOT an LLM
    /// performing web search with tools; this is the `Retrieval` role and
    /// maps to the `Embed` capability, so it is ranked separately from any
    /// generative chat model.
    Search,
}

impl UseCase {
    /// The role lane this use case is ranked within.
    pub fn role(self) -> UseCaseRole {
        match self {
            UseCase::Assistant
            | UseCase::Coding
            | UseCase::Summarize
            | UseCase::Vision => UseCaseRole::Generative,
            UseCase::Search => UseCaseRole::Retrieval,
            UseCase::Transcription => UseCaseRole::Audio,
        }
    }

    /// Hard eligibility filter — a model missing any of these is excluded.
    pub fn required_capabilities(self) -> &'static [ModelCapability] {
        use ModelCapability::*;
        match self {
            UseCase::Assistant => &[Generate],
            UseCase::Coding => &[Generate, Code],
            UseCase::Summarize => &[Generate],
            UseCase::Vision => &[Vision, Generate],
            UseCase::Transcription => &[SpeechToText],
            UseCase::Search => &[Embed],
        }
    }

    /// Soft preference — present capabilities add a ranking bonus but are
    /// never required for eligibility.
    pub fn preferred_capabilities(self) -> &'static [ModelCapability] {
        use ModelCapability::*;
        match self {
            UseCase::Assistant => &[ToolUse],
            UseCase::Coding => &[ToolUse, Reasoning],
            UseCase::Summarize => &[Summarize],
            UseCase::Vision => &[],
            UseCase::Transcription => &[],
            UseCase::Search => &[Rerank],
        }
    }
}

impl Default for UseCase {
    fn default() -> Self {
        UseCase::Assistant
    }
}

/// Speed/quality knob. Each tier is a fixed weighting over the recommender's
/// soft-score axes, applied *after* the hard eligibility filter, so tier
/// semantics are explicit rather than reinvented per call site.
///
/// The concrete weighting for a tier comes from [`QualityTier::weights`].
/// `Default` is [`QualityTier::Balanced`]. On the wire (serde) the variants
/// are `"fastest"`, `"balanced"`, `"most_capable"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum QualityTier {
    /// Smallest eligible model; lowest latency.
    Fastest,
    /// Best quality that fits with KV-cache headroom. The default.
    Balanced,
    /// Largest model that fits at all; accepts slower output.
    MostCapable,
}

impl Default for QualityTier {
    fn default() -> Self {
        QualityTier::Balanced
    }
}

/// Relative weights a [`QualityTier`] places on each soft-score axis. The
/// recommender normalizes each axis to `[0,1]` and combines them with these.
///
/// The weights returned by [`QualityTier::weights`] for the built-in tiers
/// are all non-negative and sum to 1.0. This type itself does not enforce
/// that — the fields are public and unvalidated.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct TierWeights {
    /// Toward higher-quality models (benchmarks / param-count prior).
    pub quality: f32,
    /// Toward lower-latency models (smaller, better-accelerated).
    pub latency: f32,
    /// Toward leaving memory headroom (smaller fraction of budget used).
    pub memory_pressure: f32,
}

impl QualityTier {
    /// Returns this tier's fixed weighting over the soft-score axes.
    ///
    /// The numbers mirror the table in
    /// docs/solutions/first-class-model-ux.md; each row is
    /// `(quality, latency, memory_pressure)` and sums to 1.0.
    pub fn weights(self) -> TierWeights {
        let (quality, latency, memory_pressure) = match self {
            Self::Fastest => (0.2, 0.6, 0.2),
            Self::Balanced => (0.5, 0.2, 0.3),
            Self::MostCapable => (0.8, 0.0, 0.2),
        };
        TierWeights {
            quality,
            latency,
            memory_pressure,
        }
    }
}

/// Where the user is willing to run inference. Orthogonal to [`QualityTier`].
/// Choosing the cloud is never silent — `CloudOk` only makes remote models
/// *eligible*; the recommender still flags them as requiring one-time consent.
///
/// `Default` is [`Privacy::OnDevice`]. On the wire (serde) the variants are
/// `"on_device"` and `"cloud_ok"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum Privacy {
    /// Local models only.
    OnDevice,
    /// Remote APIs / the Parslee gateway may compete and win.
    CloudOk,
}

impl Default for Privacy {
    fn default() -> Self {
        Privacy::OnDevice
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn empty_intent_serializes_compactly() {
        // The FFI layer ships hints as JSON. A no-intent hint must come
        // out as an empty object rather than a verbose
        // {"task":null,"require":[],"prefer_local":false,"prefer_fast":false}.
        let encoded = serde_json::to_string(&IntentHint::default()).unwrap();
        assert_eq!(encoded, "{}");
    }

    #[test]
    fn round_trip_with_capability_require() {
        let expected_caps = [ModelCapability::Code, ModelCapability::ToolUse];
        let sent = IntentHint {
            task: Some(TaskHint::Code),
            require: expected_caps.to_vec(),
            prefer_local: true,
            prefer_fast: false,
        };

        let wire = serde_json::to_string(&sent).unwrap();
        let received: IntentHint = serde_json::from_str(&wire).unwrap();

        assert_eq!(received.task, Some(TaskHint::Code));
        assert_eq!(received.require, expected_caps);
        assert!(received.prefer_local);
        assert!(!received.prefer_fast);
    }

    #[test]
    fn missing_fields_default_cleanly() {
        // Clients that predate IntentHint may send partial JSON; every
        // absent field must fall back to the no-intent default.
        let parsed: IntentHint = serde_json::from_str("{}").unwrap();
        let IntentHint {
            task,
            require,
            prefer_local,
            prefer_fast,
        } = parsed;

        assert_eq!(task, None);
        assert!(require.is_empty());
        assert!(!(prefer_local || prefer_fast));
    }

    #[test]
    fn prefer_fast_round_trips_and_skips_when_false() {
        // Unset flag: omitted from the output entirely.
        let disabled = serde_json::to_string(&IntentHint::default()).unwrap();
        assert_eq!(disabled, "{}");

        // Set flag: present on the wire and restored on the way back.
        let enabled = IntentHint {
            prefer_fast: true,
            ..IntentHint::default()
        };
        let wire = serde_json::to_string(&enabled).unwrap();
        assert!(wire.contains("prefer_fast"));
        let restored: IntentHint = serde_json::from_str(&wire).unwrap();
        assert!(restored.prefer_fast);
    }

    // --- acquisition intent (UseCase / QualityTier / Privacy) ---

    #[test]
    fn use_case_defaults_to_assistant_and_balanced_on_device() {
        let defaults = (
            UseCase::default(),
            QualityTier::default(),
            Privacy::default(),
        );
        assert_eq!(
            defaults,
            (UseCase::Assistant, QualityTier::Balanced, Privacy::OnDevice)
        );
    }

    #[test]
    fn coding_requires_both_generate_and_code() {
        // Regression guard from the design review: a coding model that
        // cannot generate is useless, so `Code` alone must not suffice.
        let required = UseCase::Coding.required_capabilities();
        for needed in [ModelCapability::Generate, ModelCapability::Code] {
            assert!(required.contains(&needed));
        }
    }

    #[test]
    fn search_is_a_retrieval_role_not_generative() {
        // Search must never be ranked against chat models.
        let expected_roles = [
            (UseCase::Search, UseCaseRole::Retrieval),
            (UseCase::Assistant, UseCaseRole::Generative),
            (UseCase::Transcription, UseCaseRole::Audio),
        ];
        for (use_case, role) in expected_roles {
            assert_eq!(use_case.role(), role);
        }
        assert_eq!(
            UseCase::Search.required_capabilities(),
            &[ModelCapability::Embed]
        );
    }

    #[test]
    fn required_and_preferred_are_disjoint() {
        // Listing a capability as both required and preferred would
        // double-count it in scoring.
        let every_use_case = [
            UseCase::Assistant,
            UseCase::Coding,
            UseCase::Summarize,
            UseCase::Vision,
            UseCase::Transcription,
            UseCase::Search,
        ];
        for uc in every_use_case {
            let required = uc.required_capabilities();
            for p in uc.preferred_capabilities() {
                let clashes = required.contains(p);
                assert!(!clashes, "{uc:?}: {p:?} is both required and preferred");
            }
        }
    }

    #[test]
    fn tier_weights_match_documented_table() {
        let TierWeights {
            quality,
            latency,
            memory_pressure,
        } = QualityTier::Balanced.weights();
        assert_eq!((quality, latency, memory_pressure), (0.5, 0.2, 0.3));

        let fastest = QualityTier::Fastest.weights();
        assert!(fastest.latency > fastest.quality, "Fastest must favor latency");

        let most_capable = QualityTier::MostCapable.weights();
        assert!(
            most_capable.quality > most_capable.latency,
            "MostCapable must favor quality"
        );
    }

    #[test]
    fn tier_weights_are_non_negative_and_sum_to_one() {
        // Protects against an edit that lets one axis dominate by making
        // the weights sum to something other than 1.0. Floats are
        // compared with an epsilon.
        let tiers = [
            QualityTier::Fastest,
            QualityTier::Balanced,
            QualityTier::MostCapable,
        ];
        for tier in tiers {
            let w = tier.weights();
            for axis in [w.quality, w.latency, w.memory_pressure] {
                assert!(axis >= 0.0, "{tier:?}: negative weight {axis}");
            }
            let sum = w.quality + w.latency + w.memory_pressure;
            let drift = (sum - 1.0).abs();
            assert!(drift < 1e-6, "{tier:?}: weights sum to {sum}, expected 1.0");
        }
    }

    #[test]
    fn use_case_round_trips_snake_case() {
        let encoded = serde_json::to_string(&UseCase::Coding).unwrap();
        assert_eq!(encoded, "\"coding\"");

        let decoded: UseCase = serde_json::from_str("\"search\"").unwrap();
        assert_eq!(decoded, UseCase::Search);
    }
}