// xybrid-sdk 0.1.1

//! Model loading and execution for xybrid-sdk.
//!
//! This module provides:
//! - `ModelLoader`: Preparatory step for loading models (from registry, bundle, or directory)
//! - `XybridModel`: Loaded model ready for inference
//! - `ModelHandle`: Internal state management for the loaded model
//! - `StreamEvent`: Events emitted during streaming inference

use crate::registry_client::RegistryClient;
use crate::result::{InferenceResult, OutputType};
use crate::run_options::{check_abort_for_streaming, AbortState, CancellationToken, RunOptions};
use crate::source::{detect_platform, ModelSource};
use crate::stream::XybridStream;
use std::path::PathBuf;
use std::pin::Pin;
use std::sync::{Arc, OnceLock, RwLock};
use std::time::{Duration, Instant};
use tempfile::TempDir;
use tokio_stream::wrappers::ReceiverStream;
use xybrid_core::conversation::ConversationContext;
use xybrid_core::execution::{
    ExecutionTemplate, ModelMetadata, TemplateExecutor, VoiceConfig, VoiceInfo,
};
use xybrid_core::ir::Envelope;
use xybrid_core::orchestrator::authority::{
    ExecutionOutcome, LocalAuthority, OrchestrationAuthority, OutcomeCategory, PolicyOutcome,
    PolicyRequest, ResolvedTarget, SignalContext, StageContext,
};
use xybrid_core::orchestrator::routing_engine::LocalReliabilityHint;
use xybrid_core::runtime_adapter::types::GenerationConfig;
use xybrid_core::streaming::{StreamConfig as CoreStreamConfig, VadStreamConfig as CoreVadConfig};

/// A token generated during streaming inference.
///
/// This is the SDK-level counterpart of the core `PartialToken`. It is a
/// distinct struct defined in this module (not a re-export of the core type)
/// and is the payload of [`StreamEvent::Token`].
#[derive(Debug, Clone)]
pub struct StreamToken {
    /// The generated token text.
    pub token: String,
    /// The token ID (if available from the model).
    pub token_id: Option<i64>,
    /// Index of this token in the generation sequence.
    pub index: usize,
    /// All text generated so far (cumulative).
    pub cumulative_text: String,
    /// Reason for stopping (only set on the final token).
    pub finish_reason: Option<String>,
}

/// Events emitted during streaming inference.
///
/// Use this with `run_stream()` to handle tokens as they're generated.
///
/// The enum derives `Clone`, so every payload is an owned, cloneable value;
/// in particular `Error` carries a message `String` rather than an
/// [`SdkError`].
#[derive(Debug, Clone)]
pub enum StreamEvent {
    /// A token was generated (emitted for each token during LLM inference).
    Token(StreamToken),
    /// Inference completed successfully with the final result.
    Complete(InferenceResult),
    /// An error occurred during inference; the payload is the error message.
    Error(String),
}

/// SDK-level error type.
///
/// The `Display` text of every variant comes from its `#[error(...)]`
/// attribute; tuple payloads are interpolated into that text. Use
/// [`SdkError::is_retryable`] to distinguish transient from permanent
/// failures and [`SdkError::retry_after`] for an error-dictated backoff.
#[derive(Debug, thiserror::Error)]
pub enum SdkError {
    /// A model could not be found. Not retryable.
    #[error("Model not found: {0}")]
    ModelNotFound(String),
    /// A directory could not be found. Not retryable.
    #[error("Directory not found: {0}")]
    DirectoryNotFound(String),
    /// `model_metadata.json` is missing from a model directory. Not retryable.
    #[error("model_metadata.json not found in directory: {0}")]
    MetadataNotFound(String),
    /// `model_metadata.json` exists but is invalid. Not retryable.
    #[error("model_metadata.json is invalid: {0}")]
    MetadataInvalid(String),
    /// Loading a model failed. Not retryable.
    #[error("Failed to load model: {0}")]
    LoadError(String),
    /// Inference failed. Not retryable.
    #[error("Inference failed: {0}")]
    InferenceError(String),
    /// Local streaming aborted under resource pressure with the caller's
    /// permission to retry on cloud (`AbortPolicy::fallback_to_cloud`).
    /// `run_streaming_with_fallback` catches this variant; lower-level
    /// streaming entry points (e.g. `run_streaming_with_options`) propagate
    /// it so callers can choose their own retry strategy.
    #[error("Aborted for cloud fallback: {reason}")]
    AbortedForCloudFallback {
        /// Why the local run was aborted.
        reason: xybrid_core::abort::AbortReason,
    },
    /// The model does not support streaming. Not retryable.
    #[error("Streaming not supported by this model")]
    StreamingNotSupported,
    /// The model is not loaded. Not retryable.
    #[error("Model not loaded")]
    NotLoaded,
    /// The supplied configuration is invalid. Not retryable.
    #[error("Invalid configuration: {0}")]
    ConfigError(String),
    /// A network-level failure. Treated as transient (retryable).
    #[error("Network error: {0}")]
    NetworkError(String),
    /// The registry could not be reached at all (DNS failure, connection refused,
    /// network unreachable, interface down). This is distinct from `NetworkError`
    /// because it represents *local* unreachability rather than a server-side problem,
    /// and the circuit breaker treats it differently — offline errors don't count
    /// toward the failure threshold so callers aren't punished for being offline.
    #[error("Registry unreachable: {0}")]
    Offline(String),
    /// An I/O error. `#[from]` generates `From<std::io::Error>`, so `?` on an
    /// `std::io::Result` converts into this variant. Not retryable.
    #[error("IO error: {0}")]
    IoError(#[from] std::io::Error),
    /// A cache operation failed. Not retryable.
    #[error("Cache error: {0}")]
    CacheError(String),
    /// A pipeline operation failed. Not retryable.
    #[error("Pipeline error: {0}")]
    PipelineError(String),
    /// The circuit breaker is open. Not retryable while it stays open.
    #[error("Circuit breaker open: {0}")]
    CircuitOpen(String),
    /// The request was rate limited. Retryable; `retry_after_secs` is the
    /// delay that [`SdkError::retry_after`] reports.
    #[error("Rate limited, retry after {retry_after_secs} seconds")]
    RateLimited { retry_after_secs: u64 },
    /// The request timed out after `timeout_ms` milliseconds. Retryable.
    #[error("Request timeout after {timeout_ms}ms")]
    Timeout { timeout_ms: u64 },
}

/// Result type for SDK operations: a `Result` whose error type is fixed to
/// [`SdkError`].
pub type SdkResult<T> = Result<T, SdkError>;

impl SdkError {
    /// Whether retrying the operation that produced this error could
    /// succeed without the caller changing anything.
    ///
    /// Transient failures (`NetworkError`, `RateLimited`, `Timeout`,
    /// `Offline`) are retryable; everything else — including
    /// `CircuitOpen`, `ConfigError`, `ModelNotFound`, `LoadError`,
    /// `InferenceError`, and `AbortedForCloudFallback` — is not. `Offline`
    /// is retryable only across *different* registry URLs (a fallback
    /// registry may be reachable when the primary isn't); within a single
    /// URL the retry loop short-circuits (see `registry_client`).
    ///
    /// This is the inherent form of the [`xybrid_core::http::RetryableError`]
    /// trait method, exposed directly on `SdkError` so callers (and the
    /// FFI / UniFFI layers) can query retryability without importing the
    /// trait. The trait impl forwards here.
    pub fn is_retryable(&self) -> bool {
        match self {
            // Retryable errors (transient failures)
            SdkError::NetworkError(_) => true,
            SdkError::RateLimited { .. } => true,
            SdkError::Timeout { .. } => true,
            // Offline is "retryable" only across URLs — the fallback registry
            // may be reachable even when the primary isn't. Within a single URL
            // the retry loop short-circuits immediately (see registry_client).
            SdkError::Offline(_) => true,

            // Non-retryable errors (permanent failures)
            SdkError::ModelNotFound(_) => false,
            SdkError::DirectoryNotFound(_) => false,
            SdkError::MetadataNotFound(_) => false,
            SdkError::MetadataInvalid(_) => false,
            SdkError::LoadError(_) => false,
            SdkError::InferenceError(_) => false,
            // Resource-driven abort is not retryable on the same path; the
            // wrapper redirects to cloud instead.
            SdkError::AbortedForCloudFallback { .. } => false,
            SdkError::StreamingNotSupported => false,
            SdkError::NotLoaded => false,
            SdkError::ConfigError(_) => false,
            SdkError::IoError(_) => false,
            SdkError::CacheError(_) => false,
            SdkError::PipelineError(_) => false,
            SdkError::CircuitOpen(_) => false, // Don't retry when circuit is open
        }
    }

    /// The minimum delay a caller should wait before retrying, when the
    /// error itself dictates one. Only `RateLimited` carries a
    /// server-specified backoff; every other variant returns `None`
    /// (the caller picks its own backoff if [`Self::is_retryable`]).
    ///
    /// Inherent form of [`xybrid_core::http::RetryableError::retry_after`];
    /// the trait impl forwards here.
    pub fn retry_after(&self) -> Option<std::time::Duration> {
        match self {
            SdkError::RateLimited { retry_after_secs } => {
                Some(std::time::Duration::from_secs(*retry_after_secs))
            }
            _ => None,
        }
    }
}

/// Implements the core [`xybrid_core::http::RetryableError`] trait for
/// [`SdkError`] by delegating to the inherent methods of the same name.
impl xybrid_core::http::RetryableError for SdkError {
    fn is_retryable(&self) -> bool {
        // `SdkError::is_retryable` resolves to the inherent method (inherent
        // associated items take precedence over trait items), so this call
        // forwards rather than recursing into this trait method.
        SdkError::is_retryable(self)
    }

    fn retry_after(&self) -> Option<std::time::Duration> {
        // Same resolution rule as above: this is the inherent `retry_after`.
        SdkError::retry_after(self)
    }
}

fn streaming_execution_error(error: xybrid_core::runtime_adapter::AdapterError) -> SdkError {
    match error {
        xybrid_core::runtime_adapter::AdapterError::AbortedForCloudFallback { reason } => {
            SdkError::AbortedForCloudFallback { reason }
        }
        other => SdkError::InferenceError(format!("Streaming execution failed: {}", other)),
    }
}

fn streaming_callback_error(error: Box<dyn std::error::Error + Send + Sync>) -> SdkError {
    if let Some(reason) = xybrid_core::abort::cloud_fallback_reason_from_error(error.as_ref()) {
        return SdkError::AbortedForCloudFallback { reason };
    }
    SdkError::InferenceError(format!("Streaming callback failed: {}", error))
}

/// Builds the error for a streaming run that was aborted before it started.
///
/// The abort becomes [`SdkError::AbortedForCloudFallback`] only when the
/// caller allowed cloud fallback (`fallback_to_cloud`) and the reason is not
/// `UserCancelled`. In every other case it is reported as
/// [`SdkError::InferenceError`] with an "Execution aborted" message.
fn streaming_pre_run_abort_error(
    reason: crate::run_options::AbortReason,
    fallback_to_cloud: bool,
) -> SdkError {
    let user_cancelled = matches!(reason, crate::run_options::AbortReason::UserCancelled);
    let eligible_for_cloud = fallback_to_cloud && !user_cancelled;

    // A user cancellation is never redirected to cloud, even with fallback on.
    if !eligible_for_cloud {
        return SdkError::InferenceError(format!("Execution aborted: {reason}"));
    }

    SdkError::AbortedForCloudFallback {
        reason: reason.to_core_abort_reason(),
    }
}

/// Information about a local→cloud handoff "seam" surfaced by
/// [`XybridModel::run_streaming_with_fallback`].
///
/// The wrapper invokes the caller's `on_seam` once when a local stream
/// aborts under resource pressure and the run is about to continue on
/// cloud. Callers use this to render UX cues ("switching to cloud…") or
/// reconcile telemetry against the same `correlation_id`.
///
/// In `dispatch_after_local` the `LocalAborted` telemetry event is published
/// before the seam callback runs, and the cancellation token is checked right
/// after it returns; a cancelled token skips the cloud retry.
#[derive(Debug, Clone)]
pub struct SeamInfo {
    /// Why the local run aborted.
    pub reason: xybrid_core::abort::AbortReason,
    /// Correlation id linking the local-aborted and cloud-retry telemetry events.
    pub correlation_id: String,
    /// Number of `PartialToken`s the local leg emitted before aborting.
    pub local_tokens: u32,
    /// Wall-clock latency of the local leg, in milliseconds.
    pub local_latency_ms: u32,
}

/// Maximum snapshot age handed to `current_snapshot` when sampling device
/// resources for the cloud-fallback policy check.
const FALLBACK_POLICY_RESOURCE_MAX_AGE: Duration = Duration::from_millis(500);

/// Lazily-initialised [`LocalAuthority`] shared by every fallback run in the
/// process.
static FALLBACK_AUTHORITY: OnceLock<LocalAuthority> = OnceLock::new();

/// Returns the shared fallback authority, constructing it with
/// `LocalAuthority::new` on first use.
fn fallback_authority() -> &'static dyn OrchestrationAuthority {
    let authority: &'static LocalAuthority = FALLBACK_AUTHORITY.get_or_init(LocalAuthority::new);
    authority
}

/// Assembles the [`xybrid_core::context::DeviceMetrics`] handed to the policy
/// engine before a cloud fallback.
///
/// A resource snapshot is taken from the caller's `resource_provider` when
/// one is set, otherwise from the global `ResourceMonitor`; both are asked
/// for a snapshot no older than `FALLBACK_POLICY_RESOURCE_MAX_AGE`. That
/// snapshot is then overlaid on the caller-supplied `device_metrics`, or on
/// `DeviceMetrics::default()` when none were supplied.
fn fallback_policy_metrics(options: &RunOptions) -> xybrid_core::context::DeviceMetrics {
    let live_snapshot = match options.resource_provider.as_ref() {
        Some(provider) => provider.current_snapshot(FALLBACK_POLICY_RESOURCE_MAX_AGE),
        None => xybrid_core::device::ResourceMonitor::global()
            .current_snapshot(FALLBACK_POLICY_RESOURCE_MAX_AGE),
    };

    // Start from the caller's metrics so RTT- and battery-based deny rules in
    // the policy engine see real values; without a device adapter this is
    // `DeviceMetrics::default()` (the historical behaviour). The freshly
    // sampled snapshot is layered on top.
    let caller_metrics = options.device_metrics.clone().unwrap_or_default();
    caller_metrics.with_live_snapshot(live_snapshot)
}

/// Builds a [`ResolvedTarget::Cloud`] for `provider`, using `"xybrid"` when no
/// provider is given.
fn cloud_target(provider: Option<&str>) -> ResolvedTarget {
    let provider = match provider {
        Some(name) => name.to_string(),
        None => "xybrid".to_string(),
    };
    ResolvedTarget::Cloud { provider }
}

/// Reports a failed on-device run that was aborted for cloud fallback to
/// `authority`.
///
/// The outcome targets `ResolvedTarget::Device`, is marked unsuccessful, uses
/// `model_id` as both stage id and model id, and carries `reason` both as
/// the error text (`reason.as_str()`) and in the outcome category.
fn record_local_abort_outcome(
    authority: &dyn OrchestrationAuthority,
    model_id: &str,
    reason: xybrid_core::abort::AbortReason,
    latency_ms: u32,
    signal_context: Option<SignalContext>,
) {
    let stage = model_id.to_string();
    let reason_text = reason.as_str().to_string();

    let outcome = ExecutionOutcome {
        stage_id: stage.clone(),
        target: ResolvedTarget::Device,
        latency_ms: u64::from(latency_ms),
        success: false,
        error: Some(reason_text),
        category: Some(OutcomeCategory::AbortedForCloudFallback { reason }),
        model_id: Some(stage),
        signal_context,
    };
    authority.record_outcome(&outcome);
}

/// Reports the outcome of a cloud leg (dispatched or denied) to `authority`.
///
/// The outcome targets the cloud provider resolved by `cloud_target`, uses
/// `model_id` as both stage id and model id, and passes `success`, `error`,
/// `category` and `signal_context` through unchanged.
fn record_cloud_outcome(
    authority: &dyn OrchestrationAuthority,
    model_id: &str,
    provider: Option<&str>,
    latency_ms: u32,
    success: bool,
    error: Option<String>,
    category: OutcomeCategory,
    signal_context: Option<SignalContext>,
) {
    let stage = model_id.to_string();

    let outcome = ExecutionOutcome {
        stage_id: stage.clone(),
        target: cloud_target(provider),
        latency_ms: u64::from(latency_ms),
        success,
        error,
        category: Some(category),
        model_id: Some(stage),
        signal_context,
    };
    authority.record_outcome(&outcome);
}

/// Asks `authority` to resolve a target for this model again and returns the
/// `local_reliability_hint` attached to that resolution, if any.
///
/// The stage context uses `model_id` as both stage id and model id, the
/// envelope's kind as the input kind, a clone of `metrics`, the global
/// `ResourceMonitor`, and the canonical device class derived from `metrics`.
/// No explicit target or local availability is supplied.
fn local_reliability_hint_after_abort(
    authority: &dyn OrchestrationAuthority,
    model_id: &str,
    envelope: &Envelope,
    metrics: &xybrid_core::context::DeviceMetrics,
) -> Option<LocalReliabilityHint> {
    let stage = model_id.to_string();

    let stage_context = StageContext {
        stage_id: stage.clone(),
        model_id: stage,
        input_kind: envelope.kind.clone(),
        metrics: metrics.clone(),
        resource_monitor: xybrid_core::device::ResourceMonitor::global(),
        explicit_target: None,
        local_availability: None,
        device_class: Some(metrics.canonical_device_class()),
        device_class_schema_version: Some(xybrid_core::context::DEVICE_CLASS_SCHEMA_VERSION),
    };

    let resolution = authority.resolve_target_with_feedback(&stage_context);
    resolution.local_reliability_hint
}

/// Inspect the local-leg result and, on a typed cloud-fallback abort, fire
/// `on_seam`, retry on the cloud adapter, and return the cloud
/// [`InferenceResult`]. On any other shape the original result is returned
/// unchanged.
///
/// `cancellation_token`, when set, makes the cloud retry leg honour
/// caller-driven cancellation. The cloud leg cannot meaningfully react to
/// resource pressure on the device, so only `UserCancelled` is consulted —
/// matching the local leg's contract.
///
/// Lives as a free function so unit tests can drive it directly without
/// constructing a real [`XybridModel`].
///
/// # Order of side effects on a fallback abort
///
/// 1. The local abort is recorded with `authority` and a reliability hint is
///    queried.
/// 2. The `LocalAborted` telemetry event is published.
/// 3. `on_seam` is invoked.
/// 4. The cancellation token is checked.
/// 5. The policy decision is requested; `Deny` and `Transform` stop here.
/// 6. The cloud adapter streams tokens through `on_token`.
/// 7. A `CloudRetry` event is published and the cloud outcome is recorded,
///    whether the cloud leg succeeded or failed.
///
/// # Returns
///
/// On cloud success, an [`InferenceResult`] built from the cloud output, the
/// cloud model id, and the saturating sum of local and cloud latency.
///
/// # Errors
///
/// - Any local error other than [`SdkError::AbortedForCloudFallback`] is
///   returned unchanged.
/// - [`SdkError::InferenceError`] when the cancellation token is already
///   cancelled after `on_seam` returns, or when the policy decision is `Deny`
///   or `Transform` (both reported as `cloud_denied_by_policy`).
/// - The result of [`streaming_execution_error`] when the cloud leg fails.
#[allow(clippy::too_many_arguments)]
fn dispatch_after_local<F, S>(
    local_result: SdkResult<InferenceResult>,
    envelope: &Envelope,
    cloud_adapter: &dyn xybrid_core::runtime_adapter::CloudStreaming,
    correlation_id: String,
    model_id: &str,
    local_tokens: u32,
    local_latency_ms: u32,
    local_resource_summary: Option<xybrid_core::device::ResourceUsageSummary>,
    authority: &dyn OrchestrationAuthority,
    policy_metrics: xybrid_core::context::DeviceMetrics,
    signal_context: Option<SignalContext>,
    cancellation_token: Option<CancellationToken>,
    on_token: &mut F,
    on_seam: &mut S,
) -> SdkResult<InferenceResult>
where
    F: FnMut(
            xybrid_core::runtime_adapter::types::PartialToken,
        ) -> Result<(), Box<dyn std::error::Error + Send + Sync>>
        + Send,
    S: FnMut(SeamInfo) + Send,
{
    match local_result {
        // Local leg succeeded: pass the result through untouched.
        Ok(result) => Ok(result),
        Err(SdkError::AbortedForCloudFallback { reason }) => {
            record_local_abort_outcome(
                authority,
                model_id,
                reason,
                local_latency_ms,
                signal_context,
            );
            let local_reliability_hint =
                local_reliability_hint_after_abort(authority, model_id, envelope, &policy_metrics);
            // Emit `LocalAborted` before the user's `on_seam` callback fires so
            // the audit trail exists even if the caller's seam handler panics.
            crate::telemetry::publish_local_aborted_with_details(
                &correlation_id,
                model_id,
                reason,
                local_latency_ms,
                local_tokens,
                local_resource_summary,
                local_reliability_hint,
            );

            let seam = SeamInfo {
                reason,
                correlation_id: correlation_id.clone(),
                local_tokens,
                local_latency_ms,
            };
            on_seam(seam);

            // Check cancellation now that the seam callback has run; a
            // cancelled token skips the policy check and the cloud retry.
            if cancellation_token
                .as_ref()
                .is_some_and(CancellationToken::is_cancelled)
            {
                return Err(SdkError::InferenceError(format!(
                    "Execution aborted: {}",
                    crate::run_options::AbortReason::UserCancelled
                )));
            }

            // FR-6: reuse the original prompt; no partial-token reuse.
            let cloud_envelope = envelope.clone();
            let cloud_provider = cloud_envelope.metadata.get("provider").cloned();
            // The cloud leg almost always runs a different model than the
            // local leg (e.g. local `qwen2.5-0.5b-instruct` falling back to
            // cloud `deepseek-chat`). The dashboard reads `model_id` off
            // the published event, so passing the local `model_id` through
            // to the cloud event makes the trace lie about which model
            // actually produced the cloud tokens. Pull the cloud model from
            // the envelope's `model` metadata (the same field the gateway
            // dispatches on) and fall back to the local id only when the
            // caller didn't set one — in that case the dispatch would have
            // failed at the gateway anyway, so the local id is a fine
            // last-resort label.
            let cloud_model_id = cloud_envelope
                .metadata
                .get("model")
                .map(|s| s.as_str())
                .unwrap_or(model_id);
            // `policy_metrics` is moved into the request here; it was only
            // borrowed for the reliability-hint lookup above.
            let policy_decision = authority.apply_policy(&PolicyRequest {
                stage_id: cloud_model_id.to_string(),
                envelope: cloud_envelope.clone(),
                metrics: policy_metrics,
            });
            match policy_decision.result {
                PolicyOutcome::Allow => {}
                PolicyOutcome::Deny {
                    reason: policy_reason,
                } => {
                    crate::telemetry::publish_cloud_denied_by_policy(
                        &correlation_id,
                        cloud_model_id,
                        reason,
                        &policy_reason,
                        local_latency_ms,
                    );
                    record_cloud_outcome(
                        authority,
                        cloud_model_id,
                        cloud_provider.as_deref(),
                        // Latency 0: the cloud request was never dispatched.
                        0,
                        false,
                        Some(format!("cloud_denied_by_policy: {}", policy_reason)),
                        OutcomeCategory::HardFail {
                            reason: "cloud_denied_by_policy".to_string(),
                        },
                        signal_context,
                    );
                    return Err(SdkError::InferenceError(format!(
                        "cloud_denied_by_policy: {}",
                        policy_reason
                    )));
                }
                PolicyOutcome::Transform { transforms } => {
                    // Defensive hard-fail: the orchestrator path applies
                    // PolicyEngine::redact when this variant fires, but the
                    // SDK fallback path does not yet plumb a redact seam.
                    // Treating Transform as Allow would silently dispatch
                    // the un-redacted envelope to cloud — a privacy
                    // regression the orchestrator path explicitly avoids.
                    // Fail closed until the redact seam is wired in.
                    let policy_reason =
                        format!("transforms_unsupported_in_fallback: {:?}", transforms);
                    crate::telemetry::publish_cloud_denied_by_policy(
                        &correlation_id,
                        cloud_model_id,
                        reason,
                        &policy_reason,
                        local_latency_ms,
                    );
                    record_cloud_outcome(
                        authority,
                        cloud_model_id,
                        cloud_provider.as_deref(),
                        // Latency 0: the cloud request was never dispatched.
                        0,
                        false,
                        Some(format!("cloud_denied_by_policy: {}", policy_reason)),
                        OutcomeCategory::HardFail {
                            reason: "cloud_denied_by_policy".to_string(),
                        },
                        signal_context,
                    );
                    return Err(SdkError::InferenceError(format!(
                        "cloud_denied_by_policy: {}",
                        policy_reason
                    )));
                }
            }

            let cloud_start = Instant::now();
            // Shared counter: the callback increments it per forwarded token
            // and the count is read back after the adapter call returns.
            let cloud_tokens = std::sync::Arc::new(std::sync::atomic::AtomicU32::new(0));
            let cloud_tokens_for_cb = cloud_tokens.clone();
            let cancellation_for_cb = cancellation_token.clone();
            let cloud_callback: xybrid_core::runtime_adapter::types::StreamingCallback<'_> =
                Box::new(move |token| {
                    // Honour user cancellation across the local→cloud seam.
                    // Resource-pressure signals are not consulted: the device's
                    // CPU/memory state says nothing about a cloud round-trip's
                    // viability, and treating it as a cloud-leg abort would
                    // surface CloudFallbackAbort errors at the wrong layer.
                    if let Some(token_handle) = cancellation_for_cb.as_ref() {
                        if token_handle.is_cancelled() {
                            return Err(Box::new(crate::run_options::AbortReason::UserCancelled)
                                as Box<dyn std::error::Error + Send + Sync>);
                        }
                    }
                    // Only tokens that pass the cancellation check are counted.
                    cloud_tokens_for_cb.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
                    on_token(token)
                });
            // Match instead of `?` so we publish a `CloudRetry` event on
            // BOTH branches. Without this the audit trail loses any record
            // that cloud was even attempted when the cloud leg fails (auth
            // misconfiguration, gateway 5xx, network partition, etc.) —
            // the trace would just stop after `LocalAborted` and the
            // dashboard would show no cloud activity at all.
            let cloud_result = cloud_adapter.execute_streaming(&cloud_envelope, cloud_callback);
            // NOTE(review): `as u32` truncates if the elapsed time exceeds
            // `u32::MAX` milliseconds.
            let cloud_latency_ms = cloud_start.elapsed().as_millis() as u32;
            let cloud_token_count = cloud_tokens.load(std::sync::atomic::Ordering::SeqCst);

            match cloud_result {
                Ok(cloud_output) => {
                    crate::telemetry::publish_cloud_retry(
                        &correlation_id,
                        cloud_model_id,
                        cloud_provider.as_deref(),
                        cloud_latency_ms,
                        cloud_token_count,
                        None,
                    );
                    record_cloud_outcome(
                        authority,
                        cloud_model_id,
                        cloud_provider.as_deref(),
                        cloud_latency_ms,
                        true,
                        None,
                        OutcomeCategory::Success,
                        signal_context,
                    );
                    // Reported latency covers both legs; saturating so the
                    // sum cannot overflow.
                    let total_latency_ms = local_latency_ms.saturating_add(cloud_latency_ms);
                    Ok(InferenceResult::new(
                        cloud_output,
                        cloud_model_id,
                        total_latency_ms,
                    ))
                }
                Err(adapter_err) => {
                    let error_message = adapter_err.to_string();
                    let telemetry_error =
                        crate::telemetry::redact_error_for_telemetry(&error_message);
                    // NOTE(review): the raw `error_message`, not the redacted
                    // `telemetry_error`, is passed to `publish_cloud_retry`
                    // below — confirm that function redacts internally.
                    crate::telemetry::publish_cloud_retry(
                        &correlation_id,
                        cloud_model_id,
                        cloud_provider.as_deref(),
                        cloud_latency_ms,
                        cloud_token_count,
                        Some(&error_message),
                    );
                    record_cloud_outcome(
                        authority,
                        cloud_model_id,
                        cloud_provider.as_deref(),
                        cloud_latency_ms,
                        false,
                        Some(telemetry_error.clone()),
                        OutcomeCategory::HardFail {
                            reason: telemetry_error,
                        },
                        signal_context,
                    );
                    Err(streaming_execution_error(adapter_err))
                }
            }
        }
        // Any other local error is propagated as-is; no cloud retry is made.
        Err(other) => Err(other),
    }
}

/// Configuration for streaming ASR sessions.
///
/// The `Default` impl gives: VAD disabled, threshold `0.5`, language `"en"`,
/// and no explicit VAD model directory.
#[derive(Debug, Clone)]
pub struct StreamConfig {
    /// Enable VAD (Voice Activity Detection) for smart chunking.
    pub enable_vad: bool,
    /// VAD threshold (0.0-1.0). The range is not enforced by this type.
    pub vad_threshold: f32,
    /// Language hint for ASR.
    pub language: Option<String>,
    /// Path to VAD model (uses default if None).
    pub vad_model_dir: Option<String>,
}

impl Default for StreamConfig {
    fn default() -> Self {
        Self {
            enable_vad: false,
            vad_threshold: 0.5,
            language: Some("en".to_string()),
            vad_model_dir: None,
        }
    }
}

impl StreamConfig {
    /// Returns the default configuration with VAD switched on.
    pub fn with_vad() -> Self {
        let mut config = Self::default();
        config.enable_vad = true;
        config
    }

    /// Replaces the language hint and returns the updated configuration.
    pub fn language(self, lang: impl Into<String>) -> Self {
        Self {
            language: Some(lang.into()),
            ..self
        }
    }

    /// Replaces the VAD threshold and returns the updated configuration.
    ///
    /// The value is stored as given; no range check is applied.
    pub fn vad_threshold(self, threshold: f32) -> Self {
        Self {
            vad_threshold: threshold,
            ..self
        }
    }
}

/// Internal handle holding the loaded model state.
///
/// Private to this module: it groups the executor, the model's metadata, the
/// on-disk model location, and a flag recording whether the model is loaded.
struct ModelHandle {
    /// Template executor for running inference.
    executor: TemplateExecutor,
    /// Model metadata.
    metadata: ModelMetadata,
    /// Model directory path (permanent extraction in cache).
    model_dir: PathBuf,
    /// Whether model is currently loaded.
    loaded: bool,
}

/// GGUF quantization preference order for automatic selection.
/// Q4_K_M is the default — best quality/size tradeoff for edge devices.
const GGUF_PREFERENCE_ORDER: &[&str] = &[
    "Q4_K_M", "Q4_K_S", "Q4_0", "Q5_K_M", "Q5_K_S", "Q6_K", "Q8_0", "F16", "BF16", "F32",
];

/// Returns `true` when `needle` occurs in `haystack` as a whole token, i.e.
/// the characters directly before and after the match (if any) are not ASCII
/// alphanumerics.
///
/// This is what keeps `F16` from matching inside `BF16`. Underscores, dashes
/// and dots count as boundaries, so a prefix such as `Q4_K` still matches a
/// `Q4_K_M` file. An empty `needle` never matches.
fn contains_quant_token(haystack: &str, needle: &str) -> bool {
    if needle.is_empty() {
        return false;
    }
    let is_boundary =
        |neighbour: Option<char>| neighbour.map_or(true, |c| !c.is_ascii_alphanumeric());
    haystack.match_indices(needle).any(|(start, matched)| {
        let end = start + matched.len();
        is_boundary(haystack[..start].chars().next_back())
            && is_boundary(haystack[end..].chars().next())
    })
}

/// Finds the file for one quantization string.
///
/// `upper_names` holds the upper-cased form of each entry in `gguf_files`, in
/// the same order; `quant_upper` must already be upper-cased. A whole-token
/// match is preferred. If none exists, the first plain substring match is
/// used, so any input that used to match still matches.
fn find_gguf_by_quant<'a>(
    gguf_files: &[&'a str],
    upper_names: &[String],
    quant_upper: &str,
) -> Option<&'a str> {
    let index = upper_names
        .iter()
        .position(|name| contains_quant_token(name, quant_upper))
        .or_else(|| {
            upper_names
                .iter()
                .position(|name| name.contains(quant_upper))
        })?;
    gguf_files.get(index).copied()
}

/// Select the best GGUF file from a list based on user preference or default ranking.
///
/// If `variant` is specified, finds a file containing that quantization string
/// (case-insensitive). Otherwise, selects the file matching the highest-priority
/// quantization from `GGUF_PREFERENCE_ORDER`. In both modes a file where the
/// quantization appears as a whole token wins over one that merely contains it
/// as a substring, so asking for `F16` does not return a `BF16` file when a
/// real `F16` file is available.
///
/// If no preferred quantization matches, the first listed file is returned.
///
/// # Errors
///
/// Returns [`SdkError::LoadError`] when `variant` is given but no file matches
/// it, or when `gguf_files` is empty.
fn select_gguf_variant(gguf_files: &[&str], variant: Option<&str>) -> SdkResult<String> {
    // Upper-case every name once instead of once per comparison.
    let upper_names: Vec<String> = gguf_files.iter().map(|f| f.to_uppercase()).collect();

    if let Some(v) = variant {
        return find_gguf_by_quant(gguf_files, &upper_names, &v.to_uppercase())
            .map(str::to_string)
            .ok_or_else(|| {
                SdkError::LoadError(format!(
                    "No GGUF file matching variant '{}'. Available: {}",
                    v,
                    gguf_files.join(", ")
                ))
            });
    }

    // Auto-select: try each preferred quantization in order.
    if let Some(found) = GGUF_PREFERENCE_ORDER
        .iter()
        .find_map(|pref| find_gguf_by_quant(gguf_files, &upper_names, pref))
    {
        return Ok(found.to_string());
    }

    // Fallback: no recognised quantization in any name, so take the first
    // listed file. File sizes are not known here, so no size-based choice is
    // possible.
    gguf_files
        .first()
        .map(|f| f.to_string())
        .ok_or_else(|| SdkError::LoadError("No GGUF files found".to_string()))
}

/// Represents a model that can be loaded.
///
/// Created by `Xybrid::model()`, must call `.load()` to use.
/// This is a preparatory step that doesn't download or load anything.
///
/// # Example (Recommended - Registry-based)
///
/// ```no_run
/// # fn _example() -> Result<(), Box<dyn std::error::Error>> {
/// # use xybrid_sdk::ModelLoader;
/// # use xybrid_sdk::ir::Envelope;
/// # let envelope: Envelope = unimplemented!();
/// // Load using registry (recommended - auto-resolves to best variant)
/// let loader = ModelLoader::from_registry("kokoro-82m");
/// let model = loader.load()?;
/// let result = model.run(&envelope, None)?;
/// # Ok(())
/// # }
/// ```
///
/// # Example (With progress callback)
///
/// ```no_run
/// # fn _example() -> Result<(), Box<dyn std::error::Error>> {
/// # use xybrid_sdk::ModelLoader;
/// let loader = ModelLoader::from_registry("kokoro-82m");
/// let model = loader.load_with_progress(|progress| {
///     println!("Download: {:.1}%", progress * 100.0);
/// })?;
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone)]
pub struct ModelLoader {
    /// Where the model comes from.
    source: ModelSource,
    /// Model identifier, when one is known.
    model_id: Option<String>,
    /// Version, when one is pinned; `None` when the constructor leaves it
    /// unset.
    version: Option<String>,
}

impl ModelLoader {
    /// Build a loader that resolves `id` through the registry (recommended).
    ///
    /// Loading uses the registry API to resolve the model ID to the best variant
    /// for the current platform, then downloads from HuggingFace with
    /// caching and SHA256 verification.
    ///
    /// # Example
    /// ```no_run
    /// # fn _example() -> Result<(), Box<dyn std::error::Error>> {
    /// # use xybrid_sdk::ModelLoader;
    /// let loader = ModelLoader::from_registry("kokoro-82m");
    /// let model = loader.load()?;
    /// # Ok(())
    /// # }
    /// ```
    pub fn from_registry(id: &str) -> Self {
        let source = ModelSource::registry(id);
        let model_id = Some(id.to_owned());
        Self {
            source,
            model_id,
            // Left unset: the registry API resolves the version.
            version: None,
        }
    }

    /// Build a registry loader pinned to an explicit `platform` string.
    ///
    /// # Example
    /// ```no_run
    /// # fn _example() -> Result<(), Box<dyn std::error::Error>> {
    /// # use xybrid_sdk::ModelLoader;
    /// let loader = ModelLoader::from_registry_with_platform("kokoro-82m", "macos-arm64");
    /// let model = loader.load()?;
    /// # Ok(())
    /// # }
    /// ```
    pub fn from_registry_with_platform(id: &str, platform: &str) -> Self {
        let source = ModelSource::registry_with_platform(id, platform);
        let model_id = Some(id.to_owned());
        Self {
            source,
            model_id,
            version: None,
        }
    }

    /// Build a loader for the legacy registry, addressed by its base URL.
    ///
    /// Records `model_id` and `version` on the loader as given.
    ///
    /// # Deprecated
    /// Use `from_registry()` instead for automatic platform resolution and caching.
    #[deprecated(since = "0.0.17", note = "Use ModelLoader::from_registry() instead")]
    #[allow(deprecated)]
    pub fn from_legacy_registry(url: &str, model_id: &str, version: &str) -> Self {
        let source = ModelSource::legacy_registry(url, model_id, version);
        let recorded_id = Some(model_id.to_owned());
        let recorded_version = Some(version.to_owned());
        Self {
            source,
            model_id: recorded_id,
            version: recorded_version,
        }
    }

    /// Build a legacy-registry loader pinned to an explicit `platform` string.
    ///
    /// Records `model_id` and `version` on the loader as given.
    ///
    /// # Deprecated
    /// Use `from_registry_with_platform()` instead.
    #[deprecated(
        since = "0.0.17",
        note = "Use ModelLoader::from_registry_with_platform() instead"
    )]
    #[allow(deprecated)]
    pub fn from_legacy_registry_with_platform(
        url: &str,
        model_id: &str,
        version: &str,
        platform: &str,
    ) -> Self {
        let source = ModelSource::legacy_registry_with_platform(url, model_id, version, platform);
        let recorded_id = Some(model_id.to_owned());
        let recorded_version = Some(version.to_owned());
        Self {
            source,
            model_id: recorded_id,
            version: recorded_version,
        }
    }

    /// Build a loader for a bundle file on the local filesystem.
    ///
    /// # Errors
    ///
    /// Returns `SdkError::ModelNotFound` if nothing exists at `path`.
    pub fn from_bundle(path: impl Into<PathBuf>) -> SdkResult<Self> {
        let bundle_path: PathBuf = path.into();
        if bundle_path.exists() {
            // Model ID and version are not known until the bundle is loaded.
            Ok(Self {
                source: ModelSource::bundle(bundle_path),
                model_id: None,
                version: None,
            })
        } else {
            Err(SdkError::ModelNotFound(format!(
                "Bundle not found: {:?}",
                bundle_path
            )))
        }
    }

    /// Create loader from local model directory.
    ///
    /// The directory must exist and contain a valid `model_metadata.json` file.
    ///
    /// # Errors
    ///
    /// - `SdkError::DirectoryNotFound` if the path does not exist
    /// - `SdkError::MetadataNotFound` if `model_metadata.json` is missing
    /// - `SdkError::MetadataInvalid` if `model_metadata.json` contains invalid JSON
    pub fn from_directory(path: impl Into<PathBuf>) -> SdkResult<Self> {
        let path: PathBuf = path.into();
        if !path.exists() {
            return Err(SdkError::DirectoryNotFound(path.display().to_string()));
        }
        let metadata_path = path.join("model_metadata.json");
        if !metadata_path.exists() {
            return Err(SdkError::MetadataNotFound(path.display().to_string()));
        }
        // Validate that the metadata is valid JSON
        let metadata_str = std::fs::read_to_string(&metadata_path).map_err(|e| {
            SdkError::MetadataInvalid(format!("failed to read model_metadata.json: {}", e))
        })?;
        let _metadata: xybrid_core::execution::ModelMetadata = serde_json::from_str(&metadata_str)
            .map_err(|e| {
                SdkError::MetadataInvalid(format!("invalid model_metadata.json: {}", e))
            })?;
        Ok(Self {
            source: ModelSource::directory(path),
            model_id: None,
            version: None,
        })
    }

    /// Build a loader for a HuggingFace Hub repository.
    ///
    /// Loading downloads model files from the HuggingFace Hub and caches them
    /// locally; subsequent loads use the cached files. If the repository has no
    /// `model_metadata.json`, the loader attempts to generate one and fails with
    /// an error when that is not possible.
    ///
    /// Requires the `huggingface` feature flag at load time.
    /// The constructor itself is always available, but `load()` will return
    /// `SdkError::ConfigError` if the feature is not enabled.
    ///
    /// # Example
    /// ```no_run
    /// # fn _example() -> Result<(), Box<dyn std::error::Error>> {
    /// # use xybrid_sdk::ModelLoader;
    /// let loader = ModelLoader::from_huggingface("xybrid-ai/kokoro-82m");
    /// let model = loader.load()?;
    /// # Ok(())
    /// # }
    /// ```
    pub fn from_huggingface(repo: &str) -> Self {
        let source = ModelSource::huggingface(repo);
        // The repository name doubles as the model ID.
        let model_id = Some(repo.to_owned());
        Self {
            source,
            model_id,
            version: None,
        }
    }

    /// Build a HuggingFace Hub loader pinned to an explicit revision.
    ///
    /// The revision is also recorded as the loader's version.
    ///
    /// # Example
    /// ```no_run
    /// # fn _example() -> Result<(), Box<dyn std::error::Error>> {
    /// # use xybrid_sdk::ModelLoader;
    /// let loader = ModelLoader::from_huggingface_with_revision("xybrid-ai/kokoro-82m", "v1.0");
    /// let model = loader.load()?;
    /// # Ok(())
    /// # }
    /// ```
    pub fn from_huggingface_with_revision(repo: &str, revision: &str) -> Self {
        let source = ModelSource::huggingface_with_revision(repo, revision);
        let model_id = Some(repo.to_owned());
        let version = Some(revision.to_owned());
        Self {
            source,
            model_id,
            version,
        }
    }

    /// Build a HuggingFace loader from a repo string with an optional variant suffix.
    ///
    /// Supports `"org/repo:Q8_0"` syntax to select a specific GGUF quantization.
    /// Without a variant, defaults to Q4_K_M for GGUF repos.
    ///
    /// The loader's model ID is whatever the parsed source reports; if it reports
    /// none, the raw `input` is used instead.
    ///
    /// # Example
    /// ```no_run
    /// # fn _example() -> Result<(), Box<dyn std::error::Error>> {
    /// # use xybrid_sdk::ModelLoader;
    /// let loader = ModelLoader::from_huggingface_parsed("LiquidAI/LFM2.5-350M-GGUF:Q8_0");
    /// let model = loader.load()?;
    /// # Ok(())
    /// # }
    /// ```
    pub fn from_huggingface_parsed(input: &str) -> Self {
        let source = ModelSource::parse_huggingface(input);
        let model_id = match source.model_id() {
            Some(parsed_repo) => parsed_repo.to_string(),
            None => input.to_string(),
        };
        Self {
            source,
            model_id: Some(model_id),
            version: None,
        }
    }

    /// Model ID recorded on this loader, or `None` when the source did not provide one.
    pub fn model_id(&self) -> Option<&str> {
        let id = self.model_id.as_ref()?;
        Some(id.as_str())
    }

    /// Version recorded on this loader, or `None` when the source did not provide one.
    pub fn version(&self) -> Option<&str> {
        let version = self.version.as_ref()?;
        Some(version.as_str())
    }

    /// Name of the source kind, as reported by the underlying `ModelSource`.
    pub fn source_type(&self) -> &'static str {
        let kind: &'static str = self.source.source_type();
        kind
    }

    /// Load the model into memory, blocking until it is ready.
    ///
    /// Equivalent to `load_with_progress` with a callback that ignores every
    /// progress report. Depending on the source this will:
    /// - For registry: Resolve via registry API, download from HuggingFace (with caching)
    /// - For legacy_registry (deprecated): Download the bundle if not cached, extract it
    /// - For bundle: Extract the bundle to a temp directory
    /// - For directory: Load directly from the directory
    ///
    /// Returns a loaded `XybridModel` ready for inference.
    #[allow(deprecated)]
    pub fn load(&self) -> SdkResult<XybridModel> {
        // Plain `load` has no progress consumer, so reports are dropped.
        let discard_progress = |_: f32| {};
        self.load_with_progress(discard_progress)
    }

    /// Load the model, reporting download progress through `progress_callback`.
    ///
    /// The callback receives progress as a float from 0.0 to 1.0. It is forwarded
    /// only to the registry and HuggingFace loading paths; legacy-registry,
    /// bundle and directory sources never invoke it.
    ///
    /// # Example
    /// ```no_run
    /// # fn _example() -> Result<(), Box<dyn std::error::Error>> {
    /// # use xybrid_sdk::ModelLoader;
    /// # let loader: ModelLoader = unimplemented!();
    /// let model = loader.load_with_progress(|progress| {
    ///     println!("Download: {:.1}%", progress * 100.0);
    /// })?;
    /// # Ok(())
    /// # }
    /// ```
    #[allow(deprecated)]
    pub fn load_with_progress<F>(&self, progress_callback: F) -> SdkResult<XybridModel>
    where
        F: Fn(f32),
    {
        // Dispatch on the source kind; each arm delegates to its dedicated loader.
        let source = &self.source;
        match source {
            ModelSource::Registry {
                id: registry_id,
                platform: target,
            } => self.load_from_registry_api(registry_id, target.as_deref(), progress_callback),
            ModelSource::LegacyRegistry {
                url: base_url,
                model_id: legacy_id,
                version: legacy_version,
                platform: target,
            } => self.load_from_legacy_registry(
                base_url,
                legacy_id,
                legacy_version,
                target.as_deref(),
            ),
            ModelSource::Bundle { path: bundle_path } => self.load_from_bundle(bundle_path),
            ModelSource::Directory { path: model_dir } => self.load_from_directory(model_dir),
            ModelSource::HuggingFace {
                repo: hf_repo,
                revision: hf_revision,
                variant: hf_variant,
            } => self.load_from_huggingface(
                hf_repo,
                hf_revision.as_deref(),
                hf_variant.as_deref(),
                progress_callback,
            ),
        }
    }

    /// Load the model asynchronously.
    pub async fn load_async(&self) -> SdkResult<XybridModel> {
        // For now, wrap the sync version. Real async would use tokio::fs and async HTTP.
        let loader = self.clone();
        tokio::task::spawn_blocking(move || loader.load())
            .await
            .map_err(|e| SdkError::LoadError(format!("Task join error: {}", e)))?
    }

    /// Load a model by ID through `RegistryClient`.
    ///
    /// This is the recommended loading method - it uses the registry API to resolve
    /// the model ID to the best variant for the platform, downloads from HuggingFace,
    /// and caches locally with SHA256 verification.
    ///
    /// `progress_callback` is handed to the client's fetch step unchanged.
    fn load_from_registry_api<F>(
        &self,
        id: &str,
        platform: Option<&str>,
        progress_callback: F,
    ) -> SdkResult<XybridModel>
    where
        F: Fn(f32),
    {
        // Client configuration comes from the environment (or its defaults).
        let registry = RegistryClient::from_env()?;

        // The fetch covers both .xyb bundles and passthrough GGUF files and
        // yields the directory holding the extracted model.
        let extracted_dir = registry.fetch_extracted(id, platform, progress_callback)?;

        self.load_from_directory(&extracted_dir)
    }

    /// Load from legacy registry (deprecated - use load_from_registry_api instead).
    fn load_from_legacy_registry(
        &self,
        url: &str,
        model_id: &str,
        version: &str,
        platform: Option<&str>,
    ) -> SdkResult<XybridModel> {
        let platform = platform.map(String::from).unwrap_or_else(detect_platform);

        // Build bundle URL
        let bundle_url = format!(
            "{}/bundles/{}/{}/{}/{}.xyb",
            url.trim_end_matches('/'),
            model_id,
            version,
            platform,
            model_id
        );

        // Download bundle to temp file
        let temp_dir = TempDir::new().map_err(SdkError::IoError)?;
        let bundle_path = temp_dir.path().join(format!("{}.xyb", model_id));

        // Use blocking HTTP client
        let response = ureq::get(&bundle_url)
            .call()
            .map_err(|e| SdkError::NetworkError(format!("Failed to download bundle: {}", e)))?;

        if response.status() != 200 {
            return Err(SdkError::ModelNotFound(format!(
                "Bundle not found at registry: {} (status {})",
                bundle_url,
                response.status()
            )));
        }

        // Write bundle to temp file
        let mut file = std::fs::File::create(&bundle_path)?;
        std::io::copy(&mut response.into_reader(), &mut file)?;

        // Extract using CacheManager (extracts to permanent cache location)
        // The temp_dir will be dropped after this, but extracted files persist
        self.load_from_bundle(&bundle_path)
    }

    /// Extract a bundle through the cache manager and load the extracted directory.
    fn load_from_bundle(&self, path: &PathBuf) -> SdkResult<XybridModel> {
        // CacheManager is the single source of truth for extraction; the
        // extracted directory is a permanent cache entry.
        let cache = crate::cache::CacheManager::new()?;
        let extracted_dir = cache.ensure_extracted(path)?;

        // From here on the steps are exactly those of a directory load.
        self.load_from_directory(&extracted_dir)
    }

    /// Load a model from HuggingFace Hub.
    ///
    /// Only downloads the selected GGUF variant (defaults to Q4_K_M) plus essential
    /// supporting files (config, tokenizer, README). Avoids downloading all variants.
    ///
    /// Steps:
    /// 1. If the SDK cache directory for `repo` already holds a
    ///    `model_metadata.json`, load straight from it.
    /// 2. Otherwise list the repo's files, pick one GGUF when several exist
    ///    (honoring `variant`), and fetch the needed files through hf-hub.
    /// 3. Link (Unix) or copy (elsewhere) each fetched file into the SDK cache.
    /// 4. Generate `model_metadata.json` when the repo did not ship one.
    ///
    /// `progress_callback` receives `files_done / files_total` before each file
    /// and `1.0` once all files are in place.
    ///
    /// NOTE(review): the cache directory is derived from `repo` alone, so a
    /// cached copy is reused even when `revision` or `variant` differ from the
    /// ones that populated it — confirm whether that is intended.
    ///
    /// # Errors
    ///
    /// - `SdkError::NetworkError` if the API client, repo listing, or a download fails
    /// - `SdkError::LoadError` if the repo has no files or no GGUF matches `variant`
    /// - `SdkError::MetadataNotFound` if metadata is missing and cannot be generated
    /// - I/O errors from creating directories, links, or copies
    #[cfg(feature = "huggingface")]
    fn load_from_huggingface<F>(
        &self,
        repo: &str,
        revision: Option<&str>,
        variant: Option<&str>,
        progress_callback: F,
    ) -> SdkResult<XybridModel>
    where
        F: Fn(f32),
    {
        use hf_hub::{api::sync::Api, Repo, RepoType};

        // Determine our cache directory
        let cache_dir = Self::hf_cache_dir(repo)?;

        // Check if we already have a cached copy with model_metadata.json
        let metadata_path = cache_dir.join("model_metadata.json");
        if metadata_path.exists() {
            log::info!(target: "xybrid_sdk", "Loading HuggingFace model from cache: {}", cache_dir.display());
            return self.load_from_directory(&cache_dir);
        }

        log::info!(target: "xybrid_sdk", "Downloading model from HuggingFace: {}", repo);

        // Create HF API client
        let api = Api::new().map_err(|e| {
            SdkError::NetworkError(format!("Failed to create HuggingFace API client: {}", e))
        })?;

        // Create repo reference with optional revision
        let hf_repo = if let Some(rev) = revision {
            Repo::with_revision(repo.to_string(), RepoType::Model, rev.to_string())
        } else {
            Repo::new(repo.to_string(), RepoType::Model)
        };
        let repo_api = api.repo(hf_repo);

        // Get repo info to list all files
        let repo_info = repo_api.info().map_err(|e| {
            SdkError::NetworkError(format!(
                "Failed to get HuggingFace repo info for '{}': {}",
                repo, e
            ))
        })?;

        let siblings = repo_info.siblings;
        if siblings.is_empty() {
            return Err(SdkError::LoadError(format!(
                "HuggingFace repo '{}' has no files",
                repo
            )));
        }

        // Classify files by type to enable smart filtering
        let all_filenames: Vec<&str> = siblings.iter().map(|s| s.rfilename.as_str()).collect();
        let gguf_files: Vec<&str> = all_filenames
            .iter()
            .filter(|f| f.ends_with(".gguf"))
            .copied()
            .collect();

        // If multiple GGUF files exist, select the best one instead of downloading all
        let selected_gguf = if gguf_files.len() > 1 {
            Some(select_gguf_variant(&gguf_files, variant)?)
        } else {
            None
        };

        if let Some(ref selected) = selected_gguf {
            log::info!(
                target: "xybrid_sdk",
                "Selected GGUF variant: {} (from {} available)",
                selected, gguf_files.len()
            );
        }

        // Create cache directory
        std::fs::create_dir_all(&cache_dir)?;

        // For GGUF repos only essential files are fetched (skips LICENSE,
        // subdirectories like leap/, ...). Same for every file, so decide once.
        let essential_only = selected_gguf.is_some() || gguf_files.len() == 1;

        // Filter to only files we need
        let files_to_download: Vec<&str> = all_filenames
            .iter()
            .filter(|filename| {
                // Skip hidden files and directories
                if filename.starts_with('.') || filename.ends_with('/') {
                    return false;
                }

                // If we have a selected GGUF, skip other GGUF files
                if let Some(ref selected) = selected_gguf {
                    if filename.ends_with(".gguf") && **filename != *selected {
                        return false;
                    }
                }

                if essential_only {
                    Self::is_essential_file(filename)
                } else {
                    true
                }
            })
            .copied()
            .collect();

        let total_files = files_to_download.len();
        for (i, filename) in files_to_download.iter().enumerate() {
            log::debug!(target: "xybrid_sdk", "Downloading [{}/{}]: {}", i + 1, total_files, filename);

            // Report approximate progress
            progress_callback((i as f32) / (total_files as f32));

            // Download file (hf-hub caches internally)
            let cached_path = repo_api.get(filename).map_err(|e| {
                SdkError::NetworkError(format!(
                    "Failed to download '{}' from '{}': {}",
                    filename, repo, e
                ))
            })?;

            // Create target path in our cache directory
            let target_path = cache_dir.join(filename);

            // Create parent directories if the file is in a subdirectory
            if let Some(parent) = target_path.parent() {
                std::fs::create_dir_all(parent)?;
            }

            // Skip if already exists in our cache
            if target_path.exists() {
                continue;
            }

            // `exists()` follows symlinks, so it is also false for a link whose
            // hf-hub target has since been removed. Such a stale link still
            // occupies the path and would make the link/copy below fail, so
            // clear it first.
            if target_path.symlink_metadata().is_ok() {
                std::fs::remove_file(&target_path)?;
            }

            // Create symlink to hf-hub's cached file (avoids duplication)
            #[cfg(unix)]
            {
                std::os::unix::fs::symlink(&cached_path, &target_path).map_err(|e| {
                    SdkError::IoError(std::io::Error::other(format!(
                        "Failed to symlink {} -> {}: {}",
                        cached_path.display(),
                        target_path.display(),
                        e
                    )))
                })?;
            }

            // On Windows, copy the file instead
            #[cfg(not(unix))]
            {
                std::fs::copy(&cached_path, &target_path)?;
            }
        }

        // Report completion
        progress_callback(1.0);

        // Auto-generate model_metadata.json if not provided by the repo
        if !metadata_path.exists() {
            log::info!(
                target: "xybrid_sdk",
                "No model_metadata.json in repo '{}', attempting auto-generation...",
                repo
            );
            match crate::metadata_gen::generate_metadata(&cache_dir, repo) {
                Ok((_metadata, _task_inference)) => {
                    log::info!(
                        target: "xybrid_sdk",
                        "Auto-generated model_metadata.json for '{}'. \
                         Review and adjust if inference results are unexpected.",
                        repo
                    );
                }
                Err(e) => {
                    return Err(SdkError::MetadataNotFound(format!(
                        "HuggingFace repo '{}' does not contain model_metadata.json and \
                         auto-generation failed: {}. \
                         Create one manually — see docs/sdk/MODEL_METADATA.md",
                        repo, e
                    )));
                }
            }
        }

        log::info!(target: "xybrid_sdk", "Model cached at: {}", cache_dir.display());
        self.load_from_directory(&cache_dir)
    }

    /// Stand-in used when the crate is built without the `huggingface` feature.
    ///
    /// Ignores every argument and always returns `SdkError::ConfigError`
    /// explaining how to enable the feature.
    #[cfg(not(feature = "huggingface"))]
    fn load_from_huggingface<F>(
        &self,
        _repo: &str,
        _revision: Option<&str>,
        _variant: Option<&str>,
        _progress_callback: F,
    ) -> SdkResult<XybridModel>
    where
        F: Fn(f32),
    {
        let message = "HuggingFace loading requires the 'huggingface' feature flag. \
             Enable it with: cargo build --features huggingface"
            .to_string();
        Err(SdkError::ConfigError(message))
    }

    /// Compute the local cache directory for a HuggingFace repo.
    ///
    /// Returns `~/.xybrid/cache/hf/{sanitized_repo}/` or the SDK-configured cache path.
    ///
    /// # Errors
    ///
    /// Returns `SdkError::CacheError` when no SDK cache directory is configured
    /// and the home directory cannot be determined.
    fn hf_cache_dir(repo: &str) -> SdkResult<PathBuf> {
        let hf_root = match crate::get_sdk_cache_dir() {
            Some(sdk_cache) => sdk_cache.join("hf"),
            None => dirs::home_dir()
                .ok_or_else(|| SdkError::CacheError("Cannot determine home directory".to_string()))?
                .join(".xybrid")
                .join("cache")
                .join("hf"),
        };

        // Flatten the repo name into a single path component
        // (e.g., "xybrid-ai/kokoro-82m" -> "xybrid-ai--kokoro-82m")
        let dir_name = repo.replace('/', "--");
        Ok(hf_root.join(dir_name))
    }

    /// Decide whether a repo file is essential and should always be downloaded.
    ///
    /// A file is essential when its name ends in a model extension (`.gguf`,
    /// `.onnx`, `.safetensors`) or when its final path component is one of the
    /// known metadata/supporting files (config, tokenizer, vocab, README).
    /// Everything else (LICENSE, leap/, etc.) is treated as non-essential.
    /// Matching is case-sensitive.
    fn is_essential_file(filename: &str) -> bool {
        const MODEL_EXTENSIONS: [&str; 3] = [".gguf", ".onnx", ".safetensors"];
        const SUPPORT_FILES: [&str; 12] = [
            "model_metadata.json",
            "config.json",
            "tokenizer.json",
            "tokenizer_config.json",
            "special_tokens_map.json",
            "vocab.json",
            "vocab.txt",
            "tokens.txt",
            "merges.txt",
            "preprocessor_config.json",
            "generation_config.json",
            "README.md",
        ];

        if MODEL_EXTENSIONS.iter().any(|ext| filename.ends_with(ext)) {
            return true;
        }

        // Supporting files are recognized by name regardless of subdirectory.
        let basename = match filename.rfind('/') {
            Some(slash) => &filename[slash + 1..],
            None => filename,
        };
        SUPPORT_FILES.contains(&basename)
    }

    /// Load a model from a directory that contains `model_metadata.json`.
    ///
    /// Builds the model handle, then derives the model's ID, version, output
    /// type and streaming capability from the handle's metadata.
    fn load_from_directory(&self, path: &PathBuf) -> SdkResult<XybridModel> {
        let handle = Self::create_model_handle(path)?;

        // Everything below is read from the metadata before the handle is
        // moved behind the shared lock.
        let metadata = &handle.metadata;
        let (model_id, version) = (metadata.model_id.clone(), metadata.version.clone());
        let output_type = Self::infer_output_type(metadata);
        let supports_streaming = Self::check_streaming_support(metadata);

        let shared_handle = Arc::new(RwLock::new(handle));
        Ok(XybridModel {
            handle: shared_handle,
            model_id,
            version,
            output_type,
            supports_streaming,
        })
    }

    /// Build a `ModelHandle` for the model stored in `model_dir`.
    ///
    /// Reads and parses `model_dir/model_metadata.json`, then creates a
    /// `TemplateExecutor` rooted at `model_dir`.
    ///
    /// # Errors
    ///
    /// Returns `SdkError::LoadError` if the metadata file cannot be read or
    /// parsed, or if `model_dir` is not valid UTF-8 (the executor takes its base
    /// path as a string, so such a path cannot be passed on faithfully).
    fn create_model_handle(model_dir: &PathBuf) -> SdkResult<ModelHandle> {
        // Load metadata
        let metadata_path = model_dir.join("model_metadata.json");
        let metadata_str = std::fs::read_to_string(&metadata_path).map_err(|e| {
            SdkError::LoadError(format!("Failed to read model_metadata.json: {}", e))
        })?;
        let metadata: ModelMetadata = serde_json::from_str(&metadata_str)
            .map_err(|e| SdkError::LoadError(format!("Failed to parse metadata: {}", e)))?;

        // Refuse a non-UTF-8 path rather than substituting ".", which would
        // silently point the executor at the current working directory.
        let base_path = model_dir.to_str().ok_or_else(|| {
            SdkError::LoadError(format!(
                "Model directory path is not valid UTF-8: {}",
                model_dir.display()
            ))
        })?;

        // Create executor with base path
        let executor = TemplateExecutor::with_base_path(base_path);

        Ok(ModelHandle {
            executor,
            metadata,
            model_dir: model_dir.clone(),
            loaded: true,
        })
    }

    /// Whether the metadata's execution template is the GGUF variant.
    fn is_llm_template(metadata: &ModelMetadata) -> bool {
        let template = &metadata.execution_template;
        matches!(template, ExecutionTemplate::Gguf { .. })
    }

    /// Decide from metadata alone whether a model supports streaming.
    ///
    /// Returns `true` for GGUF (LLM) templates, for models whose `task` metadata
    /// entry is `"speech-recognition"` or `"asr"`, for SafeTensors templates with
    /// the `"whisper"` architecture, and for ONNX templates whose preprocessing
    /// contains an `AudioDecode` step. Everything else returns `false`.
    fn check_streaming_support(metadata: &ModelMetadata) -> bool {
        if Self::is_llm_template(metadata) {
            return true;
        }

        // The free-form metadata map may carry a task hint naming an ASR model.
        let task_hint = metadata.metadata.get("task").and_then(|v| v.as_str());
        if matches!(task_hint, Some("speech-recognition") | Some("asr")) {
            return true;
        }

        // Otherwise fall back to what the execution template reveals.
        match &metadata.execution_template {
            ExecutionTemplate::SafeTensors { architecture, .. } => {
                architecture.as_deref() == Some("whisper")
            }
            ExecutionTemplate::Onnx { .. } => {
                // An AudioDecode preprocessing step suggests an ASR model.
                metadata.preprocessing.iter().any(|pre_step| {
                    matches!(
                        pre_step,
                        xybrid_core::execution_template::PreprocessingStep::AudioDecode { .. }
                    )
                })
            }
            _ => false,
        }
    }

    /// Infer the kind of output a model produces from its metadata.
    ///
    /// Checked in order: GGUF templates produce text; a recognized `task`
    /// metadata entry decides next; then the first postprocessing step that is
    /// a CTC/Whisper decode (text) or a TTS audio encode (audio). If nothing
    /// matches, the result is `OutputType::Unknown`.
    fn infer_output_type(metadata: &ModelMetadata) -> OutputType {
        use xybrid_core::execution_template::PostprocessingStep as Post;

        if Self::is_llm_template(metadata) {
            return OutputType::Text;
        }

        // Task hint from the free-form metadata map; unrecognized values fall through.
        let task_hint = metadata.metadata.get("task").and_then(|v| v.as_str());
        match task_hint {
            Some("speech-recognition" | "asr" | "transcription") => return OutputType::Text,
            Some("text-to-speech" | "tts" | "speech-synthesis") => return OutputType::Audio,
            Some("embedding" | "feature-extraction") => return OutputType::Embedding,
            _ => {}
        }

        // The first telling postprocessing step wins.
        for post_step in &metadata.postprocessing {
            match post_step {
                Post::CTCDecode { .. } | Post::WhisperDecode { .. } => return OutputType::Text,
                Post::TTSAudioEncode { .. } => return OutputType::Audio,
                _ => {}
            }
        }

        OutputType::Unknown
    }
}

/// Represents a loaded model ready for inference.
///
/// Created by `ModelLoader::load()`. Provides both batch and streaming inference.
///
/// # Example
///
/// ```no_run
/// # fn _example() -> Result<(), Box<dyn std::error::Error>> {
/// # use xybrid_sdk::{ModelLoader, StreamConfig};
/// # use xybrid_sdk::ir::Envelope;
/// # let loader: ModelLoader = unimplemented!();
/// # let audio_envelope: Envelope = unimplemented!();
/// # let samples: Vec<f32> = vec![];
/// let model = loader.load()?;
///
/// // Batch inference
/// let result = model.run(&audio_envelope, None)?;
/// println!("Transcription: {}", result.unwrap_text());
///
/// // Streaming inference (if supported)
/// if model.supports_streaming() {
///     let stream = model.stream(StreamConfig::with_vad())?;
///     stream.feed(&samples)?;
///     let transcript = stream.flush()?;
/// }
///
/// // Cleanup
/// model.unload()?;
/// # Ok(())
/// # }
/// ```
pub struct XybridModel {
    /// Shared, lock-guarded state of the loaded model (executor, metadata, directory).
    handle: Arc<RwLock<ModelHandle>>,
    /// Model ID copied from the model's metadata at load time.
    model_id: String,
    /// Version copied from the model's metadata at load time.
    version: String,
    /// Output kind inferred from the metadata at load time.
    output_type: OutputType,
    /// Streaming capability inferred from the metadata at load time.
    supports_streaming: bool,
}

impl XybridModel {
    /// Get the model ID.
    pub fn model_id(&self) -> &str {
        // Borrow the stored identifier; no allocation, no lock.
        self.model_id.as_str()
    }

    /// Get the model version.
    pub fn version(&self) -> &str {
        // Borrow the stored version string; no allocation, no lock.
        self.version.as_str()
    }

    /// Check if the model is currently loaded.
    pub fn is_loaded(&self) -> bool {
        // Recover from a poisoned lock the same way the inference paths
        // (`run`, `warmup`, ...) do. Otherwise a single panic while the
        // write guard was held would make this report `false` forever,
        // even though the model can still be run.
        let handle = self.handle.read().unwrap_or_else(|e| e.into_inner());
        handle.loaded
    }

    /// Check if this model supports streaming.
    pub fn supports_streaming(&self) -> bool {
        // Plain read of the flag stored on this struct; the handle lock is not taken.
        self.supports_streaming
    }

    /// Get the expected output type for this model.
    pub fn output_type(&self) -> OutputType {
        // Returned by value straight from the struct field; the handle lock is not taken.
        self.output_type
    }

    /// Check if this is an LLM model (uses GGUF execution template).
    ///
    /// LLM models support multi-turn conversation contexts. Use this to
    /// determine if conversation history should be maintained.
    ///
    /// # Example
    ///
    /// ```no_run
    /// # fn _example() -> Result<(), Box<dyn std::error::Error>> {
    /// # use xybrid_sdk::{ModelLoader, ConversationContext};
    /// # let loader: ModelLoader = unimplemented!();
    /// let model = loader.load()?;
    /// if model.is_llm() {
    ///     // Create conversation context for multi-turn chat
    ///     let mut ctx = ConversationContext::new();
    ///     // ... manage conversation history
    /// }
    /// # Ok(())
    /// # }
    /// ```
    pub fn is_llm(&self) -> bool {
        // Recover from a poisoned lock like the inference paths do. The
        // previous `.ok()` fallback silently answered `false` after any
        // panic under the write guard, which in turn made `warmup()` pick
        // the wrong dummy input for a GGUF model.
        let handle = self.handle.read().unwrap_or_else(|e| e.into_inner());
        matches!(
            handle.metadata.execution_template,
            ExecutionTemplate::Gguf { .. }
        )
    }

    // =========================================================================
    // Voice Discovery (TTS models only)
    // =========================================================================

    /// Get the voice configuration for this model, if available.
    ///
    /// Returns `None` for non-TTS models or TTS models without voice configuration.
    pub fn voice_config(&self) -> Option<VoiceConfig> {
        // Recover from a poisoned lock like the inference paths do, so voice
        // discovery keeps working after a panic under the write guard
        // instead of reporting "no voices" forever.
        let handle = self.handle.read().unwrap_or_else(|e| e.into_inner());
        // Clone so the caller gets an owned config and the read guard is
        // released as soon as this function returns.
        handle.metadata.voices.clone()
    }

    /// Get all available voices for this TTS model.
    ///
    /// Returns `None` for non-TTS models or TTS models without voice configuration.
    ///
    /// # Example
    ///
    /// ```no_run
    /// # use xybrid_sdk::XybridModel;
    /// # let model: XybridModel = unimplemented!();
    /// if let Some(voices) = model.voices() {
    ///     for voice in voices {
    ///         println!("{}: {} ({})", voice.id, voice.name, voice.language.unwrap_or_default());
    ///     }
    /// }
    /// ```
    pub fn voices(&self) -> Option<Vec<VoiceInfo>> {
        // No voice configuration means no catalog to hand out.
        let config = self.voice_config()?;
        Some(config.catalog)
    }

    /// Get the default voice for this TTS model.
    ///
    /// Returns `None` for non-TTS models or if no default is configured.
    pub fn default_voice(&self) -> Option<VoiceInfo> {
        let config = self.voice_config()?;
        let wanted = &config.default;

        // Walk the catalog by value and hand back the first entry whose id
        // matches the configured default.
        for candidate in config.catalog {
            if &candidate.id == wanted {
                return Some(candidate);
            }
        }
        None
    }

    /// Check if this model has voice configuration.
    ///
    /// Returns `true` for TTS models with voice support.
    pub fn has_voices(&self) -> bool {
        // Delegates to `voice_config()`, so the two always agree; note that
        // `voice_config()` clones the configuration even though only its
        // presence is inspected here.
        self.voice_config().is_some()
    }

    /// Get a specific voice by ID.
    ///
    /// Returns `None` if the voice is not found or the model has no voice support.
    ///
    /// # Arguments
    ///
    /// * `voice_id` - The voice identifier (e.g., "af_bella")
    pub fn voice(&self, voice_id: &str) -> Option<VoiceInfo> {
        let config = self.voice_config()?;

        // First catalog entry with a matching id wins.
        for candidate in config.catalog {
            if candidate.id == voice_id {
                return Some(candidate);
            }
        }
        None
    }

    // =========================================================================
    // Warmup Methods (for pre-loading models)
    // =========================================================================

    /// Warm up the model by running a minimal inference.
    ///
    /// This pre-loads the model into memory, ensuring that the first real inference
    /// is fast. For LLM models, this loads the model weights and creates the context.
    ///
    /// Call this at app startup or after `load()` to eliminate cold-start latency.
    ///
    /// # Example
    ///
    /// ```no_run
    /// # fn _example() -> Result<(), Box<dyn std::error::Error>> {
    /// # use xybrid_sdk::ModelLoader;
    /// # use xybrid_sdk::ir::Envelope;
    /// # let loader: ModelLoader = unimplemented!();
    /// # let envelope: Envelope = unimplemented!();
    /// let model = loader.load()?;
    /// model.warmup()?;  // Pre-load model
    ///
    /// // First inference is now fast
    /// let result = model.run(&envelope, None)?;
    /// # Ok(())
    /// # }
    /// ```
    pub fn warmup(&self) -> SdkResult<()> {
        use xybrid_core::ir::EnvelopeKind;

        log::info!(target: "xybrid_sdk", "Warming up model: {}", self.model_id);
        let llm = self.is_llm();

        // Choose the smallest payload that matches what the model consumes.
        let payload = match self.output_type {
            // Streaming text models that are not LLMs are treated as ASR:
            // feed them one second of 16 kHz silence wrapped as a WAV file.
            OutputType::Text if self.supports_streaming && !llm => {
                let silence = vec![0i16; 16000];
                EnvelopeKind::Audio(Self::create_wav_bytes(&silence, 16000))
            }
            // TTS, LLM / plain-text, embedding and unknown models all get a
            // short text prompt.
            OutputType::Audio | OutputType::Text | OutputType::Embedding | OutputType::Unknown => {
                EnvelopeKind::Text("Hi".to_string())
            }
        };

        // Warmup is about model-load + first-token latency, not a full
        // generation: cap LLM decoding at one token so a 2048-token
        // `GenerationConfig::default()` does not turn it into a real
        // inference. `executor::execute_llm` reads this from envelope
        // metadata when no explicit `GenerationConfig` is passed; non-LLM
        // paths ignore it.
        let mut hints = std::collections::HashMap::new();
        hints.insert("max_tokens".to_string(), "1".to_string());
        let warmup_input = Envelope {
            kind: payload,
            metadata: hints,
        };

        // The inference is run inline instead of through `self.run` so the
        // event published below is a `ModelWarmup` rather than a
        // `ModelComplete`. Warmups stay visible to billing / perf debugging
        // but remain distinguishable from real inferences: the event carries
        // the same attribution fields (`stage_name`, `target`, `latency_ms`)
        // under its own event_type, so the platform can badge it and filter
        // it out of cost-attribution rollups by default.
        let started = Instant::now();
        let resource_guard = crate::telemetry::begin_resource_run();
        let _telemetry_ctx = crate::telemetry::TelemetryPipelineContextGuard::install(
            None,
            Some(uuid::Uuid::new_v4()),
        );

        {
            // Hold the write lock only for the execution itself; a poisoned
            // lock is recovered rather than treated as fatal.
            let mut model = match self.handle.write() {
                Ok(guard) => guard,
                Err(poisoned) => poisoned.into_inner(),
            };
            if !model.loaded {
                return Err(SdkError::NotLoaded);
            }
            // Clone so the executor can be borrowed mutably alongside it.
            let metadata = model.metadata.clone();
            if let Err(e) = model.executor.execute(&metadata, &warmup_input, None) {
                return Err(SdkError::InferenceError(format!(
                    "Warmup execution failed: {}",
                    e
                )));
            }
        }

        let latency_ms = started.elapsed().as_millis() as u32;

        let details = serde_json::json!({
            "model_id": self.model_id,
            "version": self.version,
            "output_type": format!("{:?}", self.output_type),
        })
        .to_string();
        let timestamp_ms = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .map_or(0, |since_epoch| since_epoch.as_millis() as u64);

        crate::telemetry::publish_with_resource_summary(
            crate::telemetry::TelemetryEvent {
                event_type: "ModelWarmup".to_string(),
                stage_name: Some(self.model_id.clone()),
                target: Some("local".to_string()),
                latency_ms: Some(latency_ms),
                error: None,
                data: Some(details),
                timestamp_ms,
            },
            resource_guard,
        );

        log::info!(
            target: "xybrid_sdk",
            "Model {} warmed up in {}ms",
            self.model_id,
            latency_ms
        );

        Ok(())
    }

    /// Warm up the model asynchronously.
    ///
    /// This is useful for background pre-loading at app startup without blocking the UI.
    ///
    /// # Example
    ///
    /// ```no_run
    /// # async fn _example() -> Result<(), Box<dyn std::error::Error>> {
    /// # use xybrid_sdk::ModelLoader;
    /// # let loader: ModelLoader = unimplemented!();
    /// let model = loader.load()?;
    ///
    /// // Start warmup in background
    /// let warmup_handle = tokio::spawn(async move {
    ///     model.warmup_async().await
    /// });
    ///
    /// // Do other initialization...
    ///
    /// // Wait for warmup if needed
    /// warmup_handle.await??;
    /// # Ok(())
    /// # }
    /// ```
    pub async fn warmup_async(&self) -> SdkResult<()> {
        let handle = self.handle.clone();
        let model_id = self.model_id.clone();
        let output_type = self.output_type;
        let supports_streaming = self.supports_streaming;
        let is_llm = self.is_llm();

        tokio::task::spawn_blocking(move || {
            use xybrid_core::ir::EnvelopeKind;

            log::info!(target: "xybrid_sdk", "Warming up model (async): {}", model_id);

            // Create a minimal input based on expected input type
            let warmup_input = match output_type {
                OutputType::Audio => Envelope {
                    kind: EnvelopeKind::Text("Hi".to_string()),
                    metadata: std::collections::HashMap::new(),
                },
                OutputType::Text if supports_streaming && !is_llm => {
                    let silence_samples = vec![0i16; 16000];
                    let audio_bytes = Self::create_wav_bytes(&silence_samples, 16000);
                    Envelope {
                        kind: EnvelopeKind::Audio(audio_bytes),
                        metadata: std::collections::HashMap::new(),
                    }
                }
                OutputType::Text | OutputType::Embedding | OutputType::Unknown => Envelope {
                    kind: EnvelopeKind::Text("Hi".to_string()),
                    metadata: std::collections::HashMap::new(),
                },
            };

            // See sync `warmup()` for rationale — cap LLM decoding at
            // 1 token so warmup doesn't run a full generation.
            let mut warmup_input = warmup_input;
            warmup_input
                .metadata
                .insert("max_tokens".to_string(), "1".to_string());

            let start = Instant::now();
            let resource_guard = crate::telemetry::begin_resource_run();
            let trace_id = uuid::Uuid::new_v4();
            let _telemetry_ctx =
                crate::telemetry::TelemetryPipelineContextGuard::install(None, Some(trace_id));

            // Run inference inline and publish a `ModelWarmup` event —
            // same shape as the sync `warmup()` above. Previously this
            // path published nothing at all, so async warmups were
            // silent on the wire (visible only via logs).
            let version_for_event;
            let output_type_str;
            {
                let mut guard = handle.write().unwrap_or_else(|e| e.into_inner());
                if !guard.loaded {
                    return Err(SdkError::NotLoaded);
                }

                let metadata = guard.metadata.clone();
                guard
                    .executor
                    .execute(&metadata, &warmup_input, None)
                    .map_err(|e| SdkError::InferenceError(format!("Warmup failed: {}", e)))?;

                version_for_event = metadata.version.clone();
                output_type_str = format!("{:?}", output_type);
            }

            let latency_ms = start.elapsed().as_millis() as u32;

            let event = crate::telemetry::TelemetryEvent {
                event_type: "ModelWarmup".to_string(),
                stage_name: Some(model_id.clone()),
                target: Some("local".to_string()),
                latency_ms: Some(latency_ms),
                error: None,
                data: Some(
                    serde_json::json!({
                        "model_id": model_id,
                        "version": version_for_event,
                        "output_type": output_type_str,
                    })
                    .to_string(),
                ),
                timestamp_ms: std::time::SystemTime::now()
                    .duration_since(std::time::UNIX_EPOCH)
                    .map(|d| d.as_millis() as u64)
                    .unwrap_or(0),
            };
            crate::telemetry::publish_with_resource_summary(event, resource_guard);

            log::info!(
                target: "xybrid_sdk",
                "Model {} warmed up (async) in {}ms",
                model_id,
                latency_ms
            );

            Ok(())
        })
        .await
        .map_err(|e| SdkError::InferenceError(format!("Task join error: {}", e)))?
    }

    /// Create a minimal WAV file bytes from samples for warmup.
    fn create_wav_bytes(samples: &[i16], sample_rate: u32) -> Vec<u8> {
        // Canonical PCM WAV header: 12 bytes RIFF/WAVE + 24 bytes `fmt `
        // chunk + 8 bytes `data` chunk header.
        const HEADER_LEN: usize = 44;
        // Mono, 16-bit PCM: every frame is exactly one 2-byte sample.
        const BYTES_PER_SAMPLE: usize = 2;

        let payload_len = samples.len() * BYTES_PER_SAMPLE;
        let data_size = payload_len as u32;
        // RIFF chunk size = whole file minus the 8-byte "RIFF"+size prefix.
        let file_size = 36 + data_size;

        // The final size is known up front, so allocate once instead of
        // letting the vector grow while the samples are appended.
        let mut bytes = Vec::with_capacity(HEADER_LEN + payload_len);

        // RIFF header
        bytes.extend_from_slice(b"RIFF");
        bytes.extend_from_slice(&file_size.to_le_bytes());
        bytes.extend_from_slice(b"WAVE");

        // fmt chunk
        bytes.extend_from_slice(b"fmt ");
        bytes.extend_from_slice(&16u32.to_le_bytes()); // Chunk size
        bytes.extend_from_slice(&1u16.to_le_bytes()); // Audio format (PCM)
        bytes.extend_from_slice(&1u16.to_le_bytes()); // Num channels
        bytes.extend_from_slice(&sample_rate.to_le_bytes()); // Sample rate
        bytes.extend_from_slice(&(sample_rate * 2).to_le_bytes()); // Byte rate
        bytes.extend_from_slice(&2u16.to_le_bytes()); // Block align
        bytes.extend_from_slice(&16u16.to_le_bytes()); // Bits per sample

        // data chunk: size followed by little-endian samples
        bytes.extend_from_slice(b"data");
        bytes.extend_from_slice(&data_size.to_le_bytes());
        bytes.extend(samples.iter().flat_map(|sample| sample.to_le_bytes()));

        bytes
    }

    /// Run batch inference with an Envelope.
    ///
    /// # Arguments
    ///
    /// * `envelope` - Input data wrapped in an Envelope
    ///
    /// # Returns
    ///
    /// `InferenceResult` containing the output with convenient accessors.
    pub fn run(
        &self,
        envelope: &Envelope,
        config: Option<&GenerationConfig>,
    ) -> SdkResult<InferenceResult> {
        crate::telemetry::maybe_emit_dev_nudge();
        let started = Instant::now();

        // Open a resource-telemetry scope for this call. With
        // `resource_telemetry_mode()` set to `Off` the guard does nothing;
        // otherwise it records start snapshots (and starts a sampler for
        // Summary / DebugLocal). `publish_with_resource_summary` below
        // consumes it to produce the summary.
        let resource_guard = crate::telemetry::begin_resource_run();

        // Per-call trace_id, installed for the lifetime of this function.
        // Every telemetry event emitted before the guard drops (deferred
        // adapter events included) shares it, so the dashboard collapses
        // them into one Traces row. `run_with_context` follows the same
        // discipline; see `docs/sdk/trace-model.md`.
        let _telemetry_ctx = crate::telemetry::TelemetryPipelineContextGuard::install(
            None,
            Some(uuid::Uuid::new_v4()),
        );

        // A poisoned lock is recovered instead of failing every later call.
        let mut model = match self.handle.write() {
            Ok(guard) => guard,
            Err(poisoned) => poisoned.into_inner(),
        };
        if !model.loaded {
            return Err(SdkError::NotLoaded);
        }

        // Metadata is cloned so the executor can be borrowed mutably next to it.
        let metadata = model.metadata.clone();
        let output = match model.executor.execute(&metadata, envelope, config) {
            Ok(output) => output,
            Err(e) => {
                return Err(SdkError::InferenceError(format!(
                    "Execution failed: {}",
                    e
                )))
            }
        };

        let latency_ms = started.elapsed().as_millis() as u32;

        // Publish the ModelComplete telemetry event.
        let details = serde_json::json!({
            "model_id": self.model_id,
            "version": self.version,
            "output_type": format!("{:?}", self.output_type),
        })
        .to_string();
        let timestamp_ms = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .map_or(0, |since_epoch| since_epoch.as_millis() as u64);

        crate::telemetry::publish_with_resource_summary(
            crate::telemetry::TelemetryEvent {
                event_type: "ModelComplete".to_string(),
                stage_name: Some(self.model_id.clone()),
                target: Some("local".to_string()),
                latency_ms: Some(latency_ms),
                error: None,
                data: Some(details),
                timestamp_ms,
            },
            resource_guard,
        );

        // `_telemetry_ctx` drops when this function returns, i.e. after the
        // publish above — same ordering as `run_with_context`.
        Ok(InferenceResult::new(output, &self.model_id, latency_ms))
    }

    /// Run batch inference with per-run controls.
    pub fn run_with_options(
        &self,
        envelope: &Envelope,
        options: &RunOptions,
    ) -> SdkResult<InferenceResult> {
        // Refuse to start if the abort state already reports an abort.
        let mut abort_state = AbortState::new(options);
        if let Err(reason) = abort_state.check_before_run() {
            return Err(SdkError::InferenceError(format!(
                "Execution aborted: {reason}"
            )));
        }

        let generation = options.generation_config.as_ref();
        self.run(envelope, generation)
    }

    /// Run inference with conversation context.
    ///
    /// This method passes the conversation history to the model, allowing it to
    /// generate context-aware responses. The model uses its chat template to
    /// format the conversation history into a prompt.
    ///
    /// **Important:** This method does not mutate the context. The caller is
    /// responsible for pushing the result to the context if desired.
    ///
    /// # Arguments
    ///
    /// * `envelope` - The current user input (should have `MessageRole::User`)
    /// * `context` - Conversation history (system prompt + previous turns)
    ///
    /// # Returns
    ///
    /// `InferenceResult` containing the assistant's response (tagged with `MessageRole::Assistant`).
    ///
    /// # Example
    ///
    /// ```no_run
    /// # fn _example() -> Result<(), Box<dyn std::error::Error>> {
    /// use xybrid_sdk::{ModelLoader, ConversationContext};
    /// use xybrid_sdk::ir::{Envelope, EnvelopeKind, MessageRole};
    ///
    /// let model = ModelLoader::from_registry("gemma-3-1b").load()?;
    /// let mut ctx = ConversationContext::new();
    ///
    /// // Add user message to context
    /// let user_input = Envelope::new(EnvelopeKind::Text("Hello!".into()))
    ///     .with_role(MessageRole::User);
    /// ctx.push(user_input.clone());
    ///
    /// // Run with context (model sees the full history)
    /// let result = model.run_with_context(&user_input, &ctx, None)?;
    ///
    /// // Add assistant response to context
    /// ctx.push(result.envelope().clone());
    ///
    /// println!("{}", result.text().unwrap_or_default());
    /// # Ok(())
    /// # }
    /// ```
    pub fn run_with_context(
        &self,
        envelope: &Envelope,
        context: &ConversationContext,
        config: Option<&GenerationConfig>,
    ) -> SdkResult<InferenceResult> {
        let started = Instant::now();
        let resource_guard = crate::telemetry::begin_resource_run();

        // Turn-scoped trace_id, installed for the lifetime of this call.
        // The guard's `Drop` clears the pipeline context on every exit —
        // early error returns and panics included — so all telemetry
        // emitted in between shares one `trace_id` and the dashboard
        // collapses the turn into a single Traces row. `Pipeline::run`
        // applies the same discipline to stages.
        let _telemetry_ctx = crate::telemetry::TelemetryPipelineContextGuard::install(
            None,
            Some(uuid::Uuid::new_v4()),
        );

        // A poisoned lock is recovered instead of failing every later call.
        let mut model = match self.handle.write() {
            Ok(guard) => guard,
            Err(poisoned) => poisoned.into_inner(),
        };
        if !model.loaded {
            return Err(SdkError::NotLoaded);
        }

        // Metadata is cloned so the executor can be borrowed mutably next to it.
        let metadata = model.metadata.clone();
        let output = match model
            .executor
            .execute_with_context(&metadata, envelope, context, config)
        {
            Ok(output) => output,
            Err(e) => {
                return Err(SdkError::InferenceError(format!(
                    "Execution failed: {}",
                    e
                )))
            }
        };

        let latency_ms = started.elapsed().as_millis() as u32;

        // Publish the ModelComplete telemetry event, including how many
        // history messages the model was shown.
        let details = serde_json::json!({
            "model_id": self.model_id,
            "version": self.version,
            "output_type": format!("{:?}", self.output_type),
            "context_messages": context.history().len(),
        })
        .to_string();
        let timestamp_ms = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .map_or(0, |since_epoch| since_epoch.as_millis() as u64);

        crate::telemetry::publish_with_resource_summary(
            crate::telemetry::TelemetryEvent {
                event_type: "ModelComplete".to_string(),
                stage_name: Some(self.model_id.clone()),
                target: Some("local".to_string()),
                latency_ms: Some(latency_ms),
                error: None,
                data: Some(details),
                timestamp_ms,
            },
            resource_guard,
        );

        // `_telemetry_ctx` drops on return (and on every early return /
        // unwind above), clearing the pipeline context after the publish.
        Ok(InferenceResult::new(output, &self.model_id, latency_ms))
    }

    /// Run inference with conversation context and per-run controls.
    pub fn run_with_context_options(
        &self,
        envelope: &Envelope,
        context: &ConversationContext,
        options: &RunOptions,
    ) -> SdkResult<InferenceResult> {
        // Refuse to start if the abort state already reports an abort.
        let mut abort_state = AbortState::new(options);
        if let Err(reason) = abort_state.check_before_run() {
            return Err(SdkError::InferenceError(format!(
                "Execution aborted: {reason}"
            )));
        }

        let generation = options.generation_config.as_ref();
        self.run_with_context(envelope, context, generation)
    }

    /// Run streaming inference with conversation context.
    ///
    /// Combines streaming output with multi-turn conversation memory.
    /// The model sees the full conversation history when generating responses.
    ///
    /// # Arguments
    ///
    /// * `envelope` - Current user input wrapped in an Envelope
    /// * `context` - Conversation history for multi-turn chat
    /// * `on_token` - Callback invoked for each token (LLM) or once (other models)
    ///
    /// # Returns
    ///
    /// `InferenceResult` containing the final output.
    ///
    /// # Example
    ///
    /// ```no_run
    /// # fn _example() -> Result<(), Box<dyn std::error::Error>> {
    /// # use xybrid_sdk::{XybridModel, ConversationContext};
    /// # use xybrid_sdk::ir::{Envelope, EnvelopeKind, MessageRole};
    /// # let model: XybridModel = unimplemented!();
    /// let mut ctx = ConversationContext::new();
    ///
    /// // Add user message and run with streaming
    /// let input = Envelope::new(EnvelopeKind::Text("Tell me a joke".into()))
    ///     .with_role(MessageRole::User);
    /// ctx.push(input.clone());
    ///
    /// let result = model.run_streaming_with_context(&input, &ctx, None, |token| {
    ///     print!("{}", token.token);
    ///     std::io::Write::flush(&mut std::io::stdout())?;
    ///     Ok(())
    /// })?;
    ///
    /// // Add assistant response to context
    /// ctx.push(result.envelope().clone());
    /// # Ok(())
    /// # }
    /// ```
    pub fn run_streaming_with_context<F>(
        &self,
        envelope: &Envelope,
        context: &ConversationContext,
        config: Option<&GenerationConfig>,
        mut on_token: F,
    ) -> SdkResult<InferenceResult>
    where
        F: FnMut(
                xybrid_core::runtime_adapter::types::PartialToken,
            ) -> Result<(), Box<dyn std::error::Error + Send + Sync>>
            + Send,
    {
        use xybrid_core::execution::ExecutionTemplate;
        use xybrid_core::runtime_adapter::types::PartialToken;

        let start = Instant::now();

        // RAII pipeline context — see `run_with_context` for rationale.
        // Cleared on drop at end of scope (after publish) or on any
        // early return / unwind below.
        let trace_id = uuid::Uuid::new_v4();
        let _telemetry_ctx =
            crate::telemetry::TelemetryPipelineContextGuard::install(None, Some(trace_id));

        // Take the write lock for the whole call (it is held while the
        // callback runs); a poisoned lock is recovered rather than
        // treated as fatal.
        let mut handle = self.handle.write().unwrap_or_else(|e| e.into_inner());

        if !handle.loaded {
            return Err(SdkError::NotLoaded);
        }

        // Clone metadata: it is inspected below and passed to the executor
        // while the executor itself is borrowed mutably.
        let metadata = handle.metadata.clone();

        // Check if this is an LLM model (GGUF template)
        let is_llm = matches!(metadata.execution_template, ExecutionTemplate::Gguf { .. });

        let output = if is_llm {
            // True streaming with context for LLM models. The callback is
            // passed as a boxed mutable borrow, so `on_token` is not
            // consumed by this branch.
            handle
                .executor
                .execute_streaming_with_context(
                    &metadata,
                    envelope,
                    context,
                    Box::new(&mut on_token),
                    config,
                )
                .map_err(streaming_execution_error)?
        } else {
            // For non-LLM models: run with context and emit single "token" with full result
            let result = handle
                .executor
                .execute_with_context(&metadata, envelope, context, config)
                .map_err(|e| SdkError::InferenceError(format!("Execution failed: {}", e)))?;

            // Only text results are forwarded to the callback, as one final
            // token (index 0, finish_reason "stop"). Non-text results
            // never invoke `on_token`.
            if let xybrid_core::ir::EnvelopeKind::Text(text) = &result.kind {
                let token = PartialToken {
                    token: text.clone(),
                    token_id: None,
                    index: 0,
                    cumulative_text: text.clone(),
                    finish_reason: Some("stop".to_string()),
                };
                on_token(token).map_err(streaming_callback_error)?;
            }

            result
        };

        let latency_ms = start.elapsed().as_millis() as u32;

        // Emit the ModelComplete telemetry event, flagged as streaming.
        // NOTE(review): unlike `run` / `run_with_context`, this path opens no
        // resource-telemetry scope and publishes via `publish_telemetry_event`,
        // so no resource summary is attached — confirm that is intentional.
        let event = crate::telemetry::TelemetryEvent {
            event_type: "ModelComplete".to_string(),
            stage_name: Some(self.model_id.clone()),
            target: Some("local".to_string()),
            latency_ms: Some(latency_ms),
            error: None,
            data: Some(
                serde_json::json!({
                    "model_id": self.model_id,
                    "version": self.version,
                    "output_type": format!("{:?}", self.output_type),
                    "streaming": true,
                    "context_messages": context.history().len(),
                })
                .to_string(),
            ),
            timestamp_ms: std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)
                .map(|d| d.as_millis() as u64)
                .unwrap_or(0),
        };
        crate::telemetry::publish_telemetry_event(event);

        // `_telemetry_ctx` drops when this function returns, clearing the
        // pipeline context after the publish above.

        Ok(InferenceResult::new(output, &self.model_id, latency_ms))
    }

    /// Run streaming inference with conversation context and per-token abort checks.
    pub fn run_streaming_with_context_options<F>(
        &self,
        envelope: &Envelope,
        context: &ConversationContext,
        options: &RunOptions,
        mut on_token: F,
    ) -> SdkResult<InferenceResult>
    where
        F: FnMut(
                xybrid_core::runtime_adapter::types::PartialToken,
            ) -> Result<(), Box<dyn std::error::Error + Send + Sync>>
            + Send,
    {
        let mut abort_state = AbortState::new(options);
        let fallback_to_cloud = options.abort_policy.fallback_to_cloud;

        // Honour cancellation requested before invocation: a `token.cancel()`
        // issued ahead of this call aborts right here instead of paying for
        // the full batch (or model load + prompt fill on streaming models)
        // before the first token boundary gets to check the abort state.
        if let Err(reason) = abort_state.check_before_run() {
            return Err(streaming_pre_run_abort_error(reason, fallback_to_cloud));
        }

        // Wrap the caller's callback so the abort state is consulted before
        // each token is delivered.
        self.run_streaming_with_context(
            envelope,
            context,
            options.generation_config.as_ref(),
            move |token| match abort_state.check_before_token() {
                Err(reason) => Err(reason.into_streaming_error(fallback_to_cloud)),
                Ok(_) => on_token(token),
            },
        )
    }

    /// Run inference with streaming output.
    ///
    /// This method provides a unified streaming interface for all model types:
    /// - **LLM models (GGUF)**: True token-by-token streaming via the callback
    /// - **Other models (TTS, ASR, etc.)**: Single callback with the full result
    ///
    /// This "everything is a stream" pattern allows consumers to use the same
    /// API regardless of model type, while LLMs get the latency benefits of
    /// true streaming.
    ///
    /// # Arguments
    ///
    /// * `envelope` - Input data wrapped in an Envelope
    /// * `on_token` - Callback invoked for each token (LLM) or once (other models)
    ///
    /// # Returns
    ///
    /// `InferenceResult` containing the final output.
    ///
    /// # Example
    ///
    /// ```no_run
    /// # fn _example() -> Result<(), Box<dyn std::error::Error>> {
    /// # use xybrid_sdk::XybridModel;
    /// # use xybrid_sdk::ir::Envelope;
    /// # let model: XybridModel = unimplemented!();
    /// # let envelope: Envelope = unimplemented!();
    /// // Works for both LLM and non-LLM models
    /// let result = model.run_streaming(&envelope, None, |token| {
    ///     print!("{}", token.token);
    ///     std::io::Write::flush(&mut std::io::stdout())?;
    ///     Ok(())
    /// })?;
    /// # Ok(())
    /// # }
    /// ```
    pub fn run_streaming<F>(
        &self,
        envelope: &Envelope,
        config: Option<&GenerationConfig>,
        mut on_token: F,
    ) -> SdkResult<InferenceResult>
    where
        F: FnMut(
                xybrid_core::runtime_adapter::types::PartialToken,
            ) -> Result<(), Box<dyn std::error::Error + Send + Sync>>
            + Send,
    {
        use xybrid_core::execution::ExecutionTemplate;
        use xybrid_core::ir::EnvelopeKind;
        use xybrid_core::runtime_adapter::types::PartialToken;

        let started_at = Instant::now();

        // Each call gets its own trace id (see `run` for the rationale). The
        // guard lives until this function returns, so it is still installed
        // when the telemetry event below is published and is cleared on any
        // early return.
        let call_trace = uuid::Uuid::new_v4();
        let _telemetry_ctx =
            crate::telemetry::TelemetryPipelineContextGuard::install(None, Some(call_trace));

        // Take the write lock, recovering the guard if the lock was poisoned.
        let mut model = self
            .handle
            .write()
            .unwrap_or_else(|poisoned| poisoned.into_inner());
        if !model.loaded {
            return Err(SdkError::NotLoaded);
        }

        // Clone the metadata to avoid a borrow conflict with the executor.
        let metadata = model.metadata.clone();

        let output = match &metadata.execution_template {
            // GGUF (LLM) templates stream token-by-token through the callback.
            ExecutionTemplate::Gguf { .. } => model
                .executor
                .execute_streaming(&metadata, envelope, Box::new(&mut on_token), config)
                .map_err(streaming_execution_error)?,
            // Everything else runs as a batch; a text result is then reported
            // through the callback as one synthetic, final token.
            _ => {
                let batch_output = model
                    .executor
                    .execute(&metadata, envelope, config)
                    .map_err(|e| SdkError::InferenceError(format!("Execution failed: {}", e)))?;

                if let EnvelopeKind::Text(text) = &batch_output.kind {
                    on_token(PartialToken {
                        token: text.clone(),
                        token_id: None,
                        index: 0,
                        cumulative_text: text.clone(),
                        finish_reason: Some("stop".to_string()),
                    })
                    .map_err(streaming_callback_error)?;
                }

                batch_output
            }
        };

        let latency_ms = started_at.elapsed().as_millis() as u32;

        // Assemble and publish the `ModelComplete` telemetry event.
        let details = serde_json::json!({
            "model_id": self.model_id,
            "version": self.version,
            "output_type": format!("{:?}", self.output_type),
            "streaming": true,
        })
        .to_string();
        let timestamp_ms = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .map(|since_epoch| since_epoch.as_millis() as u64)
            .unwrap_or(0);

        crate::telemetry::publish_telemetry_event(crate::telemetry::TelemetryEvent {
            event_type: "ModelComplete".to_string(),
            stage_name: Some(self.model_id.clone()),
            target: Some("local".to_string()),
            latency_ms: Some(latency_ms),
            error: None,
            data: Some(details),
            timestamp_ms,
        });

        Ok(InferenceResult::new(output, &self.model_id, latency_ms))
    }

    /// Run streaming inference with per-token abort checks.
    pub fn run_streaming_with_options<F>(
        &self,
        envelope: &Envelope,
        options: &RunOptions,
        mut on_token: F,
    ) -> SdkResult<InferenceResult>
    where
        F: FnMut(
                xybrid_core::runtime_adapter::types::PartialToken,
            ) -> Result<(), Box<dyn std::error::Error + Send + Sync>>
            + Send,
    {
        // Copy the flags out up front: the per-token closure below is `move`
        // and takes ownership of everything it touches.
        // `check_abort_for_streaming` receives `supports_streaming` so that a
        // non-streaming model (batch executor plus one synthetic token at the
        // end) does not trigger cloud fallback after local inference already
        // succeeded; the helper's docs carry the privacy/cost rationale.
        let streams_tokens = self.supports_streaming;
        let fallback_to_cloud = options.abort_policy.fallback_to_cloud;
        let mut abort_state = AbortState::new(options);

        // Pre-run check: because the per-token helper short-circuits for
        // non-streaming models, this is the only place where a cancellation
        // requested before invocation stops such a model from running its
        // full batch.
        let pre_run = abort_state.check_before_run();
        pre_run.map_err(|reason| streaming_pre_run_abort_error(reason, fallback_to_cloud))?;

        let generation_config = options.generation_config.as_ref();
        self.run_streaming(envelope, generation_config, move |token| {
            // The abort check runs first; only then is the token handed to
            // the caller's callback.
            check_abort_for_streaming(streams_tokens, &mut abort_state, fallback_to_cloud)?;
            on_token(token)
        })
    }

    /// Run streaming inference with automatic cloud fallback on resource-driven
    /// abort.
    ///
    /// On a healthy run this behaves identically to
    /// [`Self::run_streaming_with_options`] — `on_token` fires once per local
    /// token, `on_seam` is never invoked, and the returned [`InferenceResult`]
    /// reflects the local execution.
    ///
    /// When the configured [`AbortPolicy`](crate::run_options::AbortPolicy)
    /// trips mid-stream **and** the policy permits cloud fallback, the wrapper:
    /// 1. Captures the local token count and elapsed latency.
    /// 2. Fires `on_seam(SeamInfo { … })` with the abort reason and a
    ///    shared `correlation_id`. Callers use this to render a UX cue.
    /// 3. Records the local abort outcome and re-checks policy before cloud.
    ///    If policy denies cloud, the wrapper emits `cloud_denied_by_policy`
    ///    telemetry and returns an explicit error without retrying.
    /// 4. Builds a fresh cloud envelope from the **original prompt** (no
    ///    partial-token reuse) and calls
    ///    [`CloudStreaming::execute_streaming`](xybrid_core::runtime_adapter::CloudStreaming::execute_streaming)
    ///    on `cloud_adapter`.
    /// 5. Routes cloud chunks through the same `on_token` so the user sees
    ///    one continuous token stream.
    /// 6. Records the cloud outcome and returns the cloud-leg
    ///    [`InferenceResult`].
    ///
    /// On any other error (no abort policy fired, or `fallback_to_cloud` is
    /// false), the original [`SdkError`] is returned unchanged.
    ///
    /// # Notes
    ///
    /// - The same `on_token` is invoked for both legs; the seam is observable
    ///   via `on_seam`, not via the token stream itself.
    /// - `correlation_id` is taken from `options.correlation_id` if set,
    ///   otherwise generated via `uuid::Uuid::new_v4()`.
    /// - The `envelope` must carry cloud-side routing metadata (`provider`,
    ///   `model`, `system_prompt`, `temperature`, …) for the retry leg. See
    ///   [`CloudRuntimeAdapter`](xybrid_core::runtime_adapter::CloudRuntimeAdapter)
    ///   for supported keys.
    /// - The wrapper is fully synchronous; the default `CloudRuntimeAdapter`
    ///   consumes OpenAI-compatible gateway SSE via `CloudStreaming`.
    /// - **Cancellation timing across the seam.** A cancel set on the
    ///   `cancellation_token` *before* `execute_streaming` is invoked
    ///   (including from inside `on_seam`) short-circuits before the cloud
    ///   adapter is called — the prompt is never sent. A cancel that arrives
    ///   while the cloud leg is waiting on SSE is observed at the next gateway
    ///   chunk: the user-visible token stream is suppressed, but the prompt may
    ///   already have been transmitted and billed. Treat the token as
    ///   "responsive on the seam, best-effort during cloud."
    pub fn run_streaming_with_fallback<F, S>(
        &self,
        envelope: &Envelope,
        options: &RunOptions,
        cloud_adapter: &dyn xybrid_core::runtime_adapter::CloudStreaming,
        on_token: &mut F,
        on_seam: &mut S,
    ) -> SdkResult<InferenceResult>
    where
        F: FnMut(
                xybrid_core::runtime_adapter::types::PartialToken,
            ) -> Result<(), Box<dyn std::error::Error + Send + Sync>>
            + Send,
        S: FnMut(SeamInfo) + Send,
    {
        use std::sync::atomic::{AtomicU32, Ordering};

        // Use the caller's correlation id when given, otherwise mint one.
        let correlation_id = match options.correlation_id.clone() {
            Some(id) => id,
            None => uuid::Uuid::new_v4().to_string(),
        };

        // Number of tokens the local leg delivered to `on_token`.
        let emitted = AtomicU32::new(0);
        let local_started = Instant::now();
        let resource_guard = crate::telemetry::begin_resource_run();

        // Local leg: count each token, then forward it to the caller.
        let local_result = self.run_streaming_with_options(envelope, options, |token| {
            emitted.fetch_add(1, Ordering::SeqCst);
            on_token(token)
        });

        let local_latency_ms = local_started.elapsed().as_millis() as u32;
        let local_resource_summary = resource_guard.finish();
        let policy_metrics = fallback_policy_metrics(options);
        let signal_context = Some(SignalContext::from_metrics(&policy_metrics));

        // Hand the local outcome to the shared dispatcher together with
        // everything it needs for a possible cloud leg.
        dispatch_after_local(
            local_result,
            envelope,
            cloud_adapter,
            correlation_id,
            &self.model_id,
            emitted.load(Ordering::SeqCst),
            local_latency_ms,
            local_resource_summary,
            fallback_authority(),
            policy_metrics,
            signal_context,
            options.cancellation_token.clone(),
            on_token,
            on_seam,
        )
    }

    /// Run inference returning a stream of events.
    ///
    /// This is the idiomatic Rust streaming API that returns a `Stream` instead of
    /// using callbacks. Events are emitted as they occur:
    /// - `StreamEvent::Token` - for each generated token (LLM models)
    /// - `StreamEvent::Complete` - when inference finishes successfully
    /// - `StreamEvent::Error` - if an error occurs
    ///
    /// For non-LLM models, a single `Token` event is emitted with the full result,
    /// followed by `Complete`.
    ///
    /// # Example
    ///
    /// ```no_run
    /// # async fn _example() {
    /// # use xybrid_sdk::{XybridModel, StreamEvent};
    /// # use xybrid_sdk::ir::Envelope;
    /// # let model: XybridModel = unimplemented!();
    /// # let envelope: Envelope = unimplemented!();
    /// use tokio_stream::StreamExt;
    ///
    /// let mut stream = model.run_stream(envelope, None);
    /// while let Some(event) = stream.next().await {
    ///     match event {
    ///         StreamEvent::Token(token) => print!("{}", token.token),
    ///         StreamEvent::Complete(result) => println!("\nDone: {}ms", result.latency_ms()),
    ///         StreamEvent::Error(e) => eprintln!("Error: {}", e),
    ///     }
    /// }
    /// # }
    /// ```
    pub fn run_stream(
        &self,
        envelope: Envelope,
        config: Option<GenerationConfig>,
    ) -> Pin<Box<dyn tokio_stream::Stream<Item = StreamEvent> + Send + '_>> {
        use tokio::sync::mpsc;
        use xybrid_core::runtime_adapter::types::PartialToken;

        let (tx, rx) = mpsc::channel::<StreamEvent>(100);
        let handle = self.handle.clone();
        let model_id = self.model_id.clone();
        let version = self.version.clone();
        let output_type = self.output_type;

        // Clone tx for the completion event (before moving into spawn_blocking)
        let tx_completion = tx.clone();

        // Spawn blocking task to run inference
        tokio::task::spawn(async move {
            let result = tokio::task::spawn_blocking(move || {
                let start = Instant::now();

                // Per-call trace_id scope — see `run` for rationale. Lives
                // inside the spawn_blocking closure so the install + drop
                // happen on the same thread the publish runs on.
                let trace_id = uuid::Uuid::new_v4();
                let _telemetry_ctx =
                    crate::telemetry::TelemetryPipelineContextGuard::install(None, Some(trace_id));

                // Get write lock on handle
                let mut guard = handle.write().unwrap_or_else(|e| e.into_inner());

                if !guard.loaded {
                    return Err(SdkError::NotLoaded);
                }

                let metadata = guard.metadata.clone();
                let is_llm = matches!(
                    metadata.execution_template,
                    xybrid_core::execution::ExecutionTemplate::Gguf { .. }
                );

                // Clone tx for the streaming callback (so we can use tx in the else branch)
                let tx_for_callback = tx.clone();

                let output = if is_llm {
                    // True streaming for LLM models
                    guard
                        .executor
                        .execute_streaming(
                            &metadata,
                            &envelope,
                            Box::new(move |token: PartialToken| {
                                let stream_token = StreamToken {
                                    token: token.token.clone(),
                                    token_id: token.token_id.map(|id| id),
                                    index: token.index,
                                    cumulative_text: token.cumulative_text.clone(),
                                    finish_reason: token.finish_reason.clone(),
                                };
                                // Ignore send errors (receiver dropped)
                                let _ =
                                    tx_for_callback.blocking_send(StreamEvent::Token(stream_token));
                                Ok(())
                            }),
                            config.as_ref(),
                        )
                        .map_err(streaming_execution_error)?
                } else {
                    // Non-LLM: batch execution, emit single token
                    let result = guard
                        .executor
                        .execute(&metadata, &envelope, config.as_ref())
                        .map_err(|e| {
                            SdkError::InferenceError(format!("Execution failed: {}", e))
                        })?;

                    // Emit single token with full result
                    if let xybrid_core::ir::EnvelopeKind::Text(text) = &result.kind {
                        let stream_token = StreamToken {
                            token: text.clone(),
                            token_id: None,
                            index: 0,
                            cumulative_text: text.clone(),
                            finish_reason: Some("stop".to_string()),
                        };
                        let _ = tx.blocking_send(StreamEvent::Token(stream_token));
                    }
                    result
                };

                let latency_ms = start.elapsed().as_millis() as u32;

                // Emit telemetry
                let event = crate::telemetry::TelemetryEvent {
                    event_type: "ModelComplete".to_string(),
                    stage_name: Some(model_id.clone()),
                    target: Some("local".to_string()),
                    latency_ms: Some(latency_ms),
                    error: None,
                    data: Some(
                        serde_json::json!({
                            "model_id": model_id,
                            "version": version,
                            "output_type": format!("{:?}", output_type),
                            "streaming": true,
                        })
                        .to_string(),
                    ),
                    timestamp_ms: std::time::SystemTime::now()
                        .duration_since(std::time::UNIX_EPOCH)
                        .map(|d| d.as_millis() as u64)
                        .unwrap_or(0),
                };
                crate::telemetry::publish_telemetry_event(event);

                Ok(InferenceResult::new(output, &model_id, latency_ms))
            })
            .await;

            // Send completion or error event
            match result {
                Ok(Ok(inference_result)) => {
                    let _ = tx_completion
                        .send(StreamEvent::Complete(inference_result))
                        .await;
                }
                Ok(Err(e)) => {
                    let _ = tx_completion.send(StreamEvent::Error(e.to_string())).await;
                }
                Err(e) => {
                    let _ = tx_completion
                        .send(StreamEvent::Error(format!("Task failed: {}", e)))
                        .await;
                }
            }
        });

        Box::pin(ReceiverStream::new(rx))
    }

    /// Check if this model supports true token streaming.
    ///
    /// Returns `true` for LLM models (GGUF) when LLM features are enabled,
    /// `false` for other model types or when LLM features are disabled.
    /// Note: `run_streaming()` works for all models, but only LLM models
    /// get true token-by-token streaming; others emit a single result.
    pub fn supports_token_streaming(&self) -> bool {
        #[cfg(any(feature = "llm-mistral", feature = "llm-llamacpp"))]
        {
            use xybrid_core::execution::ExecutionTemplate;

            // Recover from a poisoned RwLock, as the other accessors in this
            // impl do. Treating poison as "no streaming" would make this
            // report `false` for a GGUF model after an unrelated panic, even
            // though `run_streaming` still streams token-by-token.
            let handle = self.handle.read().unwrap_or_else(|e| e.into_inner());

            matches!(
                handle.metadata.execution_template,
                ExecutionTemplate::Gguf { .. }
            )
        }
        #[cfg(not(any(feature = "llm-mistral", feature = "llm-llamacpp")))]
        {
            // Without an LLM backend compiled in, no model streams tokens.
            false
        }
    }

    /// Run batch inference asynchronously.
    pub async fn run_async(
        &self,
        envelope: &Envelope,
        config: Option<&GenerationConfig>,
    ) -> SdkResult<InferenceResult> {
        crate::telemetry::maybe_emit_dev_nudge();
        let handle = self.handle.clone();
        let model_id = self.model_id.clone();
        let version = self.version.clone();
        let output_type = self.output_type;
        let envelope = envelope.clone();
        let config = config.cloned();

        tokio::task::spawn_blocking(move || {
            let start = Instant::now();
            let resource_guard = crate::telemetry::begin_resource_run();

            // Per-call trace_id scope — see `run` for rationale. Cleared
            // on drop at end of the spawn_blocking closure (after publish)
            // or on any early return.
            let trace_id = uuid::Uuid::new_v4();
            let _telemetry_ctx =
                crate::telemetry::TelemetryPipelineContextGuard::install(None, Some(trace_id));

            // Recover from poisoned RwLock to prevent permanent lock errors
            let mut guard = handle.write().unwrap_or_else(|e| e.into_inner());

            if !guard.loaded {
                return Err(SdkError::NotLoaded);
            }

            // Clone metadata to avoid borrow conflict with executor
            let metadata = guard.metadata.clone();
            let output = guard
                .executor
                .execute(&metadata, &envelope, config.as_ref())
                .map_err(|e| SdkError::InferenceError(format!("Execution failed: {}", e)))?;

            let latency_ms = start.elapsed().as_millis() as u32;

            // Emit ModelComplete telemetry event
            let event = crate::telemetry::TelemetryEvent {
                event_type: "ModelComplete".to_string(),
                stage_name: Some(model_id.clone()),
                target: Some("local".to_string()),
                latency_ms: Some(latency_ms),
                error: None,
                data: Some(
                    serde_json::json!({
                        "model_id": model_id,
                        "version": version,
                        "output_type": format!("{:?}", output_type),
                    })
                    .to_string(),
                ),
                timestamp_ms: std::time::SystemTime::now()
                    .duration_since(std::time::UNIX_EPOCH)
                    .map(|d| d.as_millis() as u64)
                    .unwrap_or(0),
            };
            crate::telemetry::publish_with_resource_summary(event, resource_guard);

            Ok(InferenceResult::new(output, &model_id, latency_ms))
        })
        .await
        .map_err(|e| SdkError::InferenceError(format!("Task join error: {}", e)))?
    }

    /// Create a streaming session for real-time ASR.
    ///
    /// Returns an error if `!supports_streaming()`.
    ///
    /// # Arguments
    ///
    /// * `config` - Streaming configuration (VAD, language, etc.)
    ///
    /// # Example
    ///
    /// ```no_run
    /// # fn _example() -> Result<(), Box<dyn std::error::Error>> {
    /// # use xybrid_sdk::{XybridModel, StreamConfig};
    /// # let model: XybridModel = unimplemented!();
    /// # let audio_samples: Vec<f32> = vec![];
    /// let stream = model.stream(StreamConfig::with_vad())?;
    ///
    /// // Feed audio chunks
    /// stream.feed(&audio_samples)?;
    ///
    /// // Get partial results
    /// if let Some(partial) = stream.partial_result() {
    ///     println!("Partial: {}", partial.text);
    /// }
    ///
    /// // Get final transcript
    /// let transcript = stream.flush()?;
    /// # Ok(())
    /// # }
    /// ```
    pub fn stream(&self, config: StreamConfig) -> SdkResult<XybridStream> {
        if !self.supports_streaming {
            return Err(SdkError::StreamingNotSupported);
        }

        // Shared access is enough here; a poisoned lock is recovered rather
        // than surfaced as an error.
        let model = match self.handle.read() {
            Ok(guard) => guard,
            Err(poisoned) => poisoned.into_inner(),
        };
        if !model.loaded {
            return Err(SdkError::NotLoaded);
        }

        // Translate the SDK-level options into the core streaming config;
        // anything not set explicitly keeps the core default.
        let vad = CoreVadConfig {
            enabled: config.enable_vad,
            model_dir: config.vad_model_dir,
            threshold: config.vad_threshold,
            ..Default::default()
        };
        let core_config = CoreStreamConfig {
            vad,
            language: config.language,
            ..Default::default()
        };

        XybridStream::new(&model.model_dir, core_config, &self.model_id)
    }

    /// Unload the model from memory.
    ///
    /// The model can be reloaded by creating a new ModelLoader.
    pub fn unload(&self) -> SdkResult<()> {
        // A poisoned lock is recovered so that unloading always succeeds.
        let mut state = match self.handle.write() {
            Ok(guard) => guard,
            Err(poisoned) => poisoned.into_inner(),
        };

        state.loaded = false;
        // Swapping in a default executor drops the previous one, which
        // clears the session cache.
        state.executor = TemplateExecutor::default();

        Ok(())
    }
}

// Make XybridModel cloneable (shares the handle)
impl Clone for XybridModel {
    fn clone(&self) -> Self {
        Self {
            handle: self.handle.clone(),
            model_id: self.model_id.clone(),
            version: self.version.clone(),
            output_type: self.output_type,
            supports_streaming: self.supports_streaming,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// The inherent `SdkError::is_retryable` / `retry_after` accessors and
    /// the `RetryableError` trait impl must agree for every variant — the
    /// trait forwards to the inherent methods, so a divergence would be a
    /// refactor slip. Covers the four retryable variants, three representative
    /// non-retryable ones, and the `RateLimited` retry-after passthrough.
    #[test]
    fn inherent_and_trait_retryability_agree() {
        use std::time::Duration;
        use xybrid_core::http::RetryableError;

        // Each entry pairs an error with the retryability both accessors
        // are expected to report for it.
        let expectations = vec![
            (SdkError::NetworkError("x".into()), true),
            (
                SdkError::RateLimited {
                    retry_after_secs: 5,
                },
                true,
            ),
            (SdkError::Timeout { timeout_ms: 100 }, true),
            (SdkError::Offline("x".into()), true),
            (SdkError::CircuitOpen("x".into()), false),
            (SdkError::NotLoaded, false),
            (SdkError::ConfigError("x".into()), false),
        ];
        for (err, expected) in expectations.iter() {
            assert_eq!(err.is_retryable(), *expected, "inherent for {err:?}");
            assert_eq!(
                RetryableError::is_retryable(err),
                *expected,
                "trait for {err:?}"
            );
        }

        // `RateLimited` reports its backoff through both accessors, while
        // `NotLoaded` reports none.
        let rate_limited = SdkError::RateLimited {
            retry_after_secs: 7,
        };
        assert_eq!(rate_limited.retry_after(), Some(Duration::from_secs(7)));
        assert_eq!(
            rate_limited.retry_after(),
            RetryableError::retry_after(&rate_limited)
        );
        assert_eq!(SdkError::NotLoaded.retry_after(), None);
    }

    #[test]
    fn streaming_execution_error_preserves_typed_cloud_fallback_abort() {
        use xybrid_core::abort::AbortReason as CoreAbortReason;
        use xybrid_core::runtime_adapter::AdapterError;

        // An adapter-level fallback abort must map to the matching SDK
        // variant with its reason intact.
        let adapter_error = AdapterError::AbortedForCloudFallback {
            reason: CoreAbortReason::StressMemory,
        };

        let reason = match streaming_execution_error(adapter_error) {
            SdkError::AbortedForCloudFallback { reason } => reason,
            other => panic!("expected AbortedForCloudFallback, got {other:?}"),
        };
        assert_eq!(reason, CoreAbortReason::StressMemory);
    }

    #[test]
    fn streaming_callback_error_preserves_typed_cloud_fallback_abort() {
        use xybrid_core::abort::{AbortReason as CoreAbortReason, CloudFallbackAbort};

        // A boxed `CloudFallbackAbort` coming out of the callback must be
        // recognised and mapped to the typed SDK variant.
        let callback_failure = Box::new(CloudFallbackAbort::new(CoreAbortReason::StressThermal));

        let reason = match streaming_callback_error(callback_failure) {
            SdkError::AbortedForCloudFallback { reason } => reason,
            other => panic!("expected AbortedForCloudFallback, got {other:?}"),
        };
        assert_eq!(reason, CoreAbortReason::StressThermal);
    }

    #[test]
    fn streaming_callback_error_keeps_non_fallback_abort_generic() {
        // A plain user cancellation is not a cloud-fallback abort, so it must
        // surface as a generic inference error that mentions the cause.
        let callback_failure = Box::new(crate::run_options::AbortReason::UserCancelled);

        let message = match streaming_callback_error(callback_failure) {
            SdkError::InferenceError(message) => message,
            other => panic!("expected inference error, got {other:?}"),
        };
        assert!(message.contains("Streaming callback failed"));
        assert!(message.contains("user_cancelled"));
    }

    #[test]
    fn test_model_loader_from_registry() {
        let requested_id = "kokoro-82m";
        let registry_loader = ModelLoader::from_registry(requested_id);

        assert_eq!(registry_loader.model_id(), Some("kokoro-82m"));
        // No version is known up front; the registry resolves it.
        assert_eq!(registry_loader.version(), None);
        assert_eq!(registry_loader.source_type(), "registry");
    }

    #[test]
    fn test_model_loader_from_registry_with_platform() {
        // An explicit platform must leave the reported id and source
        // type as they are for a plain registry loader.
        let (requested_id, platform) = ("whisper-tiny", "macos-arm64");
        let platform_loader = ModelLoader::from_registry_with_platform(requested_id, platform);

        assert_eq!(platform_loader.model_id(), Some("whisper-tiny"));
        assert_eq!(platform_loader.source_type(), "registry");
    }

    #[test]
    fn gguf_models_are_streaming_text_models() {
        // Start from ONNX-style metadata and swap in a GGUF execution template.
        let gguf_template = ExecutionTemplate::Gguf {
            model_file: "model.gguf".to_string(),
            chat_template: None,
            context_length: 2048,
            generation_params: None,
        };
        let mut metadata = ModelMetadata::onnx("qwen2.5-0.5b-instruct", "1.0", "model.gguf");
        metadata.execution_template = gguf_template;

        let streams = ModelLoader::check_streaming_support(&metadata);
        assert!(
            streams,
            "GGUF LLMs stream tokens and must run abort checks between chunks"
        );

        let inferred = ModelLoader::infer_output_type(&metadata);
        assert_eq!(inferred, OutputType::Text);
    }

    #[test]
    #[allow(deprecated)]
    fn test_model_loader_from_legacy_registry() {
        // The legacy constructor takes an explicit version, so the loader
        // reports it, unlike the plain registry loader.
        let registry_url = "http://localhost:8080";
        let legacy_loader = ModelLoader::from_legacy_registry(registry_url, "whisper", "1.0");

        assert_eq!(legacy_loader.model_id(), Some("whisper"));
        assert_eq!(legacy_loader.version(), Some("1.0"));
        assert_eq!(legacy_loader.source_type(), "legacy_registry");
    }

    #[test]
    fn test_stream_config_defaults() {
        // Defaults under test: VAD disabled, language English.
        let defaults = StreamConfig::default();

        assert!(!defaults.enable_vad);
        assert_eq!(defaults.language.as_deref(), Some("en"));
    }

    #[test]
    fn test_stream_config_with_vad() {
        // Builder calls layered on top of the VAD preset must all be
        // reflected in the resulting config.
        let tuned = StreamConfig::with_vad().language("fr").vad_threshold(0.7);

        assert!(tuned.enable_vad);
        assert_eq!(tuned.language.as_deref(), Some("fr"));
        assert_eq!(tuned.vad_threshold, 0.7);
    }

    // ========================================================================
    // run_streaming_with_fallback / dispatch_after_local
    //
    // We test the testable inner helper `dispatch_after_local` directly so the
    // fallback decision logic is covered without standing up a real model. The
    // public wrapper is a thin shim over `run_streaming_with_options` plus this
    // helper, so its behavior is derivable from these tests + the existing
    // streaming-error tests above.
    // ========================================================================

    use std::sync::atomic::{AtomicBool, AtomicUsize};
    use std::sync::Mutex;

    /// Records calls and emits a fixed response as one or more synthetic tokens.
    /// Used as a stand-in for `CloudRuntimeAdapter` in unit tests.
    struct FakeCloudAdapter {
        /// Text returned on every call, both as the single emitted token and
        /// as the final envelope.
        response_text: String,
        /// Every input envelope received so far, in call order.
        calls: Mutex<Vec<xybrid_core::ir::Envelope>>,
    }

    impl FakeCloudAdapter {
        fn new(response: &str) -> Self {
            Self {
                response_text: response.to_string(),
                calls: Mutex::new(Vec::new()),
            }
        }

        fn call_count(&self) -> usize {
            self.calls.lock().unwrap().len()
        }

        fn calls(&self) -> Vec<xybrid_core::ir::Envelope> {
            self.calls.lock().unwrap().clone()
        }
    }

    impl xybrid_core::runtime_adapter::CloudStreaming for FakeCloudAdapter {
        /// Records the input, emits the canned response as one final token,
        /// and returns the same text as the result envelope. A callback error
        /// is reported as `AdapterError::InferenceFailed`.
        fn execute_streaming(
            &self,
            input: &xybrid_core::ir::Envelope,
            mut on_token: xybrid_core::runtime_adapter::types::StreamingCallback<'_>,
        ) -> xybrid_core::runtime_adapter::AdapterResult<xybrid_core::ir::Envelope> {
            use xybrid_core::ir::{Envelope, EnvelopeKind};
            use xybrid_core::runtime_adapter::types::PartialToken;
            use xybrid_core::runtime_adapter::AdapterError;

            self.calls.lock().unwrap().push(input.clone());

            let reply = &self.response_text;
            let only_token = PartialToken {
                token: reply.clone(),
                token_id: None,
                index: 0,
                cumulative_text: reply.clone(),
                finish_reason: Some("stop".to_string()),
            };
            if let Err(e) = on_token(only_token) {
                return Err(AdapterError::InferenceFailed(e.to_string()));
            }

            Ok(Envelope::new(EnvelopeKind::Text(reply.clone())))
        }
    }

    /// Cloud adapter stand-in whose `execute_streaming` always returns an
    /// error, while counting how often it was invoked.
    struct FailingCloudAdapter {
        /// Number of `execute_streaming` invocations so far.
        calls: AtomicUsize,
    }

    impl FailingCloudAdapter {
        /// Creates an adapter whose call counter starts at zero.
        fn new() -> Self {
            let calls = AtomicUsize::new(0);
            Self { calls }
        }

        /// Current value of the call counter.
        fn call_count(&self) -> usize {
            use std::sync::atomic::Ordering;

            self.calls.load(Ordering::SeqCst)
        }
    }

    impl xybrid_core::runtime_adapter::CloudStreaming for FailingCloudAdapter {
        /// Bumps the call counter and then fails with
        /// `AdapterError::RuntimeError("gateway unavailable")`; the input and the
        /// token callback are never used.
        fn execute_streaming(
            &self,
            _input: &xybrid_core::ir::Envelope,
            _on_token: xybrid_core::runtime_adapter::types::StreamingCallback<'_>,
        ) -> xybrid_core::runtime_adapter::AdapterResult<xybrid_core::ir::Envelope> {
            use std::sync::atomic::Ordering;

            // Count the attempt before failing so tests can assert that it happened.
            self.calls.fetch_add(1, Ordering::SeqCst);
            let failure = xybrid_core::runtime_adapter::AdapterError::RuntimeError(String::from(
                "gateway unavailable",
            ));
            Err(failure)
        }
    }

    /// Resource snapshot provider for unit tests that always reports the one
    /// snapshot it was constructed with.
    #[derive(Debug)]
    struct FixedUnitResourceProvider {
        /// The snapshot returned by every `current_snapshot` call.
        snapshot: xybrid_core::device::ResourceSnapshot,
    }

    impl FixedUnitResourceProvider {
        fn new(snapshot: xybrid_core::device::ResourceSnapshot) -> Self {
            Self { snapshot }
        }
    }

    impl xybrid_core::device::ResourceSnapshotProvider for FixedUnitResourceProvider {
        /// Returns a copy of the stored snapshot; the requested `_max_age` is ignored.
        fn current_snapshot(
            &self,
            _max_age: std::time::Duration,
        ) -> xybrid_core::device::ResourceSnapshot {
            let Self { snapshot } = self;
            *snapshot
        }
    }

    /// Builds an envelope whose kind is `EnvelopeKind::Text` holding a copy of `text`.
    fn text_envelope(text: &str) -> xybrid_core::ir::Envelope {
        let kind = xybrid_core::ir::EnvelopeKind::Text(String::from(text));
        xybrid_core::ir::Envelope::new(kind)
    }

    /// Builds an already-loaded `XybridModel` named `local-test-model` (version `1.0`)
    /// with text output, ONNX metadata and a default executor.
    ///
    /// `supports_streaming` is copied verbatim into the model's streaming flag.
    fn test_loaded_model(supports_streaming: bool) -> XybridModel {
        const MODEL_ID: &str = "local-test-model";
        const VERSION: &str = "1.0";

        let metadata =
            xybrid_core::execution::ModelMetadata::onnx(MODEL_ID, VERSION, "model.onnx");
        let handle = ModelHandle {
            executor: TemplateExecutor::default(),
            metadata,
            model_dir: PathBuf::from("."),
            loaded: true,
        };

        XybridModel {
            handle: Arc::new(RwLock::new(handle)),
            model_id: MODEL_ID.to_string(),
            version: VERSION.to_string(),
            output_type: OutputType::Text,
            supports_streaming,
        }
    }

    /// Returns `DeviceMetrics::default()`, the baseline device metrics the tests in
    /// this module pass to `dispatch_after_local`.
    fn default_metrics() -> xybrid_core::context::DeviceMetrics {
        xybrid_core::context::DeviceMetrics::default()
    }

    /// Derives a `SignalContext` from the baseline metrics of `default_metrics()`.
    fn default_signal() -> xybrid_core::orchestrator::authority::SignalContext {
        let metrics = default_metrics();
        xybrid_core::orchestrator::authority::SignalContext::from_metrics(&metrics)
    }

    /// Test double for `OrchestrationAuthority` that answers policy checks with a
    /// fixed verdict and logs the policy requests and execution outcomes it is handed.
    struct FakeAuthority {
        /// When `true`, `apply_policy` answers `PolicyOutcome::Allow`; otherwise `Deny`.
        allow_policy: bool,
        /// Reason attached to the `Deny` outcome (empty when built via `allow()`).
        deny_reason: String,
        /// Clone of every request passed to `apply_policy`, in call order.
        policy_requests: Mutex<Vec<xybrid_core::orchestrator::authority::PolicyRequest>>,
        /// Clone of every outcome passed to `record_outcome`, in call order.
        outcomes: Mutex<Vec<xybrid_core::orchestrator::authority::ExecutionOutcome>>,
    }

    impl FakeAuthority {
        /// Authority that allows every policy request; both logs start empty.
        fn allow() -> Self {
            let policy_requests = Mutex::new(Vec::new());
            let outcomes = Mutex::new(Vec::new());
            Self {
                allow_policy: true,
                deny_reason: String::new(),
                policy_requests,
                outcomes,
            }
        }

        /// Authority that denies every policy request with `reason`; both logs start empty.
        fn deny(reason: &str) -> Self {
            // Same empty logs as `allow()`, with only the verdict fields overridden.
            Self {
                allow_policy: false,
                deny_reason: reason.to_string(),
                ..Self::allow()
            }
        }

        /// Cloned copy of the policy requests seen so far, in call order.
        fn policy_requests(&self) -> Vec<xybrid_core::orchestrator::authority::PolicyRequest> {
            let seen = self.policy_requests.lock().unwrap();
            seen.clone()
        }

        /// Cloned copy of the execution outcomes recorded so far, in call order.
        fn outcomes(&self) -> Vec<xybrid_core::orchestrator::authority::ExecutionOutcome> {
            let recorded = self.outcomes.lock().unwrap();
            recorded.clone()
        }
    }

    impl xybrid_core::orchestrator::authority::OrchestrationAuthority for FakeAuthority {
        /// Logs a clone of `request`, then answers with the preconfigured verdict:
        /// `Allow` ("policy allowed") or `Deny` carrying the configured reason.
        fn apply_policy(
            &self,
            request: &xybrid_core::orchestrator::authority::PolicyRequest,
        ) -> xybrid_core::orchestrator::authority::AuthorityDecision<
            xybrid_core::orchestrator::authority::PolicyOutcome,
        > {
            self.policy_requests.lock().unwrap().push(request.clone());

            if !self.allow_policy {
                // The deny reason is used both as the outcome payload and as the
                // decision's explanation.
                let reason = self.deny_reason.clone();
                return xybrid_core::orchestrator::authority::AuthorityDecision::local(
                    xybrid_core::orchestrator::authority::PolicyOutcome::Deny {
                        reason: reason.clone(),
                    },
                    reason,
                );
            }

            xybrid_core::orchestrator::authority::AuthorityDecision::local(
                xybrid_core::orchestrator::authority::PolicyOutcome::Allow,
                "policy allowed",
            )
        }

        /// Always resolves to `ResolvedTarget::Device`; the stage context is ignored.
        fn resolve_target(
            &self,
            _context: &xybrid_core::orchestrator::authority::StageContext,
        ) -> xybrid_core::orchestrator::authority::AuthorityDecision<
            xybrid_core::orchestrator::authority::ResolvedTarget,
        > {
            let target = xybrid_core::orchestrator::authority::ResolvedTarget::Device;
            xybrid_core::orchestrator::authority::AuthorityDecision::local(target, "test")
        }

        /// Echoes the requested model id back as a local selection with no variant
        /// and the placeholder path `"test"`.
        fn select_model(
            &self,
            request: &xybrid_core::orchestrator::authority::ModelRequest,
        ) -> xybrid_core::orchestrator::authority::AuthorityDecision<
            xybrid_core::orchestrator::authority::ModelSelection,
        > {
            let selection = xybrid_core::orchestrator::authority::ModelSelection {
                model_id: request.model_id.clone(),
                variant: None,
                source: xybrid_core::orchestrator::authority::ModelSource::Local {
                    path: "test".to_string(),
                },
            };
            xybrid_core::orchestrator::authority::AuthorityDecision::local(selection, "test")
        }

        /// Appends a clone of `outcome` to the outcome log.
        fn record_outcome(&self, outcome: &xybrid_core::orchestrator::authority::ExecutionOutcome) {
            let mut recorded = self.outcomes.lock().unwrap();
            recorded.push(outcome.clone());
        }

        /// Fixed authority name used by this test double.
        fn name(&self) -> &str {
            "fake"
        }
    }

    #[test]
    fn dispatch_after_local_retries_on_typed_abort_and_emits_seam() {
        // A typed `AbortedForCloudFallback` local error must fire the seam callback exactly
        // once, re-run the request on the cloud adapter, and forward the cloud token.
        use std::sync::atomic::Ordering;

        let authority = FakeAuthority::allow();
        let cloud = FakeCloudAdapter::new("hello from cloud");
        let prompt = text_envelope("write me a haiku");

        // Token sink: keeps the text of every token handed to the caller.
        let streamed: Arc<Mutex<Vec<String>>> = Arc::new(Mutex::new(Vec::new()));
        let streamed_sink = Arc::clone(&streamed);
        let mut on_token = move |t: xybrid_core::runtime_adapter::types::PartialToken| {
            streamed_sink.lock().unwrap().push(t.token);
            Ok::<_, Box<dyn std::error::Error + Send + Sync>>(())
        };

        // Seam probe: counts invocations, keeps the last reason, and checks that the
        // correlation id, local token count and local latency arrive unchanged.
        let seams = Arc::new(AtomicUsize::new(0));
        let last_reason: Arc<Mutex<Option<xybrid_core::abort::AbortReason>>> =
            Arc::new(Mutex::new(None));
        let seams_probe = Arc::clone(&seams);
        let reason_probe = Arc::clone(&last_reason);
        let mut on_seam = move |s: SeamInfo| {
            seams_probe.fetch_add(1, Ordering::SeqCst);
            *reason_probe.lock().unwrap() = Some(s.reason);
            assert_eq!(s.correlation_id, "corr-1");
            assert_eq!(s.local_tokens, 3);
            assert_eq!(s.local_latency_ms, 100);
        };

        let local_abort: SdkResult<InferenceResult> = Err(SdkError::AbortedForCloudFallback {
            reason: xybrid_core::abort::AbortReason::StressMemory,
        });

        let retried = dispatch_after_local(
            local_abort,
            &prompt,
            &cloud,
            "corr-1".to_string(),
            "test-model",
            3,   // surfaces as `SeamInfo::local_tokens`
            100, // surfaces as `SeamInfo::local_latency_ms`
            None,
            &authority,
            default_metrics(),
            Some(default_signal()),
            None,
            &mut on_token,
            &mut on_seam,
        )
        .expect("retry should succeed");

        // Exactly one seam, carrying the abort reason of the local error.
        assert_eq!(seams.load(Ordering::SeqCst), 1);
        assert_eq!(
            *last_reason.lock().unwrap(),
            Some(xybrid_core::abort::AbortReason::StressMemory)
        );

        // The cloud adapter ran once and its reply became the result.
        assert_eq!(cloud.call_count(), 1);
        assert_eq!(retried.text(), Some("hello from cloud"));
        assert_eq!(retried.model_id(), "test-model");

        let seen_tokens = streamed.lock().unwrap().clone();
        assert_eq!(seen_tokens.len(), 1);
        assert_eq!(seen_tokens[0], "hello from cloud");
    }

    #[test]
    fn run_streaming_with_fallback_retries_cloud_on_pre_run_resource_pressure() {
        // Scenario: the resource provider reports critical memory pressure and the abort
        // policy stops on that signal with cloud fallback enabled and zero grace tokens.
        // Expected: one seam reporting zero local tokens, then the cloud adapter's reply.
        use std::sync::atomic::Ordering;

        let mut snapshot = xybrid_core::device::ResourceSnapshot::unknown();
        snapshot.memory_pressure = xybrid_core::device::MemoryPressure::Critical;
        let provider = Arc::new(FixedUnitResourceProvider::new(snapshot));

        let abort_policy = crate::run_options::AbortPolicy::default()
            .stop_on(crate::run_options::AbortSignal::MemoryPressureCritical)
            .with_cloud_fallback(true)
            .with_max_grace_tokens(0);
        let options = RunOptions::new()
            .with_abort_policy(abort_policy)
            .with_resource_provider(provider)
            .with_correlation_id("corr-pre-run");

        let model = test_loaded_model(true);
        let cloud = FakeCloudAdapter::new("hello from cloud");
        let prompt = text_envelope("write me a haiku");

        // Token sink: keeps the text of every token handed to the caller.
        let streamed: Arc<Mutex<Vec<String>>> = Arc::new(Mutex::new(Vec::new()));
        let streamed_sink = Arc::clone(&streamed);
        let mut on_token = move |t: xybrid_core::runtime_adapter::types::PartialToken| {
            streamed_sink.lock().unwrap().push(t.token);
            Ok::<_, Box<dyn std::error::Error + Send + Sync>>(())
        };

        // Seam probe: counts invocations and validates the seam payload.
        let seams = Arc::new(AtomicUsize::new(0));
        let seams_probe = Arc::clone(&seams);
        let mut on_seam = move |s: SeamInfo| {
            seams_probe.fetch_add(1, Ordering::SeqCst);
            assert_eq!(s.reason, xybrid_core::abort::AbortReason::StressMemory);
            assert_eq!(s.correlation_id, "corr-pre-run");
            assert_eq!(s.local_tokens, 0);
        };

        let retried = model
            .run_streaming_with_fallback(&prompt, &options, &cloud, &mut on_token, &mut on_seam)
            .expect("pre-run resource pressure should retry on cloud");

        assert_eq!(seams.load(Ordering::SeqCst), 1);
        assert_eq!(cloud.call_count(), 1);
        assert_eq!(retried.text(), Some("hello from cloud"));
        assert_eq!(streamed.lock().unwrap().as_slice(), ["hello from cloud"]);
    }

    #[test]
    fn run_streaming_with_fallback_retries_cloud_on_pre_run_thermal_pressure() {
        // Scenario: the resource provider reports a critical thermal state and the abort
        // policy stops on that signal with cloud fallback enabled and zero grace tokens.
        // Expected: one seam reporting zero local tokens, then the cloud adapter's reply.
        use std::sync::atomic::Ordering;

        let mut snapshot = xybrid_core::device::ResourceSnapshot::unknown();
        snapshot.thermal_state = xybrid_core::device::ThermalState::Critical;
        let provider = Arc::new(FixedUnitResourceProvider::new(snapshot));

        let abort_policy = crate::run_options::AbortPolicy::default()
            .stop_on(crate::run_options::AbortSignal::ThermalCritical)
            .with_cloud_fallback(true)
            .with_max_grace_tokens(0);
        let options = RunOptions::new()
            .with_abort_policy(abort_policy)
            .with_resource_provider(provider)
            .with_correlation_id("corr-thermal-pre-run");

        let model = test_loaded_model(true);
        let cloud = FakeCloudAdapter::new("hello from cloud");
        let prompt = text_envelope("write me a haiku");

        // Token sink: keeps the text of every token handed to the caller.
        let streamed: Arc<Mutex<Vec<String>>> = Arc::new(Mutex::new(Vec::new()));
        let streamed_sink = Arc::clone(&streamed);
        let mut on_token = move |t: xybrid_core::runtime_adapter::types::PartialToken| {
            streamed_sink.lock().unwrap().push(t.token);
            Ok::<_, Box<dyn std::error::Error + Send + Sync>>(())
        };

        // Seam probe: counts invocations and validates the seam payload.
        let seams = Arc::new(AtomicUsize::new(0));
        let seams_probe = Arc::clone(&seams);
        let mut on_seam = move |s: SeamInfo| {
            seams_probe.fetch_add(1, Ordering::SeqCst);
            assert_eq!(s.reason, xybrid_core::abort::AbortReason::StressThermal);
            assert_eq!(s.correlation_id, "corr-thermal-pre-run");
            assert_eq!(s.local_tokens, 0);
        };

        let retried = model
            .run_streaming_with_fallback(&prompt, &options, &cloud, &mut on_token, &mut on_seam)
            .expect("pre-run thermal pressure should retry on cloud");

        assert_eq!(seams.load(Ordering::SeqCst), 1);
        assert_eq!(cloud.call_count(), 1);
        assert_eq!(retried.text(), Some("hello from cloud"));
        assert_eq!(streamed.lock().unwrap().as_slice(), ["hello from cloud"]);
    }

    #[test]
    fn dispatch_after_local_records_local_abort_and_cloud_success_outcomes() {
        // A successful cloud retry must leave two outcomes on the authority: the local
        // abort (device target) followed by the cloud success (cloud target), the latter
        // labelled with the envelope's `model` metadata.
        let authority = FakeAuthority::allow();
        let cloud = FakeCloudAdapter::new("hello from cloud");

        let mut prompt = text_envelope("write a haiku");
        prompt
            .metadata
            .insert("model".to_string(), "deepseek-chat".to_string());

        // Neither callback needs to observe anything in this test.
        let mut on_token = |_: xybrid_core::runtime_adapter::types::PartialToken| {
            Ok::<(), Box<dyn std::error::Error + Send + Sync>>(())
        };
        let mut on_seam = |_s: SeamInfo| {};

        let local_abort: SdkResult<InferenceResult> = Err(SdkError::AbortedForCloudFallback {
            reason: xybrid_core::abort::AbortReason::StressMemory,
        });

        dispatch_after_local(
            local_abort,
            &prompt,
            &cloud,
            "corr-record".to_string(),
            "local-model",
            7,
            321,
            None,
            &authority,
            default_metrics(),
            Some(default_signal()),
            None,
            &mut on_token,
            &mut on_seam,
        )
        .expect("cloud retry should succeed");

        let recorded = authority.outcomes();
        assert_eq!(recorded.len(), 2);

        // First outcome: the local leg, aborted for cloud fallback on the device.
        assert!(matches!(
            recorded[0].category,
            Some(
                xybrid_core::orchestrator::authority::OutcomeCategory::AbortedForCloudFallback {
                    reason: xybrid_core::abort::AbortReason::StressMemory
                }
            )
        ));
        assert!(matches!(
            recorded[0].target,
            xybrid_core::orchestrator::authority::ResolvedTarget::Device
        ));

        // Second outcome: the cloud leg, successful and attributed to the cloud model.
        assert!(matches!(
            recorded[1].category,
            Some(xybrid_core::orchestrator::authority::OutcomeCategory::Success)
        ));
        assert!(matches!(
            recorded[1].target,
            xybrid_core::orchestrator::authority::ResolvedTarget::Cloud { .. }
        ));
        assert_eq!(recorded[1].model_id.as_deref(), Some("deepseek-chat"));
    }

    #[test]
    fn dispatch_after_local_records_cloud_retry_failure() {
        // When the cloud retry itself fails, the adapter error must surface as an
        // `InferenceError` and the authority must see the local abort followed by a
        // `HardFail` attributed to the envelope's `model` metadata.
        let authority = FakeAuthority::allow();
        let cloud = FailingCloudAdapter::new();

        let mut prompt = text_envelope("write a haiku");
        for (key, value) in [("provider", "openai"), ("model", "gpt-4o-mini")] {
            prompt.metadata.insert(key.to_string(), value.to_string());
        }

        // Neither callback needs to observe anything in this test.
        let mut on_token = |_: xybrid_core::runtime_adapter::types::PartialToken| {
            Ok::<(), Box<dyn std::error::Error + Send + Sync>>(())
        };
        let mut on_seam = |_s: SeamInfo| {};

        let local_abort: SdkResult<InferenceResult> = Err(SdkError::AbortedForCloudFallback {
            reason: xybrid_core::abort::AbortReason::StressMemory,
        });

        let retried = dispatch_after_local(
            local_abort,
            &prompt,
            &cloud,
            "corr-cloud-fail".to_string(),
            "local-model",
            2,
            120,
            None,
            &authority,
            default_metrics(),
            Some(default_signal()),
            None,
            &mut on_token,
            &mut on_seam,
        );

        match retried {
            Err(SdkError::InferenceError(message)) => {
                assert!(message.contains("gateway unavailable"), "{message}");
            }
            other => panic!("expected cloud retry failure, got {other:?}"),
        }
        assert_eq!(cloud.call_count(), 1);

        let recorded = authority.outcomes();
        assert_eq!(recorded.len(), 2);
        assert!(matches!(
            recorded[0].category,
            Some(
                xybrid_core::orchestrator::authority::OutcomeCategory::AbortedForCloudFallback { .. }
            )
        ));
        assert!(matches!(
            recorded[1].category,
            Some(xybrid_core::orchestrator::authority::OutcomeCategory::HardFail { .. })
        ));
        assert_eq!(recorded[1].model_id.as_deref(), Some("gpt-4o-mini"));
    }

    #[test]
    fn dispatch_after_local_rechecks_policy_and_retries_with_original_envelope() {
        // The cloud retry must consult the authority's policy once, using the cloud
        // model name as the stage id, and both the policy check and the cloud adapter
        // must receive the caller's original envelope.
        let authority = FakeAuthority::allow();
        let cloud = FakeCloudAdapter::new("cloud continuation");

        let mut prompt = text_envelope("original prompt, not partial local output");
        for (key, value) in [
            ("provider", "openai"),
            ("model", "gpt-4o-mini"),
            ("temperature", "0.2"),
        ] {
            prompt.metadata.insert(key.to_string(), value.to_string());
        }

        // Token sink: keeps the text of every token handed to the caller.
        let streamed: Arc<Mutex<Vec<String>>> = Arc::new(Mutex::new(Vec::new()));
        let streamed_sink = Arc::clone(&streamed);
        let mut on_token = move |t: xybrid_core::runtime_adapter::types::PartialToken| {
            streamed_sink.lock().unwrap().push(t.token);
            Ok::<_, Box<dyn std::error::Error + Send + Sync>>(())
        };
        let mut on_seam = |_s: SeamInfo| {};

        let local_abort: SdkResult<InferenceResult> = Err(SdkError::AbortedForCloudFallback {
            reason: xybrid_core::abort::AbortReason::StressMemory,
        });

        dispatch_after_local(
            local_abort,
            &prompt,
            &cloud,
            "corr-original-envelope".to_string(),
            "local-model",
            4,
            123,
            None,
            &authority,
            default_metrics(),
            Some(default_signal()),
            None,
            &mut on_token,
            &mut on_seam,
        )
        .expect("cloud retry should succeed");

        // Policy was re-checked exactly once, against the untouched envelope.
        let checked = authority.policy_requests();
        assert_eq!(checked.len(), 1);
        assert_eq!(checked[0].stage_id, "gpt-4o-mini");
        assert_eq!(checked[0].envelope, prompt);

        // The cloud adapter received that same envelope and its reply was streamed.
        let sent_to_cloud = cloud.calls();
        assert_eq!(sent_to_cloud.len(), 1);
        assert_eq!(sent_to_cloud[0], prompt);
        assert_eq!(streamed.lock().unwrap().as_slice(), ["cloud continuation"]);
    }

    #[test]
    fn dispatch_after_local_stops_before_cloud_when_cancelled_after_seam() {
        // The seam callback cancels the token that is also handed to the dispatch call,
        // so cancellation only becomes observable once the seam has fired. The dispatch
        // must then fail with a `user_cancelled` error without calling the cloud
        // adapter, leaving a single recorded outcome on the authority.
        let cloud = FakeCloudAdapter::new("must not run");
        let authority = FakeAuthority::allow();
        let envelope = text_envelope("prompt");
        let cancellation = CancellationToken::new();
        let mut on_token = |_: xybrid_core::runtime_adapter::types::PartialToken| {
            Ok::<(), Box<dyn std::error::Error + Send + Sync>>(())
        };
        let seam_count = Arc::new(AtomicUsize::new(0));
        let seam_count_for_cb = seam_count.clone();
        let cancellation_for_seam = cancellation.clone();
        let mut on_seam = move |_s: SeamInfo| {
            seam_count_for_cb.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
            cancellation_for_seam.cancel();
        };
        let local_result: SdkResult<InferenceResult> = Err(SdkError::AbortedForCloudFallback {
            reason: xybrid_core::abort::AbortReason::StressMemory,
        });

        let result = dispatch_after_local(
            local_result,
            &envelope,
            &cloud,
            "corr-cancel-after-seam".to_string(),
            "local-model",
            0,
            0,
            None,
            &authority,
            default_metrics(),
            Some(default_signal()),
            Some(cancellation),
            &mut on_token,
            &mut on_seam,
        );

        match result {
            Err(SdkError::InferenceError(message)) => {
                // Print the actual message on failure, as the sibling tests do.
                assert!(message.contains("user_cancelled"), "{message}");
            }
            other => panic!("expected user_cancelled error, got {other:?}"),
        }
        assert_eq!(seam_count.load(std::sync::atomic::Ordering::SeqCst), 1);
        assert_eq!(cloud.call_count(), 0);
        assert_eq!(authority.outcomes().len(), 1);
    }

    #[test]
    fn dispatch_after_local_stops_when_cloud_policy_is_denied() {
        // With an authority that denies the policy check, the cloud adapter must never
        // run: the dispatch fails with `cloud_denied_by_policy` and the second recorded
        // outcome is a `HardFail` carrying that same reason.
        let cloud = FakeCloudAdapter::new("should not run");
        let authority = FakeAuthority::deny("Policy rule 'rtt_rule' matched");
        let mut envelope = text_envelope("write a haiku");
        envelope
            .metadata
            .insert("model".to_string(), "deepseek-chat".to_string());
        let mut on_token = |_: xybrid_core::runtime_adapter::types::PartialToken| {
            Ok::<(), Box<dyn std::error::Error + Send + Sync>>(())
        };
        let mut on_seam = |_s: SeamInfo| {};
        let local_result: SdkResult<InferenceResult> = Err(SdkError::AbortedForCloudFallback {
            reason: xybrid_core::abort::AbortReason::StressMemory,
        });

        let result = dispatch_after_local(
            local_result,
            &envelope,
            &cloud,
            "corr-denied".to_string(),
            "local-model",
            7,
            321,
            None,
            &authority,
            default_metrics(),
            Some(default_signal()),
            None,
            &mut on_token,
            &mut on_seam,
        );

        match result {
            Err(SdkError::InferenceError(message)) => {
                // Print the actual message on failure, as the sibling tests do.
                assert!(message.contains("cloud_denied_by_policy"), "{message}");
            }
            other => panic!("expected cloud_denied_by_policy error, got {other:?}"),
        }
        assert_eq!(cloud.call_count(), 0);

        let outcomes = authority.outcomes();
        assert_eq!(outcomes.len(), 2);
        assert!(matches!(
            outcomes[1].category,
            Some(xybrid_core::orchestrator::authority::OutcomeCategory::HardFail { ref reason })
                if reason == "cloud_denied_by_policy"
        ));
    }

    #[test]
    fn dispatch_after_local_uses_cloud_model_name_from_envelope() {
        // Regression guard: the cloud-leg `CloudRetry` event and the returned
        // `InferenceResult` used to carry the LOCAL model_id (e.g.
        // `qwen2.5-0.5b-instruct`) even though the tokens came from the cloud-side
        // model (e.g. `deepseek-chat`), so the dashboard credited cloud output to
        // the local model.
        let authority = FakeAuthority::allow();
        let cloud = FakeCloudAdapter::new("hello from deepseek");

        let mut prompt = text_envelope("write a haiku");
        for (key, value) in [("provider", "deepseek"), ("model", "deepseek-chat")] {
            prompt.metadata.insert(key.to_string(), value.to_string());
        }

        // Neither callback needs to observe anything in this test.
        let mut on_token = |_: xybrid_core::runtime_adapter::types::PartialToken| {
            Ok::<(), Box<dyn std::error::Error + Send + Sync>>(())
        };
        let mut on_seam = |_s: SeamInfo| {};

        let local_abort: SdkResult<InferenceResult> = Err(SdkError::AbortedForCloudFallback {
            reason: xybrid_core::abort::AbortReason::StressMemory,
        });

        let retried = dispatch_after_local(
            local_abort,
            &prompt,
            &cloud,
            "corr-cloud-model".to_string(),
            "qwen2.5-0.5b-instruct",
            5,
            220,
            None,
            &authority,
            default_metrics(),
            Some(default_signal()),
            None,
            &mut on_token,
            &mut on_seam,
        )
        .expect("cloud retry should succeed");

        // The result is labelled with the envelope's cloud model, not the local id.
        assert_eq!(retried.model_id(), "deepseek-chat");
    }

    #[test]
    fn dispatch_after_local_falls_back_to_local_model_when_envelope_missing_model() {
        // An envelope without `model` metadata would normally be rejected by the
        // gateway. Should the retry succeed anyway, the result's model_id must not
        // end up empty: it falls back to the local id so the trace still has a label.
        let authority = FakeAuthority::allow();
        let cloud = FakeCloudAdapter::new("ok");
        let prompt = text_envelope("p");

        // Neither callback needs to observe anything in this test.
        let mut on_token = |_: xybrid_core::runtime_adapter::types::PartialToken| {
            Ok::<(), Box<dyn std::error::Error + Send + Sync>>(())
        };
        let mut on_seam = |_s: SeamInfo| {};

        let local_abort: SdkResult<InferenceResult> = Err(SdkError::AbortedForCloudFallback {
            reason: xybrid_core::abort::AbortReason::StressMemory,
        });

        let retried = dispatch_after_local(
            local_abort,
            &prompt,
            &cloud,
            "corr-fallback".to_string(),
            "local-only-model",
            0,
            0,
            None,
            &authority,
            default_metrics(),
            Some(default_signal()),
            None,
            &mut on_token,
            &mut on_seam,
        )
        .expect("cloud retry should succeed");

        assert_eq!(retried.model_id(), "local-only-model");
    }

    #[test]
    fn dispatch_after_local_passes_through_ok_result() {
        // A successful local result must be returned untouched: no seam callback,
        // no cloud call, and the local text and latency preserved.
        use std::sync::atomic::Ordering;

        let authority = FakeAuthority::allow();
        let cloud = FakeCloudAdapter::new("unused");
        let prompt = text_envelope("prompt");

        let mut on_token = |_: xybrid_core::runtime_adapter::types::PartialToken| {
            Ok::<(), Box<dyn std::error::Error + Send + Sync>>(())
        };

        // Seam probe: flips to `true` if the seam callback is ever invoked.
        let seam_seen = Arc::new(AtomicBool::new(false));
        let seam_probe = Arc::clone(&seam_seen);
        let mut on_seam = move |_s: SeamInfo| {
            seam_probe.store(true, Ordering::SeqCst);
        };

        let local_ok: SdkResult<InferenceResult> = Ok(InferenceResult::new(
            text_envelope("local result"),
            "test-model",
            50,
        ));

        let passed_through = dispatch_after_local(
            local_ok,
            &prompt,
            &cloud,
            "corr-2".to_string(),
            "test-model",
            10,
            200,
            None,
            &authority,
            default_metrics(),
            Some(default_signal()),
            None,
            &mut on_token,
            &mut on_seam,
        )
        .expect("ok should pass through");

        assert!(!seam_seen.load(Ordering::SeqCst));
        assert_eq!(cloud.call_count(), 0);
        assert_eq!(passed_through.text(), Some("local result"));
        assert_eq!(passed_through.latency_ms(), 50);
    }

    #[test]
    fn dispatch_after_local_passes_through_other_errors() {
        // A local error that is not `AbortedForCloudFallback` must be returned as-is:
        // the seam callback stays silent and the cloud adapter is never called.
        let cloud = FakeCloudAdapter::new("unused");
        let authority = FakeAuthority::allow();
        let envelope = text_envelope("prompt");
        let mut on_token = |_: xybrid_core::runtime_adapter::types::PartialToken| {
            Ok::<(), Box<dyn std::error::Error + Send + Sync>>(())
        };
        let seam_fired = Arc::new(AtomicBool::new(false));
        let seam_fired_for_cb = seam_fired.clone();
        let mut on_seam = move |_s: SeamInfo| {
            seam_fired_for_cb.store(true, std::sync::atomic::Ordering::SeqCst);
        };

        let local_result: SdkResult<InferenceResult> =
            Err(SdkError::InferenceError("local failed".to_string()));

        let result = dispatch_after_local(
            local_result,
            &envelope,
            &cloud,
            "corr-3".to_string(),
            "test-model",
            0,
            0,
            None,
            &authority,
            default_metrics(),
            Some(default_signal()),
            None,
            &mut on_token,
            &mut on_seam,
        );

        match result {
            Err(SdkError::InferenceError(msg)) => {
                // Print the actual message on failure, as the sibling tests do.
                assert!(msg.contains("local failed"), "{msg}");
            }
            other => panic!("expected InferenceError, got {other:?}"),
        }
        assert!(!seam_fired.load(std::sync::atomic::Ordering::SeqCst));
        assert_eq!(cloud.call_count(), 0);
    }

    #[test]
    fn dispatch_after_local_never_retries_user_cancelled_with_fallback_enabled() {
        // A user cancellation is terminal: even when the streaming error is built with
        // the flag passed to `into_streaming_error` set to `true`, the dispatch must
        // return the `user_cancelled` error without firing the seam or calling the cloud.
        let cloud = FakeCloudAdapter::new("must not run");
        let authority = FakeAuthority::allow();
        let envelope = text_envelope("prompt");
        let mut on_token = |_: xybrid_core::runtime_adapter::types::PartialToken| {
            Ok::<(), Box<dyn std::error::Error + Send + Sync>>(())
        };
        let seam_fired = Arc::new(AtomicBool::new(false));
        let seam_fired_for_cb = seam_fired.clone();
        let mut on_seam = move |_s: SeamInfo| {
            seam_fired_for_cb.store(true, std::sync::atomic::Ordering::SeqCst);
        };

        let cancellation_error =
            crate::run_options::AbortReason::UserCancelled.into_streaming_error(true);
        let local_result: SdkResult<InferenceResult> =
            Err(streaming_callback_error(cancellation_error));

        let result = dispatch_after_local(
            local_result,
            &envelope,
            &cloud,
            "corr-cancel".to_string(),
            "test-model",
            0,
            0,
            None,
            &authority,
            default_metrics(),
            Some(default_signal()),
            None,
            &mut on_token,
            &mut on_seam,
        );

        match result {
            Err(SdkError::InferenceError(message)) => {
                // Print the actual message on failure, as the sibling tests do.
                assert!(message.contains("user_cancelled"), "{message}");
            }
            other => panic!("expected terminal user_cancelled error, got {other:?}"),
        }
        assert!(!seam_fired.load(std::sync::atomic::Ordering::SeqCst));
        assert_eq!(cloud.call_count(), 0);
    }
}