xybrid-core 0.1.0-rc4

//! Runtime Adapter module - Interface for model runtime backends.
//!
//! The RuntimeAdapter trait provides a unified interface for executing inference
//! across different model runtime backends (ONNX, CoreML, Candle, etc.).
//!
//! Runtime adapters are responsible for:
//! - Loading model files from .xyb bundles
//! - Executing inference on input envelopes
//! - Returning output envelopes
//! - Managing runtime-specific resources (session pools, memory, etc.)
//!
//! # Module Organization
//!
//! Each runtime backend is organized in its own subdirectory; shared LLM types live in a single file:
//! - `onnx/` - ONNX Runtime backend (cross-platform)
//! - `coreml/` - CoreML backend (iOS/macOS)
//! - `candle/` - Candle backend (pure Rust, feature-gated)
//! - `cloud/` - Cloud LLM backend (OpenAI, Anthropic, etc.)
//! - `mistral/` - MistralBackend (mistral.rs, desktop only)
//! - `llama_cpp/` - LlamaCppBackend (llama.cpp, Android + fallback)
//! - `llm.rs` - Shared LLM types (LlmBackend trait, configs)
//!
//! # Example
//!
//! ```rust,ignore
//! use xybrid_core::runtime_adapter::{RuntimeAdapter, OnnxRuntimeAdapter};
//! use xybrid_core::ir::{Envelope, EnvelopeKind};
//!
//! // Create adapter
//! let mut adapter = OnnxRuntimeAdapter::new();
//!
//! // Load model from bundle
//! adapter.load_model("/path/to/model.onnx")?;
//!
//! // Run inference
//! let input = Envelope::new(EnvelopeKind::Text("hello world".to_string()));
//! let output = adapter.execute(&input)?;
//! ```

use crate::ir::Envelope;
use std::collections::HashMap;
use thiserror::Error;

// Shared utilities (stay at root level)
pub mod inference_backend;
pub(crate) mod metadata_driven;
pub(crate) mod tensor_utils;
pub mod traits;

// Always-available types for FFI/bindings (NOT feature-gated)
pub mod types;

// Runtime backends (organized in subdirectories)
pub mod onnx;

// Cloud LLM backend (OpenAI, Anthropic, etc.) - always available
pub mod cloud;

#[cfg(any(target_os = "macos", target_os = "ios", test))]
pub mod coreml;

// Candle backend (feature-gated, pure Rust ML framework)
#[cfg(feature = "candle")]
pub mod candle;

// LLM shared types and adapter (available when any LLM backend is enabled)
#[cfg(any(feature = "llm-mistral", feature = "llm-llamacpp"))]
pub mod llm;

// Shared telemetry helpers for LLM backends (itl_stats, etc.)
#[cfg(any(feature = "llm-mistral", feature = "llm-llamacpp"))]
pub(crate) mod llm_telemetry;

// Shared streaming post-processing for LLM backends
// (stop-pattern filtering, <think> tag stripping, safe-prefix buffer).
// Used by backends whose engines don't strip these internally — currently
// llama_cpp. Mistral's engine (mistralrs) handles it natively and does
// not import this module.
#[cfg(feature = "llm-llamacpp")]
pub(crate) mod streaming_postprocess;

// MistralBackend (feature-gated, uses mistral.rs - desktop only, NOT Android)
// Requires +fp16 on ARM which causes SIGILL on devices without ARMv8.2-A FP16
#[cfg(feature = "llm-mistral")]
pub mod mistral;

// LlamaCppBackend (feature-gated, uses llama.cpp - Android compatible)
// Has proper runtime SIMD detection via ggml
#[cfg(feature = "llm-llamacpp")]
pub mod llama_cpp;

// Re-exports from runtime backends
pub use cloud::{CloudRuntimeAdapter, CloudStreaming};
pub use metadata_driven::MetadataDrivenAdapter;
pub use onnx::OnnxBackend;
pub use onnx::OnnxRuntimeAdapter;
pub use onnx::{ExecutionProviderKind, ONNXSession, SessionOptions};

#[cfg(any(target_os = "android", test))]
pub use onnx::ONNXMobileRuntimeAdapter;

#[cfg(any(target_os = "macos", target_os = "ios", test))]
pub use coreml::CoreMLRuntimeAdapter;

#[cfg(feature = "candle")]
pub use candle::{CandleBackend, CandleRuntimeAdapter};

// LLM exports - adapter types only (ChatMessage, GenerationConfig, LlmConfig
// are re-exported from types.rs unconditionally below)
#[cfg(any(feature = "llm-mistral", feature = "llm-llamacpp"))]
pub use llm::{GenerationOutput, LlmBackend, LlmResult, LlmRuntimeAdapter};

// MistralBackend export (desktop only)
#[cfg(feature = "llm-mistral")]
pub use mistral::MistralBackend;

// LlamaCppBackend export (Android compatible)
#[cfg(feature = "llm-llamacpp")]
pub use llama_cpp::LlamaCppBackend;

// llama.cpp log control exports
#[cfg(feature = "llm-llamacpp")]
pub use llama_cpp::{llama_log_get_verbosity, llama_log_set_verbosity};

// Re-export inference backend types
pub use inference_backend::{BackendError, BackendResult, InferenceBackend, RuntimeType};
pub use traits::ModelRuntime;

// Always-available streaming and chat types (NOT feature-gated)
pub use types::{
    ChatMessage, GenerationConfig, LlmConfig, PartialToken, StreamingCallback, StreamingError,
};

/// Error type for runtime adapter operations.
///
/// This is the error half of [`AdapterResult`]. The `Display` text of each
/// variant is the `#[error(...)]` string attached to it, with the payload
/// interpolated where the format string references it.
#[derive(Error, Debug)]
pub enum AdapterError {
    /// A model could not be found. The string is interpolated into the
    /// message (presumably a path or model id — not established in this file).
    #[error("Model not found: {0}")]
    ModelNotFound(String),
    /// An operation required a model that has not been loaded.
    #[error("Model not loaded: {0}")]
    ModelNotLoaded(String),
    /// The supplied input was rejected as invalid.
    #[error("Invalid input: {0}")]
    InvalidInput(String),
    /// Inference itself failed; the string describes the failure.
    #[error("Inference failed: {0}")]
    InferenceFailed(String),
    /// An underlying I/O error.
    ///
    /// `#[from]` generates `From<std::io::Error>`, so `?` on an
    /// `std::io::Result` converts into this variant automatically.
    #[error("IO error: {0}")]
    IOError(#[from] std::io::Error),
    /// A serialization or deserialization step failed.
    #[error("Serialization error: {0}")]
    SerializationError(String),
    /// A general runtime failure.
    ///
    /// Also produced by [`AdapterError::from_streaming_callback_error`] for
    /// callback errors that are not recognized as cloud-fallback aborts.
    #[error("Runtime error: {0}")]
    RuntimeError(String),
    /// Execution was aborted in favor of a cloud fallback; `reason` carries
    /// the [`crate::abort::AbortReason`] and is included in the message.
    #[error("Aborted for cloud fallback: {reason}")]
    AbortedForCloudFallback { reason: crate::abort::AbortReason },
}

impl AdapterError {
    pub fn from_streaming_callback_error(error: StreamingError) -> Self {
        if let Some(reason) = crate::abort::cloud_fallback_reason_from_error(error.as_ref()) {
            return Self::AbortedForCloudFallback { reason };
        }
        Self::RuntimeError(format!("Streaming callback error: {}", error))
    }

    pub fn cloud_fallback_abort_reason(&self) -> Option<crate::abort::AbortReason> {
        match self {
            Self::AbortedForCloudFallback { reason } => Some(*reason),
            _ => None,
        }
    }
}

/// Result type for runtime adapter operations.
///
/// Shorthand for `Result<T, AdapterError>`; it is the return type of the
/// fallible methods on [`RuntimeAdapter`] and [`RuntimeAdapterExt`].
pub type AdapterResult<T> = Result<T, AdapterError>;

/// Metadata about a loaded model.
///
/// A plain data record with public fields, deriving `Debug` and `Clone`.
/// [`RuntimeAdapterExt::get_metadata`] returns a reference to one of these.
#[derive(Debug, Clone)]
pub struct ModelMetadata {
    /// Model identifier
    pub model_id: String,
    /// Model version
    pub version: String,
    /// Runtime type (e.g., "onnx", "coreml", "candle")
    pub runtime_type: String,
    /// Model file path or location
    pub model_path: String,
    /// Input shapes/names (runtime-specific).
    ///
    /// Presumably maps each input name to its dimension sizes — TODO confirm
    /// against the backends that populate this field.
    pub input_schema: HashMap<String, Vec<u64>>,
    /// Output shapes/names (runtime-specific).
    ///
    /// Same layout as `input_schema`; presumably output name to dimension
    /// sizes — TODO confirm.
    pub output_schema: HashMap<String, Vec<u64>>,
}

/// Trait for model runtime adapters.
///
/// Runtime adapters abstract over different inference backends,
/// allowing the orchestrator to execute models without knowing
/// the underlying runtime implementation.
///
/// Adapters must be thread-safe (Send + Sync) to support concurrent
/// execution in the orchestrator. Note that [`RuntimeAdapter::execute`] takes
/// `&self`, while [`RuntimeAdapter::load_model`] and
/// [`RuntimeAdapter::warmup`] require `&mut self`.
pub trait RuntimeAdapter: Send + Sync {
    /// Returns the name of this adapter (e.g., "onnx", "coreml", "candle").
    fn name(&self) -> &str;

    /// Returns a list of file formats supported by this adapter.
    ///
    /// Examples: ["onnx", "onnx.gz"], ["mlpackage"], ["safetensors"]
    fn supported_formats(&self) -> Vec<&'static str>;

    /// Loads a model from the specified path.
    ///
    /// # Arguments
    ///
    /// * `path` - Path to the model file (ONNX, CoreML, SafeTensors, etc.)
    ///
    /// # Returns
    ///
    /// Unit on success, or an error if loading fails
    ///
    /// # Errors
    ///
    /// Returns an [`AdapterError`] when loading fails; which variant is used
    /// is decided by the implementing backend.
    fn load_model(&mut self, path: &str) -> AdapterResult<()>;

    /// Executes inference on the currently loaded model.
    ///
    /// # Arguments
    ///
    /// * `input` - Input envelope containing the inference data
    ///
    /// # Returns
    ///
    /// Output envelope with inference results
    ///
    /// # Errors
    ///
    /// Returns an [`AdapterError`] on failure. Presumably this includes the
    /// case where no model has been loaded yet — the exact variants are
    /// implementation-defined and not fixed by this trait.
    fn execute(&self, input: &Envelope) -> AdapterResult<Envelope>;

    /// Performs warm-up operations for GPU backends.
    ///
    /// GPU runtimes (CUDA, CoreML/Metal, TensorRT) often have "cold start"
    /// latency on first inference due to:
    /// - Shader compilation (Metal, CUDA)
    /// - Memory allocation and pinning
    /// - Execution graph optimization
    ///
    /// Calling `warmup()` after `load_model()` triggers these operations
    /// ahead of time, ensuring the first real inference is fast.
    ///
    /// # Default Implementation
    ///
    /// A no-op that returns `Ok(())`. It applies to every adapter that does
    /// not override this method, which is appropriate for CPU backends that
    /// don't need warm-up.
    ///
    /// # Example
    ///
    /// ```rust,ignore
    /// let mut adapter = OnnxRuntimeAdapter::new();
    /// adapter.load_model("model.onnx")?;
    /// adapter.warmup()?;  // Optional: trigger GPU initialization
    /// let output = adapter.execute(&input)?;  // First inference is now fast
    /// ```
    fn warmup(&mut self) -> AdapterResult<()> {
        Ok(())
    }
}

/// Extension trait for runtime adapters that support multiple models.
///
/// This provides additional functionality for adapters that need to manage
/// multiple loaded models simultaneously.
///
/// The trait declares no supertrait bound: it does not itself require
/// [`RuntimeAdapter`], `Send`, or `Sync`. Models are addressed by a string
/// `model_id` throughout.
pub trait RuntimeAdapterExt {
    /// Checks if a model is loaded.
    ///
    /// # Arguments
    ///
    /// * `model_id` - Model identifier to check
    ///
    /// # Returns
    ///
    /// True if the model is loaded, false otherwise
    fn is_loaded(&self, model_id: &str) -> bool;

    /// Gets metadata for a loaded model.
    ///
    /// # Arguments
    ///
    /// * `model_id` - Model identifier
    ///
    /// # Returns
    ///
    /// ModelMetadata if the model is loaded. The metadata is returned by
    /// reference, borrowed from the adapter.
    ///
    /// # Errors
    ///
    /// Returns an [`AdapterError`] otherwise — presumably when `model_id` is
    /// not loaded; the exact variant is implementation-defined.
    fn get_metadata(&self, model_id: &str) -> AdapterResult<&ModelMetadata>;

    /// Runs inference on the specified model.
    ///
    /// # Arguments
    ///
    /// * `model_id` - Model identifier
    /// * `input` - Input envelope containing the inference data
    ///
    /// # Returns
    ///
    /// Output envelope with inference results
    ///
    /// # Errors
    ///
    /// Returns an [`AdapterError`] on failure; the variants used are
    /// implementation-defined.
    fn infer(&self, model_id: &str, input: &Envelope) -> AdapterResult<Envelope>;

    /// Unloads a model, freeing its resources.
    ///
    /// # Arguments
    ///
    /// * `model_id` - Model identifier to unload
    ///
    /// # Errors
    ///
    /// Returns an [`AdapterError`] if unloading fails. Whether unloading an
    /// unknown `model_id` is an error is not specified by this trait.
    fn unload_model(&mut self, model_id: &str) -> AdapterResult<()>;

    /// Lists all currently loaded models.
    ///
    /// # Returns
    ///
    /// Vector of model identifiers, as owned strings. No ordering is
    /// specified by this trait.
    fn list_loaded_models(&self) -> Vec<String>;
}