oxillama-runtime 0.1.0

//! Error types for the inference runtime.

use thiserror::Error;

/// Result type alias for runtime operations.
pub type RuntimeResult<T> = Result<T, RuntimeError>;

/// Errors that can occur during inference.
#[derive(Error, Debug)]
pub enum RuntimeError {
    /// No model has been loaded yet.
    #[error("no model loaded")]
    ModelNotLoaded,

    /// Tokenizer is not available because neither `tokenizer-wasm` nor `tokenizer-onig`
    /// feature is enabled.
    ///
    /// Enable the `tokenizer-wasm` feature (default, pure Rust) to use
    /// the HuggingFace tokenizers library.
    #[error("tokenizer not available: rebuild with the `tokenizer-wasm` feature enabled")]
    TokenizerNotAvailable,

    /// Tokenizer initialization or encoding/decoding failed.
    #[error("tokenizer error: {message}")]
    TokenizerError {
        /// Description of the tokenizer error.
        message: String,
    },

    /// Sampling operation failed.
    #[error("sampling error: {message}")]
    SamplingError {
        /// Description of the sampling error.
        message: String,
    },

    /// KV cache has reached its maximum capacity.
    #[error("KV cache full: maximum context length {max_ctx} reached")]
    KvCacheFull {
        /// Maximum context length supported.
        max_ctx: usize,
    },

    /// Model file could not be loaded.
    #[error("model loading error: {message}")]
    ModelLoadError {
        /// Description of the loading error.
        message: String,
    },

    /// Generation was interrupted or cancelled.
    #[error("generation cancelled")]
    Cancelled,

    /// Error propagated from architecture layer.
    #[error("architecture error: {0}")]
    Arch(#[from] oxillama_arch::ArchError),

    /// Error propagated from GGUF parser.
    #[error("GGUF error: {0}")]
    Gguf(#[from] oxillama_gguf::GgufError),

    /// Error propagated from quantization kernel.
    #[error("quantization error: {0}")]
    Quant(#[from] oxillama_quant::QuantError),

    /// I/O error during model loading.
    #[error("I/O error: {0}")]
    Io(#[from] std::io::Error),

    /// Grammar error from GBNF constrained sampling.
    #[error("grammar error: {0}")]
    Grammar(#[from] crate::sampling::grammar::GrammarError),

    /// Attention computation error.
    #[error("attention error: {message}")]
    AttentionError {
        /// Description of the attention error.
        message: String,
    },
}