// llama-gguf 0.14.0
//
// A high-performance Rust implementation of llama.cpp — an LLM inference
// engine with full GGUF support. See the crate documentation for details.
//! Hardware backends for tensor operations
//!
//! This module defines the `Backend` trait which provides an abstraction
//! over different hardware implementations (CPU, CUDA, Vulkan, Metal, etc.)

pub mod cpu;
#[cfg(feature = "cuda")]
pub mod cuda;
pub mod dx12;
mod error;
pub mod tensor_parallel;
pub mod metal;
#[cfg(feature = "vulkan")]
pub mod vulkan;
#[cfg(feature = "hailo")]
pub mod hailo;

pub use error::BackendError;

use crate::tensor::{DType, Tensor};

/// Result type for backend operations; the error side is always [`BackendError`].
pub type BackendResult<T> = Result<T, BackendError>;

/// Hardware backend trait for tensor operations
///
/// This trait defines all the operations needed for LLM inference.
/// Each backend (CPU, CUDA, etc.) implements this trait.
pub trait Backend: Send + Sync {
    /// Get the name of this backend
    fn name(&self) -> &str;

    /// Check if this backend is available on the current system
    fn is_available(&self) -> bool;

    // =========================================================================
    // Memory operations
    // =========================================================================

    /// Allocate a tensor with the given shape and dtype
    fn alloc(&self, shape: &[usize], dtype: DType) -> BackendResult<Tensor>;

    /// Copy a tensor to this backend (may be a no-op for CPU)
    fn copy_to(&self, tensor: &Tensor) -> BackendResult<Tensor>;

    // =========================================================================
    // Element-wise operations
    // =========================================================================

    /// Element-wise addition: out = a + b
    fn add(&self, a: &Tensor, b: &Tensor, out: &mut Tensor) -> BackendResult<()>;

    /// Element-wise multiplication: out = a * b
    fn mul(&self, a: &Tensor, b: &Tensor, out: &mut Tensor) -> BackendResult<()>;

    /// Scale tensor by scalar: out = a * scalar
    fn scale(&self, a: &Tensor, scalar: f32, out: &mut Tensor) -> BackendResult<()>;

    // =========================================================================
    // Activation functions
    // =========================================================================

    /// SiLU activation: out = x * sigmoid(x)
    fn silu(&self, x: &Tensor, out: &mut Tensor) -> BackendResult<()>;

    /// GELU activation
    fn gelu(&self, x: &Tensor, out: &mut Tensor) -> BackendResult<()>;

    /// Softmax along last dimension
    fn softmax(&self, x: &Tensor, out: &mut Tensor) -> BackendResult<()>;

    // =========================================================================
    // Normalization
    // =========================================================================

    /// RMS normalization: out = x / rms(x) * weight
    fn rms_norm(
        &self,
        x: &Tensor,
        weight: &Tensor,
        eps: f32,
        out: &mut Tensor,
    ) -> BackendResult<()>;

    // =========================================================================
    // Matrix operations
    // =========================================================================

    /// Matrix multiplication: out = a @ b
    fn matmul(&self, a: &Tensor, b: &Tensor, out: &mut Tensor) -> BackendResult<()>;

    /// Matrix-vector multiplication: out = a @ b where a is 2D, b is 1D
    fn matvec(&self, a: &Tensor, b: &Tensor, out: &mut Tensor) -> BackendResult<()>;

    /// Vector-matrix multiplication: out = a @ b where a is 1D, b is 2D
    /// This computes y = x @ W where x is [k] and W is [k, n], giving y [n]
    fn vec_mat(&self, a: &Tensor, b: &Tensor, out: &mut Tensor) -> BackendResult<()>;

    // =========================================================================
    // Quantization
    // =========================================================================

    /// Dequantize tensor to f32
    fn dequantize(&self, src: &Tensor, out: &mut Tensor) -> BackendResult<()>;

    /// Quantized matrix-vector multiply (fused dequant + matvec for performance)
    fn matvec_q(&self, a: &Tensor, b: &Tensor, out: &mut Tensor) -> BackendResult<()>;

    /// Quantized vector-matrix multiply (fused dequant + vec_mat for performance)
    fn vec_mat_q(&self, a: &Tensor, b: &Tensor, out: &mut Tensor) -> BackendResult<()>;

    // =========================================================================
    // Position embeddings
    // =========================================================================

    /// Apply Rotary Position Embedding (RoPE) to query and key tensors
    ///
    /// # Arguments
    /// * `q` - Query tensor of shape [num_heads, seq_len, head_dim]
    /// * `k` - Key tensor of shape [num_kv_heads, seq_len, head_dim]
    /// * `pos` - Starting position for RoPE
    /// * `freq_base` - Base frequency (typically 10000.0)
    /// * `freq_scale` - Frequency scale factor (typically 1.0)
    /// * `use_neox` - Use NeoX style (first half with second half) vs Normal (consecutive pairs)
    fn rope(
        &self,
        q: &mut Tensor,
        k: &mut Tensor,
        pos: usize,
        freq_base: f32,
        freq_scale: f32,
        use_neox: bool,
    ) -> BackendResult<()>;

    // =========================================================================
    // Attention operations
    // =========================================================================

    /// Compute causal self-attention
    ///
    /// # Arguments
    /// * `q` - Query tensor [num_heads, seq_len, head_dim]
    /// * `k` - Key tensor [num_kv_heads, kv_len, head_dim]
    /// * `v` - Value tensor [num_kv_heads, kv_len, head_dim]
    /// * `out` - Output tensor [num_heads, seq_len, head_dim]
    /// * `scale` - Attention scale factor (typically 1/sqrt(head_dim))
    fn attention(
        &self,
        q: &Tensor,
        k: &Tensor,
        v: &Tensor,
        out: &mut Tensor,
        scale: f32,
    ) -> BackendResult<()>;

    /// Compute Flash Attention (memory-efficient tiled attention)
    ///
    /// Flash Attention computes attention using tiling to reduce memory usage
    /// from O(n²) to O(n). This is especially beneficial for long sequences.
    ///
    /// NOTE: the default implementation ignores `_causal` and delegates to
    /// [`Backend::attention`], which is documented as *causal* attention — so
    /// a default-implemented backend always applies causal masking, even when
    /// `_causal` is `false`. Backends that support non-causal attention must
    /// override this method.
    ///
    /// # Arguments
    /// * `q` - Query tensor [batch, num_heads, seq_len, head_dim]
    /// * `k` - Key tensor [batch, num_kv_heads, kv_len, head_dim]
    /// * `v` - Value tensor [batch, num_kv_heads, kv_len, head_dim]
    /// * `out` - Output tensor [batch, num_heads, seq_len, head_dim]
    /// * `scale` - Attention scale factor (typically 1/sqrt(head_dim))
    /// * `causal` - Whether to apply causal masking
    fn flash_attention(
        &self,
        q: &Tensor,
        k: &Tensor,
        v: &Tensor,
        out: &mut Tensor,
        scale: f32,
        _causal: bool,
    ) -> BackendResult<()> {
        // Default implementation falls back to standard attention
        // Backends can override this with optimized implementations
        self.attention(q, k, v, out, scale)
    }

    /// Compute causal self-attention directly from KV cache tensors.
    ///
    /// This avoids copying the KV cache into contiguous tensors for each
    /// token, which is a major performance win during autoregressive
    /// generation and prefill.
    ///
    /// # Arguments
    /// * `q` - Query tensor [num_heads, 1, head_dim] (single query position)
    /// * `k_cache` - Key cache tensor [num_kv_heads, max_seq_len, head_dim]
    /// * `v_cache` - Value cache tensor [num_kv_heads, max_seq_len, head_dim]
    /// * `out` - Output tensor [num_heads, 1, head_dim]
    /// * `scale` - Attention scale factor (typically 1/sqrt(head_dim))
    /// * `kv_len` - Number of valid positions in the cache (positions 0..kv_len)
    fn attention_cached(
        &self,
        q: &Tensor,
        k_cache: &Tensor,
        v_cache: &Tensor,
        out: &mut Tensor,
        scale: f32,
        kv_len: usize,
    ) -> BackendResult<()> {
        // Default: extract contiguous k/v from cache and call standard attention.
        // Backends should override with strided implementations for better performance.
        //
        // NOTE(review): assumes k_cache/v_cache are rank-3 with the layout
        // documented above — the shape[] indexing below will panic otherwise.
        let num_kv_heads = k_cache.shape()[0];
        let max_seq_len = k_cache.shape()[1];
        let head_dim = k_cache.shape()[2];
        debug_assert!(kv_len <= max_seq_len, "kv_len exceeds cache capacity");

        /// Copy the first `kv_len` positions of each head from a strided
        /// cache buffer (stride `max_seq_len`) into a contiguous buffer
        /// (stride `kv_len`). Shared by the K and V paths below.
        fn gather_valid_positions(
            src: &[f32],
            dst: &mut [f32],
            num_kv_heads: usize,
            max_seq_len: usize,
            kv_len: usize,
            head_dim: usize,
        ) {
            for h in 0..num_kv_heads {
                for p in 0..kv_len {
                    let src_off = h * max_seq_len * head_dim + p * head_dim;
                    let dst_off = h * kv_len * head_dim + p * head_dim;
                    dst[dst_off..dst_off + head_dim]
                        .copy_from_slice(&src[src_off..src_off + head_dim]);
                }
            }
        }

        let mut k_contig = Tensor::zeros(vec![num_kv_heads, kv_len, head_dim], DType::F32);
        let mut v_contig = Tensor::zeros(vec![num_kv_heads, kv_len, head_dim], DType::F32);

        gather_valid_positions(
            k_cache.as_f32()?,
            k_contig.as_f32_mut()?,
            num_kv_heads,
            max_seq_len,
            kv_len,
            head_dim,
        );
        gather_valid_positions(
            v_cache.as_f32()?,
            v_contig.as_f32_mut()?,
            num_kv_heads,
            max_seq_len,
            kv_len,
            head_dim,
        );

        self.attention(q, &k_contig, &v_contig, out, scale)
    }

    /// Compute attention with TurboQuant-compressed KV cache.
    ///
    /// The default implementation delegates to the cache's own CPU path
    /// (`TurboQuantKVCache::attention_layer`). GPU backends can override
    /// this with fused kernels that operate on compressed data directly.
    ///
    /// # Arguments
    /// * `queries` - Query tensor [num_heads * key_length] (flat, after RoPE)
    /// * `tq_cache` - The TurboQuant KV cache
    /// * `layer_idx` - Which transformer layer
    /// * `num_heads` - Number of attention heads
    /// * `scale` - Attention scale factor
    ///
    /// Returns a Vec<f32> of shape [num_heads * value_length].
    fn attention_turboquant(
        &self,
        queries: &[f32],
        tq_cache: &crate::model::kv_turboquant::TurboQuantKVCache,
        layer_idx: usize,
        num_heads: usize,
        scale: f32,
    ) -> BackendResult<Vec<f32>> {
        Ok(tq_cache.attention_layer(layer_idx, queries, num_heads, scale))
    }
}

/// Build the default backend: the always-available CPU implementation.
pub fn default_backend() -> Box<dyn Backend> {
    let backend = cpu::CpuBackend::new();
    Box::new(backend)
}

// =============================================================================
// GPU-only inference abstraction
// =============================================================================

/// Trait for GPU-only inference engines that run the entire forward pass on GPU.
///
/// All GPU backends (CUDA, Vulkan, Metal, DX12) implement this trait with the
/// same contract: consume a `LlamaModel`, upload all weights to VRAM, and
/// provide token-level forward/prefill methods that return CPU-side logits.
///
/// Use [`GpuModelWrapper`] to adapt any `GpuInference` into a [`Model`].
/// Note the trait is `Send` but not `Sync`; `GpuModelWrapper` wraps engines
/// in a `Mutex` to provide shared access.
pub trait GpuInference: Send {
    /// Run a single token through all layers, returning logit vector on CPU.
    fn forward(&mut self, token_id: u32) -> BackendResult<Vec<f32>>;

    /// Process a token through all layers (updating KV cache) without
    /// computing output logits. Used for prompt prefill.
    fn prefill_token(&mut self, token_id: u32) -> BackendResult<()>;

    /// Reset all state (KV caches, position counter) for a new sequence.
    fn reset(&mut self);

    /// Current sequence position (number of tokens processed so far).
    fn position(&self) -> usize;
}

/// Generic wrapper that adapts any [`GpuInference`] engine into a [`Model`].
///
/// This eliminates the duplicated `*GpuModelWrapper` structs that previously
/// existed in each backend module with identical logic.
pub struct GpuModelWrapper<T: GpuInference> {
    // Mutex provides interior mutability: `Model::forward` takes `&self`,
    // but the engine's methods need `&mut self`.
    gpu: std::sync::Mutex<T>,
    // Model metadata returned verbatim by the `Model` accessor methods.
    config: crate::model::ModelConfig,
    architecture: crate::model::Architecture,
}

impl<T: GpuInference> GpuModelWrapper<T> {
    /// Wrap a GPU inference engine together with its model metadata.
    ///
    /// The engine is placed behind a `Mutex` so the wrapper can implement
    /// `Model::forward` (which takes `&self`) on top of the engine's
    /// `&mut self` API.
    pub fn new(
        gpu: T,
        config: crate::model::ModelConfig,
        architecture: crate::model::Architecture,
    ) -> Self {
        let gpu = std::sync::Mutex::new(gpu);
        Self {
            gpu,
            config,
            architecture,
        }
    }
}

impl<T: GpuInference + 'static> crate::model::Model for GpuModelWrapper<T> {
    /// Run `tokens` through the GPU engine and return logits for the final
    /// token as a CPU-side 1-D tensor.
    ///
    /// All tokens except the last are prefilled (KV cache update only); the
    /// last token produces the logit vector.
    fn forward(
        &self,
        tokens: &[u32],
        ctx: &mut crate::model::InferenceContext,
    ) -> crate::model::ModelResult<crate::tensor::Tensor> {
        // Validate input BEFORE taking the lock or touching GPU state.
        // Previously the empty check ran after the conditional reset below,
        // so an empty call at position 0 could wipe the GPU KV cache and
        // then fail anyway.
        let Some((&last_token, prefix)) = tokens.split_last() else {
            return Err(crate::model::ModelError::ConfigError(
                "No tokens to process".into(),
            ));
        };

        let mut gpu = self.gpu.lock().map_err(|e| {
            crate::model::ModelError::ConfigError(format!(
                "GPU inference lock poisoned: {}", e
            ))
        })?;

        // A fresh sequence (ctx.position == 0) with stale GPU state means
        // the caller restarted generation: clear the engine's KV caches and
        // position counter first.
        if ctx.position == 0 && gpu.position() > 0 {
            gpu.reset();
        }

        // Prefill everything but the last token (KV cache only, no logits).
        for &token in prefix {
            gpu.prefill_token(token)?;
        }

        let logits_vec = gpu.forward(last_token)?;
        ctx.position += tokens.len();
        // Keep the context's KV-cache bookkeeping in sync with the engine.
        ctx.kv_cache.seq_len = ctx.position;

        crate::tensor::Tensor::from_f32(&logits_vec, vec![logits_vec.len()])
            .map_err(|e| e.into())
    }

    fn config(&self) -> &crate::model::ModelConfig {
        &self.config
    }

    fn architecture(&self) -> crate::model::Architecture {
        self.architecture
    }
}