dakera-inference 0.11.80

Embedded inference engine for Dakera - generates embeddings locally via ONNX Runtime
Documentation
//! # Dakera Inference Engine
//!
//! Embedded inference engine for generating vector embeddings locally without
//! external API calls. This crate provides:
//!
//! - **Local Embedding Generation**: Generate embeddings using state-of-the-art
//!   transformer models running locally on CPU or GPU.
//! - **Multiple Model Support**: Choose from MiniLM (fast), BGE (balanced), or E5 (quality).
//! - **Batch Processing**: Efficient batch processing with automatic batching and parallelization.
//! - **Zero External Dependencies**: No OpenAI, Cohere, or other API keys required.
//!
//! ## Quick Start
//!
//! ```no_run
//! use inference::{EmbeddingEngine, ModelConfig, EmbeddingModel};
//!
//! #[tokio::main]
//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
//!     // Create engine with the default model configuration
//!     let engine = EmbeddingEngine::new(ModelConfig::default()).await?;
//!
//!     // Embed a query
//!     let query_embedding = engine.embed_query("What is machine learning?").await?;
//!     println!("Query embedding: {} dimensions", query_embedding.len());
//!
//!     // Embed documents
//!     let docs = vec![
//!         "Machine learning is a type of artificial intelligence.".to_string(),
//!         "Deep learning uses neural networks with many layers.".to_string(),
//!     ];
//!     let doc_embeddings = engine.embed_documents(&docs).await?;
//!     println!("Generated {} document embeddings", doc_embeddings.len());
//!
//!     Ok(())
//! }
//! ```
//!
//! ## Model Selection
//!
//! Choose the right model for your use case:
//!
//! | Model | Speed | Quality | Use Case |
//! |-------|-------|---------|----------|
//! | MiniLM | ⚡⚡⚡ | ⭐⭐ | High-throughput, real-time |
//! | BGE-small | ⚡⚡ | ⭐⭐⭐ | Balanced performance |
//! | E5-small | ⚡⚡ | ⭐⭐⭐ | Best quality for retrieval |
//!
//! ## GPU Acceleration
//!
//! Enable GPU acceleration by building with the appropriate feature:
//!
//! ```toml
//! # For NVIDIA GPUs
//! inference = { path = "crates/inference", features = ["cuda"] }
//!
//! # For Apple Silicon
//! inference = { path = "crates/inference", features = ["metal"] }
//! ```
//!
//! ## Architecture
//!
//! ```text
//! ┌─────────────────────────────────────────────────────────────┐
//! │                    EmbeddingEngine                          │
//! │  ┌─────────────┐  ┌──────────────┐  ┌──────────────────┐   │
//! │  │ ModelConfig │  │ BatchProcessor│  │  ort::Session    │   │
//! │  │ - model     │  │ - tokenizer  │  │ (ONNX Runtime)   │   │
//! │  │ - threads   │  │ - batching   │  │ - BERT INT8      │   │
//! │  │ - batch_sz  │  │ - prefixes   │  │ - mean_pool()    │   │
//! │  └─────────────┘  └──────────────┘  └──────────────────┘   │
//! └─────────────────────────────────────────────────────────────┘
//!                              │
//!                              ▼
//!              ┌───────────────────────────────┐
//!              │      Vec<f32> Embeddings      │
//!              │  (normalized, model-dim dims) │
//!              └───────────────────────────────┘
//! ```

pub mod backend;
pub mod batch;
pub mod engine;
pub mod error;
pub mod extraction;
pub mod models;
pub mod ner;
pub mod reranker;
pub mod tiered;

/// Process-wide gate for GPU inference, shared by every inference backend (ONNX + Candle).
///
/// The semaphore is created with exactly one permit so that concurrent CUDA forward
/// passes are serialized. Without it, simultaneous VRAM allocations can exhaust device
/// memory: an L4 has 23 GB, and an FP32 BGE-large forward pass takes ~4–6 GB, so
/// 8 ingest workers running at once exceed 23 GB and fail with
/// `CUBLAS_STATUS_ALLOC_FAILED`. The GPU is massively parallel internally, so
/// serializing at the application level costs nothing compared with an OOM crash.
pub(crate) static GPU_INFERENCE_SEMAPHORE: std::sync::LazyLock<
    std::sync::Arc<tokio::sync::Semaphore>,
> = std::sync::LazyLock::new(|| {
    // One permit == at most one holder at any time.
    let single_permit = tokio::sync::Semaphore::new(1);
    std::sync::Arc::new(single_permit)
});

// Re-exports for convenience
pub use backend::{BackendKind, EmbeddingBackend};
pub use batch::TokenBudgetBatcher;
pub use engine::{EmbeddingEngine, EmbeddingEngineBuilder};
pub use error::{InferenceError, Result};
pub use extraction::{
    build_provider, ExtractionOpts, ExtractionProvider, ExtractionResult, ExtractorConfig,
};
pub use models::{EmbeddingModel, ModelConfig};
pub use ner::{rule_based_extract, ExtractedEntity, GlinerEngine, NerEngine};
pub use reranker::CrossEncoderEngine;
pub use tiered::TieredEngine;

/// Prelude module for convenient imports.
pub mod prelude {
    pub use crate::engine::{EmbeddingEngine, EmbeddingEngineBuilder};
    pub use crate::error::{InferenceError, Result};
    pub use crate::models::{EmbeddingModel, ModelConfig};
}

#[cfg(test)]
mod tests {
    use super::*;

    /// The default configuration selects BGE-large, a max batch size of 32, and no GPU.
    #[test]
    fn test_model_defaults() {
        let config = ModelConfig::default();
        assert_eq!(config.model, EmbeddingModel::BgeLarge);
        assert_eq!(config.max_batch_size, 32);
        assert!(!config.use_gpu);
    }

    /// Each model variant reports its expected embedding dimension.
    #[test]
    fn test_model_dimensions() {
        assert_eq!(EmbeddingModel::BgeLarge.dimension(), 1024);
        assert_eq!(EmbeddingModel::MiniLM.dimension(), 384);
        assert_eq!(EmbeddingModel::BgeSmall.dimension(), 384);
        assert_eq!(EmbeddingModel::E5Small.dimension(), 384);
    }

    // ── GPU_INFERENCE_SEMAPHORE structural tests (DAK-6096) ──────────────────
    //
    // GPU_INFERENCE_SEMAPHORE is a process-wide static and the test harness runs
    // tests on parallel threads, so any of these tests may observe the permit being
    // held by another test. Consequently:
    //   * while THIS test holds the permit, `available_permits() == 0` and a second
    //     `try_acquire()` failing are deterministic and asserted strictly;
    //   * after THIS test releases the permit, another test may grab it before we
    //     look, so only the invariant `available_permits() <= 1` is asserted.

    #[test]
    fn test_gpu_semaphore_has_one_permit() {
        // The global GPU semaphore must have exactly 1 permit to guarantee that
        // at most one CUDA forward pass runs at a time across all ingest workers.
        // More than 1 permit would allow concurrent VRAM allocations that can exhaust
        // the L4's 23 GB when 8 workers × FP32 BGE-large forward passes are in flight.
        let avail = GPU_INFERENCE_SEMAPHORE.available_permits();
        // Available is 0 or 1: 1 when idle, 0 if a concurrent test holds the permit.
        assert!(
            avail <= 1,
            "GPU semaphore must not have more than 1 permit; got {avail}"
        );
    }

    #[tokio::test]
    async fn test_gpu_semaphore_acquire_and_release() {
        // try_acquire() is non-blocking so this test cannot deadlock even if
        // another test is concurrently holding the single permit.
        if let Ok(permit) = GPU_INFERENCE_SEMAPHORE.try_acquire() {
            // We hold the only permit, so nobody else can change the count: strict check.
            assert_eq!(GPU_INFERENCE_SEMAPHORE.available_permits(), 0);
            drop(permit);
            // No `== 1` check here: a parallel test may legitimately acquire the
            // permit between the drop above and any subsequent read.
        }
        // If try_acquire returned Err, another test holds the permit — that's fine.
        // Either way the count must never exceed the single configured permit.
        let avail = GPU_INFERENCE_SEMAPHORE.available_permits();
        assert!(
            avail <= 1,
            "GPU semaphore must not have more than 1 permit after release; got {avail}"
        );
    }

    #[tokio::test]
    async fn test_gpu_semaphore_second_acquire_blocks_until_first_released() {
        // Acquire the single permit and verify that a second non-blocking acquire is
        // rejected while the first is outstanding, i.e. the 1-permit invariant holds.
        if let Ok(permit) = GPU_INFERENCE_SEMAPHORE.try_acquire() {
            // Only one permit exists — a second try_acquire must fail while first is held.
            let p2 = GPU_INFERENCE_SEMAPHORE.try_acquire();
            assert!(
                p2.is_err(),
                "Second try_acquire must fail while the first permit is outstanding"
            );
            drop(permit);
            // After release the slot is returned, but a parallel test may take it
            // immediately, so only the upper bound is safe to assert.
            let avail = GPU_INFERENCE_SEMAPHORE.available_permits();
            assert!(
                avail <= 1,
                "GPU semaphore must not have more than 1 permit after release; got {avail}"
            );
        }
        // If we couldn't get the permit (another test holds it), skip silently — that
        // also proves the semaphore is correctly limiting concurrency.
    }
}