inference/
lib.rs

1//! # Dakera Inference Engine
2//!
3//! Embedded inference engine for generating vector embeddings locally without
4//! external API calls. This crate provides:
5//!
6//! - **Local Embedding Generation**: Generate embeddings using state-of-the-art
7//!   transformer models running locally on CPU or GPU.
8//! - **Multiple Model Support**: Choose from MiniLM (fast), BGE (balanced), or E5 (quality).
9//! - **Batch Processing**: Efficient batch processing with automatic batching and parallelization.
10//! - **Zero External Dependencies**: No OpenAI, Cohere, or other API keys required.
11//!
12//! ## Quick Start
13//!
14//! ```no_run
15//! use inference::{EmbeddingEngine, ModelConfig, EmbeddingModel};
16//!
17//! #[tokio::main]
18//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
//!     // Create engine with the default model (BGE-large)
20//!     let engine = EmbeddingEngine::new(ModelConfig::default()).await?;
21//!
22//!     // Embed a query
23//!     let query_embedding = engine.embed_query("What is machine learning?").await?;
24//!     println!("Query embedding: {} dimensions", query_embedding.len());
25//!
26//!     // Embed documents
27//!     let docs = vec![
28//!         "Machine learning is a type of artificial intelligence.".to_string(),
29//!         "Deep learning uses neural networks with many layers.".to_string(),
30//!     ];
31//!     let doc_embeddings = engine.embed_documents(&docs).await?;
32//!     println!("Generated {} document embeddings", doc_embeddings.len());
33//!
34//!     Ok(())
35//! }
36//! ```
37//!
38//! ## Model Selection
39//!
40//! Choose the right model for your use case:
41//!
42//! | Model | Speed | Quality | Use Case |
43//! |-------|-------|---------|----------|
44//! | MiniLM | ⚡⚡⚡ | ⭐⭐ | High-throughput, real-time |
45//! | BGE-small | ⚡⚡ | ⭐⭐⭐ | Balanced performance |
46//! | E5-small | ⚡⚡ | ⭐⭐⭐ | Best quality for retrieval |
47//!
48//! ## GPU Acceleration
49//!
50//! Enable GPU acceleration by building with the appropriate feature:
51//!
52//! ```toml
53//! # For NVIDIA GPUs
54//! inference = { path = "crates/inference", features = ["cuda"] }
55//!
56//! # For Apple Silicon
57//! inference = { path = "crates/inference", features = ["metal"] }
58//! ```
59//!
60//! ## Architecture
61//!
62//! ```text
63//! ┌─────────────────────────────────────────────────────────────┐
64//! │                    EmbeddingEngine                          │
65//! │  ┌─────────────┐  ┌──────────────┐  ┌──────────────────┐   │
66//! │  │ ModelConfig │  │ BatchProcessor│  │  ort::Session    │   │
67//! │  │ - model     │  │ - tokenizer  │  │ (ONNX Runtime)   │   │
68//! │  │ - threads   │  │ - batching   │  │ - BERT INT8      │   │
69//! │  │ - batch_sz  │  │ - prefixes   │  │ - mean_pool()    │   │
70//! │  └─────────────┘  └──────────────┘  └──────────────────┘   │
71//! └─────────────────────────────────────────────────────────────┘
72//!                              │
73//!                              ▼
74//!              ┌───────────────────────────────┐
75//!              │      Vec<f32> Embeddings      │
76//!              │  (normalized, model-dim dims) │
77//!              └───────────────────────────────┘
78//! ```
79
80pub mod backend;
81pub mod batch;
82pub mod engine;
83pub mod error;
84pub mod extraction;
85pub mod models;
86pub mod ner;
87pub mod reranker;
88pub mod tiered;
89
90// Global GPU inference semaphore — 1 permit serializes concurrent CUDA forward passes across
91// ALL inference backends (ONNX + Candle). Prevents simultaneous VRAM allocations from
92// exhausting device memory (L4 has 23 GB; FP32 BGE-large ~4–6 GB per concurrent forward pass
93// × 8 ingest workers > 23 GB → CUBLAS_STATUS_ALLOC_FAILED). The GPU is massively parallel
94// internally; application-level serialization costs nothing vs an OOM crash.
95pub(crate) static GPU_INFERENCE_SEMAPHORE: std::sync::LazyLock<
96    std::sync::Arc<tokio::sync::Semaphore>,
97> = std::sync::LazyLock::new(|| std::sync::Arc::new(tokio::sync::Semaphore::new(1)));
98
99// Re-exports for convenience
100pub use backend::{BackendKind, EmbeddingBackend};
101pub use batch::TokenBudgetBatcher;
102pub use engine::{EmbeddingEngine, EmbeddingEngineBuilder};
103pub use error::{InferenceError, Result};
104pub use extraction::{
105    build_provider, ExtractionOpts, ExtractionProvider, ExtractionResult, ExtractorConfig,
106};
107pub use models::{EmbeddingModel, ModelConfig};
108pub use ner::{rule_based_extract, ExtractedEntity, GlinerEngine, NerEngine};
109pub use reranker::CrossEncoderEngine;
110pub use tiered::TieredEngine;
111
112/// Prelude module for convenient imports.
113pub mod prelude {
114    pub use crate::engine::{EmbeddingEngine, EmbeddingEngineBuilder};
115    pub use crate::error::{InferenceError, Result};
116    pub use crate::models::{EmbeddingModel, ModelConfig};
117}
118
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::{Mutex, MutexGuard, PoisonError};

    /// Serializes the tests in this module that take the global GPU permit.
    ///
    /// The test harness runs tests on parallel threads and `GPU_INFERENCE_SEMAPHORE` is
    /// process-wide. Without this lock, one test could grab the permit between another
    /// test's `drop(permit)` and its `available_permits() == 1` assertion, producing a
    /// spurious failure.
    static SEMAPHORE_TEST_LOCK: Mutex<()> = Mutex::new(());

    /// Takes [`SEMAPHORE_TEST_LOCK`], recovering from poisoning so that one failing
    /// semaphore test does not cascade into unrelated failures in the others.
    fn lock_semaphore_tests() -> MutexGuard<'static, ()> {
        SEMAPHORE_TEST_LOCK
            .lock()
            .unwrap_or_else(PoisonError::into_inner)
    }

    /// The default configuration selects BGE-large, a batch size of 32, and CPU inference.
    #[test]
    fn test_model_defaults() {
        let config = ModelConfig::default();
        assert_eq!(config.model, EmbeddingModel::BgeLarge);
        assert_eq!(config.max_batch_size, 32);
        assert!(!config.use_gpu);
    }

    /// Each model reports its expected embedding dimensionality.
    #[test]
    fn test_model_dimensions() {
        assert_eq!(EmbeddingModel::BgeLarge.dimension(), 1024);
        assert_eq!(EmbeddingModel::MiniLM.dimension(), 384);
        assert_eq!(EmbeddingModel::BgeSmall.dimension(), 384);
        assert_eq!(EmbeddingModel::E5Small.dimension(), 384);
    }

    // ── GPU_INFERENCE_SEMAPHORE structural tests (DAK-6096) ──────────────────

    /// The global GPU semaphore must never expose more than one permit.
    #[test]
    fn test_gpu_semaphore_has_one_permit() {
        // More than 1 permit would allow concurrent VRAM allocations that can exhaust
        // the L4's 23 GB when 8 workers × FP32 BGE-large forward passes are in flight.
        // This is a read-only upper-bound check, so it does not need the test lock:
        // the value is 1 when idle and 0 while any other test holds the permit.
        let avail = GPU_INFERENCE_SEMAPHORE.available_permits();
        assert!(
            avail <= 1,
            "GPU semaphore must not have more than 1 permit; got {avail}"
        );
    }

    /// Taking the permit drops the available count to 0; releasing it restores it to 1.
    #[tokio::test]
    async fn test_gpu_semaphore_acquire_and_release() {
        // Keep the sibling semaphore tests out while we observe permit counts.
        // The guard is never held across an `.await` (this body has none).
        let _serial = lock_semaphore_tests();

        // try_acquire() is non-blocking, so this test cannot deadlock even if some
        // holder outside this module currently owns the single permit.
        if let Ok(permit) = GPU_INFERENCE_SEMAPHORE.try_acquire() {
            assert_eq!(GPU_INFERENCE_SEMAPHORE.available_permits(), 0);
            drop(permit);
            assert_eq!(GPU_INFERENCE_SEMAPHORE.available_permits(), 1);
        }
        // If try_acquire returned Err, something else holds the permit — that's fine;
        // the upper bound must hold either way.
        assert!(GPU_INFERENCE_SEMAPHORE.available_permits() <= 1);
    }

    /// While the single permit is outstanding a second acquisition attempt must fail,
    /// and the permit must become available again once the first holder releases it.
    #[tokio::test]
    async fn test_gpu_semaphore_second_acquire_blocks_until_first_released() {
        // Same serialization as above: the post-release count check below would
        // otherwise race with the other permit-taking test in this module.
        let _serial = lock_semaphore_tests();

        if let Ok(permit) = GPU_INFERENCE_SEMAPHORE.try_acquire() {
            // Only one permit exists — a second try_acquire must fail while first is held.
            let second_attempt = GPU_INFERENCE_SEMAPHORE.try_acquire();
            assert!(
                second_attempt.is_err(),
                "Second try_acquire must fail while the first permit is outstanding"
            );
            drop(permit);
            // After release, the slot is available again.
            assert_eq!(GPU_INFERENCE_SEMAPHORE.available_permits(), 1);
        }
        // If the first try_acquire failed (a holder outside this module owns the
        // permit), skip silently — that also shows the semaphore is limiting concurrency.
    }
}
inference/lib.rs

inference/
lib.rs