Skip to main content

ruvllm/
lib.rs

//! # RuvLLM - LLM Serving Runtime with Ruvector Integration
//!
//! RuvLLM is an edge-focused LLM serving runtime designed for portable, high-performance
//! inference across heterogeneous hardware. It integrates with Ruvector for intelligent
//! memory capabilities, enabling continuous self-improvement through SONA learning.
//!
//! ## Architecture
//!
//! RuvLLM uses Ruvector as a unified memory layer with three distinct roles:
//!
//! - **Policy Memory Store**: Learned thresholds and parameters for runtime decisions
//! - **Session State Index**: Multi-turn conversation state with KV cache references
//! - **Witness Log Index**: Audit logging with semantic search capabilities
//!
//! ## Key Components
//!
//! - [`PagedAttention`]: Memory-efficient attention mechanism with page tables
//! - [`TwoTierKvCache`]: FP16 tail + quantized store for optimal memory/quality tradeoff
//! - [`AdapterManager`]: LoRA adapter loading and hot-swapping
//! - [`SessionManager`]: Session lifecycle and state management
//! - [`PolicyStore`]: Ruvector-backed policy storage with semantic search
//! - [`WitnessLog`]: Audit logging with HNSW-indexed semantic search
//! - [`SonaIntegration`]: Three-tier learning loop integration
//!
//! ## Example
//!
//! ```rust,ignore
//! use ruvllm::{RuvLLMConfig, RuvLLMEngine};
//!
//! // Create engine with default configuration
//! let config = RuvLLMConfig::default();
//! let engine = RuvLLMEngine::new(config)?;
//!
//! // Create a session (the user id is optional, hence `Some(..)`)
//! let session = engine.create_session(Some("user-123"))?;
//!
//! // Process a request
//! let response = engine.process(&session, "Hello, world!")?;
//! ```
40
41#![warn(missing_docs)]
42#![warn(clippy::all)]
43
44pub mod adapter_manager;
45pub mod autodetect;
46pub mod backends;
47pub mod bitnet;
48pub mod capabilities;
49pub mod claude_flow;
50pub mod context;
51pub mod error;
52pub mod evaluation;
53pub mod gguf;
54pub mod hub;
55pub mod intelligence;
56pub mod kernels;
57pub mod kv_cache;
58pub mod lora;
59pub mod memory_pool;
60#[cfg(all(target_os = "macos", feature = "metal-compute"))]
61pub mod metal;
62pub mod models;
63pub mod optimization;
64pub mod paged_attention;
65pub mod policy_store;
66pub mod quality;
67pub mod quantize;
68pub mod reasoning_bank;
69pub mod reflection;
70pub mod ruvector_integration;
71pub mod serving;
72pub mod session;
73pub mod session_index;
74pub mod sona;
75pub mod speculative;
76pub mod tokenizer;
77pub mod training;
78pub mod types;
79pub mod witness_log;
80
81// Test modules
82#[cfg(test)]
83mod tests;
84
85// Re-exports
86pub use adapter_manager::{AdapterConfig, AdapterManager, LoraAdapter};
87pub use autodetect::{
88    Architecture, ComputeBackend, CoreInfo, CpuFeatures, GpuBackend, GpuCapabilities,
89    InferenceConfig, Platform, SystemCapabilities,
90};
91#[cfg(feature = "candle")]
92pub use backends::CandleBackend;
93pub use backends::{
94    create_backend, DType, DeviceType, GenerateParams, GeneratedToken, LlmBackend,
95    ModelArchitecture, ModelConfig, ModelInfo, Quantization, SharedBackend, SpecialTokens,
96    StreamEvent, TokenStream, Tokenizer,
97};
98#[cfg(feature = "async-runtime")]
99pub use backends::{AsyncTokenStream, LlmBackendAsync};
100pub use claude_flow::{
101    AgentContext,
102    AgentCoordinator,
103    AgentRouter,
104    AgentState,
105    AgentType,
106    AnalyzerStats as ModelAnalyzerStats,
107    ClassificationResult,
108    ClaudeFlowAgent,
109    ClaudeFlowTask,
110    // Claude API Integration (NEW)
111    ClaudeModel,
112    ClaudeRequest,
113    ClaudeResponse,
114    // Model Router (NEW) - Intelligent routing to Haiku/Sonnet/Opus
115    ComplexityFactors,
116    ComplexityScore,
117    ComplexityWeights,
118    ContentBlock,
119    ContextManager,
120    ContextWindow,
121    CoordinatorStats,
122    CostEstimator,
123    FlowOptimizer,
124    HnswDistanceMetric,
125    // HNSW semantic router (150x faster pattern search)
126    HnswRouter,
127    HnswRouterConfig,
128    HnswRouterStats,
129    HnswRoutingResult,
130    HooksConfig,
131    // Hooks Integration (NEW v2.3) - Unified Claude Flow hooks interface
132    HooksIntegration,
133    HybridRouter,
134    LatencySample,
135    LatencyStats as ClaudeLatencyStats,
136    LatencyTracker,
137    LearningMetrics,
138    Message,
139    MessageRole,
140    ModelRouter,
141    ModelRoutingDecision,
142    ModelSelector,
143    OptimizationConfig,
144    OptimizationResult,
145    PatternMatch,
146    PostEditInput,
147    PostEditResult,
148    PostTaskInput,
149    PostTaskResult,
150    PreEditInput,
151    PreEditResult,
152    PreTaskInput,
153    PreTaskResult,
154    QualityAssessment,
155    QualityMonitor,
156    ResponseStreamer,
157    RoutingDecision as AgentRoutingDecision,
158    SelectionCriteria,
159    SelectorStats,
160    SessionEndResult,
161    SessionMetrics,
162    SessionState as HooksSessionState,
163    StepResult,
164    StreamEvent as ClaudeStreamEvent,
165    StreamStats,
166    StreamToken,
167    TaskClassifier,
168    TaskComplexityAnalyzer,
169    TaskPattern,
170    TaskType,
171    UsageStats,
172    WorkflowResult,
173    WorkflowStep,
174};
175pub use error::{Result, RuvLLMError};
176pub use gguf::{
177    GgufFile,
178    GgufHeader,
179    // New GGUF loading types
180    GgufLoader,
181    GgufModelLoader,
182    GgufQuantType,
183    GgufValue,
184    LayerWeights,
185    LoadConfig,
186    LoadProgress,
187    LoadedTensor,
188    LoadedWeights,
189    ModelConfig as GgufModelConfig,
190    ModelInitializer,
191    ModelWeights,
192    ProgressModelBuilder,
193    QuantizedTensor,
194    QuantizedWeight,
195    StreamingLoader,
196    TensorCategory,
197    TensorInfo,
198    TensorNameMapper,
199    WeightTensor,
200};
201pub use hub::{
202    default_cache_dir,
203    get_hf_token,
204    get_model_info,
205    ChecksumVerifier,
206    DatasetInfo,
207    DownloadConfig,
208    DownloadError,
209    DownloadProgress,
210    Framework,
211    HardwareRequirements,
212    // Common
213    HubError,
214    License,
215    MetricResult,
216    // Model Card
217    ModelCard,
218    ModelCardBuilder,
219    // Download
220    ModelDownloader,
221    ModelInfo as HubModelInfo,
222    ModelMetadata,
223    ModelSize,
224    // Upload
225    ModelUploader,
226    MultiProgress,
227    // Progress
228    ProgressBar,
229    ProgressCallback,
230    ProgressIndicator,
231    ProgressStyle,
232    QuantizationLevel,
233    // Registry
234    RuvLtraRegistry,
235    TaskType as HubTaskType,
236    UploadConfig,
237    UploadError,
238    UploadProgress,
239};
240pub use kv_cache::{
241    CacheQuantization, CacheTier, KvCacheConfig, KvCacheStats, PooledKvBlock, PooledKvCache,
242    PooledKvCacheStats, TwoTierKvCache,
243};
244pub use lora::{
245    AdaptFeedback, AdapterComposer, AdapterPool, AdapterRegistry, CompositionStrategy,
246    EwcRegularizer, LearningRateSchedule, MicroLoRA, MicroLoraConfig, TargetModule, TrainingConfig,
247    TrainingPipeline,
248};
249pub use memory_pool::{
250    ArenaStats, BufferPool, BufferPoolStats, BufferSize, InferenceArena, MemoryManager,
251    MemoryManagerConfig, MemoryManagerStats, PooledBuffer, ScratchSpace, ScratchSpaceManager,
252    ScratchStats, CACHE_LINE_SIZE, DEFAULT_ALIGNMENT,
253};
254pub use optimization::{
255    AdaptationResult, BatchSizeStrategy, ConsolidationStrategy, InferenceMetrics,
256    KvCachePressurePolicy, LatencyHistogram, LearningLoopStats, MetricsCollector, MetricsSnapshot,
257    MovingAverage, OptimizationDecision, OptimizationTrigger, RealtimeConfig, RealtimeOptimizer,
258    SonaLlm, SonaLlmConfig, SpeculativeConfig, TokenBudgetAllocation, TrainingSample,
259};
260pub use paged_attention::{PageBlock, PageTable, PagedAttention, PagedAttentionConfig};
261pub use policy_store::{PolicyEntry, PolicyStore, PolicyType, QuantizationPolicy, RouterPolicy};
262pub use quantize::{
263    dequantize_for_ane,
264    // Memory estimation
265    estimate_memory_q4,
266    estimate_memory_q5,
267    estimate_memory_q8,
268    // Quantization functions
269    quantize_ruvltra_q4,
270    quantize_ruvltra_q5,
271    quantize_ruvltra_q8,
272    MemoryEstimate,
273    // Block types
274    Q4KMBlock,
275    Q5KMBlock,
276    Q8Block,
277    QuantConfig,
278    // Progress tracking
279    QuantProgress,
280    QuantStats,
281    // Core quantizer
282    RuvltraQuantizer,
283    TargetFormat,
284};
285pub use serving::{
286    BatchStats,
287    // Batch types
288    BatchedRequest,
289    CompletedRequest,
290    // Scheduler
291    ContinuousBatchScheduler,
292    DecodeTask,
293    FinishReason,
294    GenerationResult,
295    // Request types
296    InferenceRequest,
297    IterationPlan,
298    IterationScheduler,
299    KvCacheAllocation,
300    // KV cache management
301    KvCacheManager,
302    KvCacheManagerStats,
303    KvCachePoolConfig,
304    PreemptionMode,
305    PrefillTask,
306    Priority,
307    PriorityPolicy,
308    RequestId,
309    RequestQueue,
310    RequestState,
311    RunningRequest,
312    ScheduledBatch,
313    SchedulerConfig,
314    SchedulerStats,
315    // Engine
316    ServingEngine,
317    ServingEngineConfig,
318    ServingMetrics,
319    TokenBudget,
320    TokenOutput,
321};
322pub use session::{Session, SessionConfig, SessionManager};
323pub use session_index::{KvCacheReference, SessionIndex, SessionState};
324pub use sona::{LearningLoop, SonaConfig, SonaIntegration};
325pub use speculative::{
326    log_softmax, sample_from_probs, softmax, top_k_filter, top_p_filter, AtomicSpeculativeStats,
327    SpeculationTree, SpeculativeConfig as SpeculativeDecodingConfig, SpeculativeDecoder,
328    SpeculativeStats, TreeNode, VerificationResult,
329};
330pub use tokenizer::{
331    ChatMessage, ChatTemplate, Role, RuvTokenizer, StreamingDecodeBuffer, TokenizerSpecialTokens,
332};
333pub use training::{
334    AugmentationConfig,
335    // Claude task dataset
336    ClaudeTaskDataset,
337    ClaudeTaskExample,
338    ComplexityLevel,
339    DatasetConfig,
340    DatasetGenerator,
341    DatasetStats,
342    DifficultyLevel,
343    DifficultyWeights,
344    DomainType,
345    EvaluationMetrics,
346    GrpoBatch,
347    // GRPO optimizer for reinforcement learning
348    GrpoConfig,
349    GrpoOptimizer,
350    GrpoSample,
351    GrpoStats,
352    GrpoUpdateResult,
353    McpToolCategory,
354    McpToolDef,
355    // MCP tool training
356    McpToolTrainer,
357    McpTrainingConfig,
358    ParamType,
359    SampleGroup,
360    StepBuilder,
361    TaskCategory,
362    TaskMetadata,
363    // Tool calling dataset
364    ToolCallDataset,
365    ToolCallExample,
366    ToolDatasetConfig,
367    ToolDatasetStats,
368    ToolParam,
369    ToolTrajectory,
370    TrainingCheckpoint,
371    TrainingResult,
372    TrainingStats,
373    TrajectoryBuilder,
374    TrajectoryMetadata,
375    TrajectoryStep,
376};
377pub use types::*;
378pub use witness_log::{
379    AsyncWriteConfig, LatencyBreakdown, RoutingDecision, WitnessEntry, WitnessLog, WitnessLogStats,
380};
381
382// RuvLTRA model architecture exports
383pub use models::{
384    AneDispatcher,
385    AneOptimization,
386    MemoryLayout,
387    QuantizationType,
388    RuvLtraAttention,
389    // Configuration
390    RuvLtraConfig,
391    RuvLtraDecoderLayer,
392    RuvLtraMLP,
393    // Model components
394    RuvLtraModel,
395    // Utilities
396    RuvLtraModelInfo,
397};
398
399// Ruvector integration exports (unified entry point for all Ruvector capabilities)
400pub use capabilities::{
401    gate_feature, gate_feature_or, RuvectorCapabilities, ATTENTION_AVAILABLE, GNN_AVAILABLE,
402    GRAPH_AVAILABLE, HNSW_AVAILABLE, PARALLEL_AVAILABLE, SIMD_AVAILABLE, SONA_AVAILABLE,
403};
404pub use ruvector_integration::{
405    IndexStats,
406    IntegrationConfig,
407    IntegrationStats,
408    // Intelligence layer
409    IntelligenceLayer,
410    IntelligenceLayerStats,
411    IntelligentRoutingDecision,
412    // Main integration
413    RuvectorIntegration,
414    SearchResultWithMetadata,
415    // Unified index
416    UnifiedIndex,
417    VectorMetadata,
418};
419
420// Intelligence provider exports
421pub use intelligence::{
422    FileSignalProvider, HumanVerdict, IntelligenceLoader, IntelligenceProvider, Outcome,
423    ProviderError, ProviderQualityWeights, ProviderResult, QualityFactors, QualitySignal,
424};
425
426// Quality scoring exports
427pub use quality::{
428    CoherenceConfig,
429    // Coherence validation
430    CoherenceValidator,
431    CoherenceViolation,
432    CombinedValidator,
433    ComparisonResult,
434    ContradictionResult,
435    DiversificationSuggestion,
436    // Diversity analysis
437    DiversityAnalyzer,
438    DiversityConfig,
439    DiversityResult,
440    FormatValidator,
441    ImprovementRecommendation,
442    JsonSchemaValidator,
443    LogicalFlowResult,
444    ModeCollapseResult,
445    QualityDimension,
446    QualityHistory,
447    // Core metrics
448    QualityMetrics,
449    // Scoring engine
450    QualityScoringEngine,
451    QualitySummary,
452    QualityWeights,
453    RangeValidator,
454    // Schema validators
455    SchemaValidator,
456    ScoringConfig,
457    ScoringContext,
458    SemanticConsistencyResult,
459    TrendAnalysis,
460    TrendDirection,
461    TypeValidator,
462    ValidationCombinator,
463    ValidationError,
464    ValidationResult,
465};
466
467// Context management exports (intelligent pruning and semantic memory)
468pub use context::{
469    // Agentic memory
470    AgenticMemory,
471    AgenticMemoryConfig,
472    AttentionWeights,
473    CacheStats,
474    CachedToolResult,
475    ClaudeFlowBridgeConfig,
476    // Claude Flow bridge
477    ClaudeFlowMemoryBridge,
478    CompressedEpisode,
479    ContextElement,
480    ContextManagerConfig,
481    ElementPriority,
482    Episode,
483    EpisodeMetadata,
484    EpisodeTrajectory,
485    // Episodic memory
486    EpisodicMemory,
487    EpisodicMemoryConfig,
488    // Context manager
489    IntelligentContextManager,
490    MemoryType,
491    PreparedContext,
492    PriorityScorer,
493    ScratchpadEntry,
494    SemanticCacheConfig,
495    // Semantic cache
496    SemanticToolCache,
497    SyncResult,
498    TaskContext,
499    // Working memory
500    WorkingMemory,
501    WorkingMemoryConfig,
502};
503
504// Self-Reflection architecture exports (error recovery and self-correction)
505pub use reflection::{
506    BaseAgent,
507    CompletenessChecker,
508    ConfidenceCheckRecord,
509    // Confidence-based revision (IoE pattern)
510    ConfidenceChecker,
511    ConfidenceConfig,
512    ConfidenceFactorWeights,
513    ConfidenceLevel,
514    ConsistencyChecker,
515    CorrectnessChecker,
516    CritiqueIssue,
517    CritiqueResult,
518    ErrorCategory,
519    ErrorCluster,
520    ErrorLearnerStats,
521    ErrorPattern,
522    // Error pattern learning
523    ErrorPatternLearner,
524    ErrorPatternLearnerConfig,
525    ExecutionContext,
526    ExecutionResult,
527    IssueCategory,
528    // Multi-perspective critique
529    Perspective,
530    PerspectiveConfig,
531    PreviousAttempt,
532    RecoveryOutcome,
533    RecoveryStrategy,
534    RecoverySuggestion,
535    Reflection,
536    ReflectionConfig,
537    ReflectionStrategy,
538    // Reflective agent wrapper
539    ReflectiveAgent,
540    ReflectiveAgentStats,
541    RetryConfig,
542    RevisionResult,
543    SimilarError,
544    UnifiedCritique,
545    WeakPoint,
546    WeaknessType,
547};
548
549// ReasoningBank exports (learning from Claude trajectories)
550pub use reasoning_bank::{
551    CompressedTrajectory,
552    ConsolidationConfig,
553    DistillationConfig,
554    FailurePattern as VerdictFailurePattern,
555    FisherInformation,
556    ImportanceScore,
557    KeyLesson,
558    // Memory distillation
559    MemoryDistiller,
560    Pattern,
561    PatternCategory,
562    // EWC++ consolidation
563    PatternConsolidator,
564    PatternSearchResult,
565    PatternStats,
566    // Pattern storage with HNSW
567    PatternStore,
568    PatternStoreConfig,
569    // Main ReasoningBank
570    ReasoningBank,
571    ReasoningBankConfig,
572    ReasoningBankStats,
573    RecoveryStrategy as VerdictRecoveryStrategy,
574    RootCause,
575    StepOutcome,
576    // Trajectory recording (aliased to avoid conflict with training::TrajectoryStep)
577    Trajectory as ReasoningTrajectory,
578    TrajectoryId,
579    TrajectoryRecorder,
580    TrajectoryStep as ReasoningTrajectoryStep,
581    // Verdict system (aliased to avoid conflict with claude_flow::reasoning_bank::Verdict)
582    Verdict as ReasoningVerdict,
583    VerdictAnalyzer,
584};
585
586// Metal GPU acceleration exports (macOS only)
587#[cfg(all(target_os = "macos", feature = "metal-compute"))]
588pub use metal::{
589    get_device_info, is_metal_available, shader_source, tile_sizes, AttentionParams, GemmParams,
590    MetalBuffer, MetalBufferPool, MetalConfig, MetalContext, MetalDeviceInfo, MetalPipelines,
591    NormParams, RopeParams,
592};
593
/// RuvLLM engine configuration.
///
/// This configuration struct controls all aspects of the RuvLLM engine,
/// including storage paths, attention mechanisms, KV cache settings,
/// session management, and SONA learning parameters.
///
/// A value of this type is consumed by [`RuvLLMEngine::new`], which keeps it
/// inside the engine after building the subsystems from it.
///
/// # Example
///
/// ```rust,ignore
/// use ruvllm::RuvLLMConfig;
///
/// let config = RuvLLMConfig {
///     storage_path: "/var/ruvllm".to_string(),
///     max_sessions: 500,
///     embedding_dim: 1024,
///     ..Default::default()
/// };
/// ```
///
/// # Performance Tuning
///
/// The "Default" column matches the `Default` implementation of this type.
///
/// | Parameter | Default | High Throughput | Low Latency |
/// |-----------|---------|-----------------|-------------|
/// | `max_sessions` | 1000 | 2000 | 500 |
/// | `embedding_dim` | 768 | 1024 | 512 |
#[derive(Debug, Clone)]
pub struct RuvLLMConfig {
    /// Path to Ruvector storage.
    ///
    /// `RuvLLMEngine::new` appends `/policies`, `/sessions` and `/witness`
    /// to this string to obtain the path handed to each store.
    pub storage_path: String,
    /// Paged attention configuration.
    ///
    /// NOTE(review): not read by `RuvLLMEngine::new` in this file —
    /// presumably consumed by another component; confirm.
    pub paged_attention: PagedAttentionConfig,
    /// KV cache configuration.
    ///
    /// NOTE(review): not read by `RuvLLMEngine::new` in this file —
    /// presumably consumed by another component; confirm.
    pub kv_cache: KvCacheConfig,
    /// Session configuration; a clone is passed to `SessionManager::new`.
    pub session: SessionConfig,
    /// SONA learning configuration; a clone is passed to `SonaIntegration::new`.
    pub sona: SonaConfig,
    /// Maximum concurrent sessions.
    ///
    /// NOTE(review): no code in this file enforces this limit — verify where
    /// (or whether) it is applied.
    pub max_sessions: usize,
    /// Embedding dimension for semantic search.
    ///
    /// The same value is passed to the policy store, the session index and
    /// the witness log when the engine is constructed.
    pub embedding_dim: usize,
}
636
637impl Default for RuvLLMConfig {
638    fn default() -> Self {
639        Self {
640            storage_path: ".ruvllm".to_string(),
641            paged_attention: PagedAttentionConfig::default(),
642            kv_cache: KvCacheConfig::default(),
643            session: SessionConfig::default(),
644            sona: SonaConfig::default(),
645            max_sessions: 1000,
646            embedding_dim: 768,
647        }
648    }
649}
650
/// Main RuvLLM engine for LLM inference with intelligent memory.
///
/// The `RuvLLMEngine` is the primary entry point for RuvLLM, providing:
///
/// - **Session Management**: Create and manage user sessions with state persistence
/// - **Policy Storage**: Ruvector-backed semantic search for runtime policies
/// - **Adapter Management**: Hot-swapping LoRA adapters for task-specific tuning
/// - **Witness Logging**: Audit trail with HNSW-indexed semantic search
/// - **SONA Learning**: Three-tier continuous learning integration
///
/// The engine owns one instance of each subsystem. All of them are created
/// by [`RuvLLMEngine::new`]; the fields are private and reached through the
/// engine's methods.
///
/// # Example
///
/// ```rust,ignore
/// use ruvllm::{RuvLLMEngine, RuvLLMConfig};
///
/// // Create engine with configuration
/// let config = RuvLLMConfig::default();
/// let engine = RuvLLMEngine::new(config)?;
///
/// // Create a session for a user
/// let session = engine.create_session(Some("user-123"))?;
///
/// // Search for relevant policies
/// let embedding = compute_embedding("code completion task");
/// let policies = engine.search_policies(&embedding, 5)?;
///
/// // Record audit entry
/// let entry = WitnessEntry::new("completion", latency, routing);
/// engine.record_witness(entry)?;
/// ```
///
/// # Architecture
///
/// ```text
/// +-------------------+     +-------------------+
/// | RuvLLMEngine      |---->| PolicyStore       |
/// |                   |     | (Ruvector)        |
/// |                   |     +-------------------+
/// |                   |
/// |                   |     +-------------------+
/// |                   |---->| SessionIndex      |
/// |                   |     | (Ruvector)        |
/// |                   |     +-------------------+
/// |                   |
/// |                   |     +-------------------+
/// |                   |---->| WitnessLog        |
/// |                   |     | (HNSW search)     |
/// +-------------------+     +-------------------+
/// ```
pub struct RuvLLMEngine {
    /// Configuration the engine was built from (moved in by `new`).
    config: RuvLLMConfig,
    /// Policy store backed by Ruvector; opened at `<storage_path>/policies`.
    policy_store: PolicyStore,
    /// Session manager, built from a clone of `config.session`.
    session_manager: SessionManager,
    /// Session index backed by Ruvector; opened at `<storage_path>/sessions`.
    session_index: SessionIndex,
    /// Adapter manager, created with `AdapterManager::new()`.
    adapter_manager: AdapterManager,
    /// Witness log for audit; opened at `<storage_path>/witness`.
    witness_log: WitnessLog,
    /// SONA learning integration, built from a clone of `config.sona`.
    sona: SonaIntegration,
}
714
715impl RuvLLMEngine {
716    /// Create a new RuvLLM engine with the given configuration.
717    ///
718    /// This initializes all subsystems including:
719    /// - Policy store for learned thresholds
720    /// - Session index for conversation state
721    /// - Witness log for audit trails
722    /// - SONA integration for learning loops
723    ///
724    /// # Arguments
725    ///
726    /// * `config` - Engine configuration
727    ///
728    /// # Errors
729    ///
730    /// Returns an error if storage paths cannot be created or initialized.
731    ///
732    /// # Example
733    ///
734    /// ```rust,ignore
735    /// use ruvllm::{RuvLLMEngine, RuvLLMConfig};
736    ///
737    /// let engine = RuvLLMEngine::new(RuvLLMConfig::default())?;
738    /// ```
739    pub fn new(config: RuvLLMConfig) -> Result<Self> {
740        let storage_path = &config.storage_path;
741
742        let policy_store =
743            PolicyStore::new(&format!("{}/policies", storage_path), config.embedding_dim)?;
744
745        let session_index =
746            SessionIndex::new(&format!("{}/sessions", storage_path), config.embedding_dim)?;
747
748        let witness_log =
749            WitnessLog::new(&format!("{}/witness", storage_path), config.embedding_dim)?;
750
751        let session_manager = SessionManager::new(config.session.clone());
752        let adapter_manager = AdapterManager::new();
753        let sona = SonaIntegration::new(config.sona.clone());
754
755        Ok(Self {
756            config,
757            policy_store,
758            session_manager,
759            session_index,
760            adapter_manager,
761            witness_log,
762            sona,
763        })
764    }
765
766    /// Create a new session for a user.
767    ///
768    /// Sessions track conversation state, KV cache references, and enable
769    /// multi-turn interactions. Each session is automatically indexed in
770    /// Ruvector for semantic retrieval.
771    ///
772    /// # Arguments
773    ///
774    /// * `user_id` - Optional user identifier for session tracking
775    ///
776    /// # Returns
777    ///
778    /// A new `Session` instance with a unique ID.
779    ///
780    /// # Example
781    ///
782    /// ```rust,ignore
783    /// // Anonymous session
784    /// let session = engine.create_session(None)?;
785    ///
786    /// // User-identified session
787    /// let session = engine.create_session(Some("user-123"))?;
788    /// println!("Session ID: {}", session.id());
789    /// ```
790    pub fn create_session(&self, user_id: Option<&str>) -> Result<Session> {
791        let session = self.session_manager.create_session(user_id)?;
792
793        // Index the session in Ruvector
794        let state = SessionState::from_session(&session);
795        self.session_index.store(&state)?;
796
797        Ok(session)
798    }
799
800    /// Get session by ID
801    pub fn get_session(&self, session_id: &str) -> Result<Option<Session>> {
802        self.session_manager.get_session(session_id)
803    }
804
805    /// Search for policies matching the given context embedding.
806    ///
807    /// Uses HNSW-indexed semantic search to find relevant policies
808    /// (quantization settings, routing rules, etc.) based on the
809    /// current request context.
810    ///
811    /// # Arguments
812    ///
813    /// * `context_embedding` - Vector embedding of the current context
814    /// * `limit` - Maximum number of policies to return
815    ///
816    /// # Returns
817    ///
818    /// Vector of matching `PolicyEntry` items, sorted by relevance.
819    ///
820    /// # Example
821    ///
822    /// ```rust,ignore
823    /// let context = compute_embedding("code completion for Python");
824    /// let policies = engine.search_policies(&context, 5)?;
825    ///
826    /// for policy in policies {
827    ///     println!("Policy: {:?}, score: {}", policy.policy_type, policy.score);
828    /// }
829    /// ```
830    pub fn search_policies(
831        &self,
832        context_embedding: &[f32],
833        limit: usize,
834    ) -> Result<Vec<PolicyEntry>> {
835        self.policy_store.search(context_embedding, limit)
836    }
837
838    /// Record a witness entry for audit logging.
839    ///
840    /// Witness entries provide an audit trail of inference decisions,
841    /// including latency breakdowns, routing decisions, and quality scores.
842    /// All entries are HNSW-indexed for semantic search.
843    ///
844    /// # Arguments
845    ///
846    /// * `entry` - The witness entry to record
847    ///
848    /// # Example
849    ///
850    /// ```rust,ignore
851    /// use ruvllm::{WitnessEntry, LatencyBreakdown, RoutingDecision};
852    ///
853    /// let entry = WitnessEntry {
854    ///     session_id: session.id().to_string(),
855    ///     request_type: "completion".to_string(),
856    ///     latency: LatencyBreakdown {
857    ///         prefill_ms: 45.0,
858    ///         decode_ms: 120.0,
859    ///         total_ms: 165.0,
860    ///     },
861    ///     routing: RoutingDecision::default(),
862    ///     ..Default::default()
863    /// };
864    ///
865    /// engine.record_witness(entry)?;
866    /// ```
867    pub fn record_witness(&self, entry: WitnessEntry) -> Result<()> {
868        self.witness_log.record(entry)
869    }
870
871    /// Search witness logs semantically
872    pub fn search_witness(
873        &self,
874        query_embedding: &[f32],
875        limit: usize,
876    ) -> Result<Vec<WitnessEntry>> {
877        self.witness_log.search(query_embedding, limit)
878    }
879
880    /// Get the SONA integration for learning
881    pub fn sona(&self) -> &SonaIntegration {
882        &self.sona
883    }
884
885    /// Get the adapter manager
886    pub fn adapters(&self) -> &AdapterManager {
887        &self.adapter_manager
888    }
889
890    /// Get the policy store
891    pub fn policies(&self) -> &PolicyStore {
892        &self.policy_store
893    }
894}