Skip to main content

batuta/oracle/rag/
mod.rs

1//! RAG Oracle - Intelligent retrieval-augmented generation for Sovereign AI Stack
2//!
3//! Implements the APR-Powered RAG Oracle specification with:
4//! - Content-addressable indexing (BLAKE3)
5//! - Hybrid retrieval (BM25 + dense)
6//! - Heijunka load-leveled reindexing
7//! - Jidoka stop-on-error validation
8//!
9//! # Toyota Production System Principles
10//!
11//! - **Jidoka**: Stop-on-error during indexing
12//! - **Poka-Yoke**: Content hashing prevents stale indexes
13//! - **Heijunka**: Load-leveled incremental reindexing
14//! - **Kaizen**: Continuous embedding improvement
15//! - **Genchi Genbutsu**: Direct observation of source docs
16//! - **Muda**: Delta-only updates eliminate waste
17
// Not every re-export below is used inside the crate yet, so each `pub use`
// carries `#[allow(unused_imports)]`; full integration will use all exported types.
20pub mod binary_index;
21mod chunker;
22mod falsification;
23pub mod fingerprint;
24mod indexer;
25pub mod persistence;
26pub mod profiling;
27pub mod quantization;
28pub mod query_cache;
29mod retriever;
30pub mod tui;
31mod types;
32mod validator;
33
34// Binary index exports
35#[allow(unused_imports)]
36pub use binary_index::{
37    BinaryIndexError, BinaryIndexReader, BinaryIndexWriter, DocumentEntry, IndexHeader, Posting,
38    MAGIC, VERSION,
39};
40#[allow(unused_imports)]
41pub use chunker::SemanticChunker;
42#[allow(unused_imports)]
43pub use fingerprint::{blake3_hash, ChunkerConfig, DocumentFingerprint};
44#[allow(unused_imports)]
45pub use indexer::HeijunkaReindexer;
46// Profiling exports
47#[allow(unused_imports)]
48pub use profiling::{
49    get_summary, record_cache_hit, record_cache_miss, record_query_latency, reset_metrics, span,
50    Counter, Histogram, HistogramBucket, MetricsSummary, RagMetrics, SpanStats, TimedSpan,
51    GLOBAL_METRICS,
52};
53// Query cache exports
54#[allow(unused_imports)]
55pub use query_cache::{CacheStats, CachedPlan, QueryPlanCache};
56// Scalar Int8 Rescoring exports (specification implementation)
57#[allow(unused_imports)]
58pub use quantization::{
59    CalibrationStats, QuantizationError, QuantizationParams, QuantizedEmbedding, RescoreResult,
60    RescoreRetriever, RescoreRetrieverConfig, SimdBackend,
61};
62#[allow(unused_imports)]
63pub use retriever::{HybridRetriever, InvertedIndex};
64#[allow(unused_imports)]
65pub use types::RetrievalResult;
66#[allow(unused_imports)]
67pub use types::*;
68#[allow(unused_imports)]
69pub use validator::JidokaIndexValidator;
70
71use serde::{Deserialize, Serialize};
72use std::collections::HashMap;
73use std::path::PathBuf;
74
75/// RAG Oracle - Main interface for stack documentation queries
76///
77/// Dogfoods the Sovereign AI Stack:
78/// - `trueno-rag` for chunking and retrieval
79/// - `trueno-db` for vector storage
80/// - `aprender` for embeddings (.apr format)
81/// - `simular` for deterministic testing
82#[derive(Debug)]
83pub struct RagOracle {
84    /// Document index with fingerprints
85    index: DocumentIndex,
86    /// Hybrid retriever (BM25 + dense)
87    retriever: HybridRetriever,
88    /// Jidoka validator
89    validator: JidokaIndexValidator,
90    /// Configuration
91    config: RagOracleConfig,
92}
93
94/// RAG Oracle configuration
95#[derive(Debug, Clone, Serialize, Deserialize)]
96pub struct RagOracleConfig {
97    /// Stack component repositories to index
98    pub repositories: Vec<PathBuf>,
99    /// Document sources to include
100    pub sources: Vec<DocumentSource>,
101    /// Chunk size in tokens
102    pub chunk_size: usize,
103    /// Chunk overlap in tokens
104    pub chunk_overlap: usize,
105    /// Number of results to return
106    pub top_k: usize,
107    /// Reranking depth
108    pub rerank_depth: usize,
109}
110
111impl Default for RagOracleConfig {
112    fn default() -> Self {
113        Self {
114            repositories: vec![],
115            sources: vec![
116                DocumentSource::ClaudeMd,
117                DocumentSource::ReadmeMd,
118                DocumentSource::CargoToml,
119                DocumentSource::DocsDir,
120            ],
121            chunk_size: 512,
122            chunk_overlap: 64,
123            top_k: 5,
124            rerank_depth: 20,
125        }
126    }
127}
128
129/// Document source types with priority (Genchi Genbutsu)
130#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
131pub enum DocumentSource {
132    /// CLAUDE.md - P0 Critical, indexed on every commit
133    ClaudeMd,
134    /// README.md - P1 High, indexed on release
135    ReadmeMd,
136    /// Cargo.toml - P1 High, indexed on version bump
137    CargoToml,
138    /// pyproject.toml - P1 High, Python project metadata
139    PyProjectToml,
140    /// docs/*.md - P2 Medium, weekly scan
141    DocsDir,
142    /// examples/*.rs - P3 Low, monthly scan
143    ExamplesDir,
144    /// Docstrings - P3 Low, on release
145    Docstrings,
146    /// Python source files - P2 Medium, for ground truth corpora
147    PythonSource,
148    /// Python test files - P3 Low, for ground truth validation
149    PythonTests,
150}
151impl DocumentSource {
152    /// Get priority level (0 = highest)
153    pub fn priority(&self) -> u8 {
154        match self {
155            Self::ClaudeMd => 0,
156            Self::ReadmeMd | Self::CargoToml | Self::PyProjectToml => 1,
157            Self::DocsDir | Self::PythonSource => 2,
158            Self::ExamplesDir | Self::Docstrings | Self::PythonTests => 3,
159        }
160    }
161
162    /// Get glob pattern for this source
163    pub fn glob_pattern(&self) -> &'static str {
164        match self {
165            Self::ClaudeMd => "CLAUDE.md",
166            Self::ReadmeMd => "README.md",
167            Self::CargoToml => "Cargo.toml",
168            Self::PyProjectToml => "pyproject.toml",
169            Self::DocsDir => "docs/**/*.md",
170            Self::ExamplesDir => "examples/**/*.rs",
171            Self::Docstrings => "src/**/*.rs",
172            Self::PythonSource => "src/**/*.py",
173            Self::PythonTests => "tests/**/*.py",
174        }
175    }
176}
177
178/// Document index containing all indexed documents
179#[derive(Debug, Default, Clone, Serialize, Deserialize)]
180pub struct DocumentIndex {
181    /// Documents by ID
182    documents: HashMap<String, IndexedDocument>,
183    /// Fingerprints for change detection
184    fingerprints: HashMap<String, DocumentFingerprint>,
185    /// Total chunks indexed
186    total_chunks: usize,
187}
188
189/// An indexed document with chunks
190#[derive(Debug, Clone, Serialize, Deserialize)]
191pub struct IndexedDocument {
192    /// Unique document ID
193    pub id: String,
194    /// Source component (e.g., "trueno", "aprender")
195    pub component: String,
196    /// Source file path
197    pub path: PathBuf,
198    /// Document source type
199    pub source_type: DocumentSource,
200    /// Document chunks
201    pub chunks: Vec<DocumentChunk>,
202}
203
204/// A chunk of a document
205#[derive(Debug, Clone, Serialize, Deserialize)]
206pub struct DocumentChunk {
207    /// Chunk ID (document_id + chunk_index)
208    pub id: String,
209    /// Chunk content
210    pub content: String,
211    /// Start line in source document
212    pub start_line: usize,
213    /// End line in source document
214    pub end_line: usize,
215    /// Content hash for deduplication
216    pub content_hash: [u8; 32],
217}
218impl RagOracle {
219    /// Create a new RAG Oracle with default configuration
220    pub fn new() -> Self {
221        Self::with_config(RagOracleConfig::default())
222    }
223
224    /// Create a new RAG Oracle with custom configuration
225    pub fn with_config(config: RagOracleConfig) -> Self {
226        Self {
227            index: DocumentIndex::default(),
228            retriever: HybridRetriever::new(),
229            validator: JidokaIndexValidator::new(384), // 384-dim embeddings
230            config,
231        }
232    }
233
234    /// Query the oracle with natural language
235    pub fn query(&self, query: &str) -> Vec<RetrievalResult> {
236        self.retriever.retrieve(query, &self.index, self.config.top_k)
237    }
238
239    /// Get index statistics
240    pub fn stats(&self) -> IndexStats {
241        IndexStats {
242            total_documents: self.index.documents.len(),
243            total_chunks: self.index.total_chunks,
244            components: self
245                .index
246                .documents
247                .values()
248                .map(|d| d.component.clone())
249                .collect::<std::collections::HashSet<_>>()
250                .len(),
251        }
252    }
253
254    /// Check if a document needs reindexing (Poka-Yoke)
255    pub fn needs_reindex(&self, doc_id: &str, current_hash: [u8; 32]) -> bool {
256        self.index
257            .fingerprints
258            .get(doc_id)
259            .map(|fp| fp.content_hash != current_hash)
260            .unwrap_or(true)
261    }
262}
263
264impl Default for RagOracle {
265    fn default() -> Self {
266        Self::new()
267    }
268}
269
/// Summary counts describing an index.
#[derive(Debug, Clone)]
pub struct IndexStats {
    /// How many documents are in the index.
    pub total_documents: usize,
    /// How many chunks are in the index.
    pub total_chunks: usize,
    /// How many distinct components the indexed documents belong to.
    pub components: usize,
}
280
#[cfg(test)]
mod tests {
    use super::*;

    /// All nine `DocumentSource` variants, shared by the tests that need to
    /// pick an arbitrary source.
    const ALL_SOURCES: [DocumentSource; 9] = [
        DocumentSource::ClaudeMd,
        DocumentSource::ReadmeMd,
        DocumentSource::CargoToml,
        DocumentSource::PyProjectToml,
        DocumentSource::DocsDir,
        DocumentSource::ExamplesDir,
        DocumentSource::Docstrings,
        DocumentSource::PythonSource,
        DocumentSource::PythonTests,
    ];

    /// A freshly created oracle reports no documents and no chunks.
    #[test]
    fn test_rag_oracle_creation() {
        let oracle = RagOracle::new();
        let stats = oracle.stats();
        assert_eq!(stats.total_documents, 0);
        assert_eq!(stats.total_chunks, 0);
    }

    /// `Default` yields an empty oracle as well.
    #[test]
    fn test_rag_oracle_default() {
        let oracle = RagOracle::default();
        let stats = oracle.stats();
        assert_eq!(stats.total_documents, 0);
        assert_eq!(stats.components, 0);
    }

    /// A custom configuration still starts with an empty index.
    #[test]
    fn test_rag_oracle_with_config() {
        let config = RagOracleConfig {
            repositories: vec![PathBuf::from("/test")],
            sources: vec![DocumentSource::ClaudeMd],
            chunk_size: 256,
            chunk_overlap: 32,
            top_k: 10,
            rerank_depth: 50,
        };
        let oracle = RagOracle::with_config(config);
        let stats = oracle.stats();
        assert_eq!(stats.total_documents, 0);
    }

    /// Querying an empty index returns no results.
    #[test]
    fn test_rag_oracle_query_empty_index() {
        let oracle = RagOracle::new();
        let results = oracle.query("test query");
        assert!(results.is_empty());
    }

    /// Each source maps to its documented priority tier.
    #[test]
    fn test_document_source_priority() {
        assert_eq!(DocumentSource::ClaudeMd.priority(), 0);
        assert_eq!(DocumentSource::ReadmeMd.priority(), 1);
        assert_eq!(DocumentSource::CargoToml.priority(), 1);
        assert_eq!(DocumentSource::PyProjectToml.priority(), 1);
        assert_eq!(DocumentSource::DocsDir.priority(), 2);
        assert_eq!(DocumentSource::PythonSource.priority(), 2);
        assert_eq!(DocumentSource::ExamplesDir.priority(), 3);
        assert_eq!(DocumentSource::Docstrings.priority(), 3);
        assert_eq!(DocumentSource::PythonTests.priority(), 3);
    }

    /// Each source maps to its expected glob pattern.
    #[test]
    fn test_document_source_glob_patterns() {
        assert_eq!(DocumentSource::ClaudeMd.glob_pattern(), "CLAUDE.md");
        assert_eq!(DocumentSource::ReadmeMd.glob_pattern(), "README.md");
        assert_eq!(DocumentSource::CargoToml.glob_pattern(), "Cargo.toml");
        assert_eq!(DocumentSource::PyProjectToml.glob_pattern(), "pyproject.toml");
        assert_eq!(DocumentSource::DocsDir.glob_pattern(), "docs/**/*.md");
        assert_eq!(DocumentSource::ExamplesDir.glob_pattern(), "examples/**/*.rs");
        assert_eq!(DocumentSource::Docstrings.glob_pattern(), "src/**/*.rs");
        assert_eq!(DocumentSource::PythonSource.glob_pattern(), "src/**/*.py");
        assert_eq!(DocumentSource::PythonTests.glob_pattern(), "tests/**/*.py");
    }

    /// The default configuration carries the expected scalar values.
    #[test]
    fn test_config_defaults() {
        let config = RagOracleConfig::default();
        assert_eq!(config.chunk_size, 512);
        assert_eq!(config.chunk_overlap, 64);
        assert_eq!(config.top_k, 5);
        assert_eq!(config.rerank_depth, 20);
        assert!(config.repositories.is_empty());
        assert!(!config.sources.is_empty());
    }

    /// The default configuration includes the four stock document sources.
    #[test]
    fn test_config_default_sources() {
        let config = RagOracleConfig::default();
        assert!(config.sources.contains(&DocumentSource::ClaudeMd));
        assert!(config.sources.contains(&DocumentSource::ReadmeMd));
        assert!(config.sources.contains(&DocumentSource::CargoToml));
        assert!(config.sources.contains(&DocumentSource::DocsDir));
    }

    /// A document with no stored fingerprint always needs indexing.
    #[test]
    fn test_needs_reindex_new_document() {
        let oracle = RagOracle::new();
        let hash = [0u8; 32];
        assert!(oracle.needs_reindex("new_doc", hash));
    }

    /// A default index is completely empty.
    #[test]
    fn test_document_index_default() {
        let index = DocumentIndex::default();
        assert!(index.documents.is_empty());
        assert!(index.fingerprints.is_empty());
        assert_eq!(index.total_chunks, 0);
    }

    /// An empty oracle reports zero components.
    #[test]
    fn test_index_stats_components() {
        let oracle = RagOracle::new();
        let stats = oracle.stats();
        assert_eq!(stats.components, 0);
    }

    // Property-based tests for RAG Oracle
    mod proptests {
        use super::*;
        use proptest::prelude::*;

        proptest! {
            #![proptest_config(ProptestConfig::with_cases(50))]

            /// Property: Oracle always returns empty results for empty index
            #[test]
            fn prop_empty_oracle_returns_empty(query in "[a-z ]{1,100}") {
                let oracle = RagOracle::new();
                let results = oracle.query(&query);
                prop_assert!(results.is_empty());
            }

            /// Property: Config chunk_overlap is always less than chunk_size
            #[test]
            fn prop_config_overlap_less_than_size(
                chunk_size in 64usize..1024,
                overlap_factor in 0.0f64..0.5
            ) {
                let overlap = (chunk_size as f64 * overlap_factor) as usize;
                let config = RagOracleConfig {
                    chunk_size,
                    chunk_overlap: overlap,
                    ..Default::default()
                };
                // The factor is below 0.5 and the size is at least 64, so the
                // overlap is strictly smaller than the chunk size.
                prop_assert!(config.chunk_overlap < config.chunk_size);
            }

            /// Property: needs_reindex always returns true for new documents
            #[test]
            fn prop_needs_reindex_new_doc(doc_id in "[a-z]{3,20}", hash in prop::array::uniform32(0u8..)) {
                let oracle = RagOracle::new();
                prop_assert!(oracle.needs_reindex(&doc_id, hash));
            }

            /// Property: Document source priorities are valid (0-3)
            #[test]
            fn prop_source_priority_valid(source_idx in 0usize..ALL_SOURCES.len()) {
                let source = ALL_SOURCES[source_idx];
                prop_assert!(source.priority() <= 3);
            }

            /// Property: Glob patterns are non-empty
            #[test]
            fn prop_glob_pattern_nonempty(source_idx in 0usize..ALL_SOURCES.len()) {
                let source = ALL_SOURCES[source_idx];
                prop_assert!(!source.glob_pattern().is_empty());
            }

            /// Property: Stats are consistent
            #[test]
            fn prop_stats_consistent(_seed in 0u64..1000) {
                let oracle = RagOracle::new();
                let stats = oracle.stats();
                // Empty oracle should have all zeros
                prop_assert_eq!(stats.total_documents, 0);
                prop_assert_eq!(stats.total_chunks, 0);
                prop_assert_eq!(stats.components, 0);
            }
        }
    }
}