// trueno_rag/sqlite/mod.rs
1//! SQLite+FTS5 persistent storage backend for RAG indices.
2//!
3//! Provides `SqliteIndex` (implements `SparseIndex`) and `SqliteStore`
4//! (convenience wrapper for document + chunk persistence).
5//!
6//! This module replaces in-memory HashMap-based indices with SQLite-backed
7//! storage using FTS5 for BM25 ranking (Robertson & Zaragoza, 2009).
8//!
9//! # Performance Contract
10//!
11//! Median search latency: 10–50 ms on a 5000+ document corpus with warm
12//! page cache (see sqlite-rag-integration spec, Section 3.1).
13
14pub mod fts;
15pub mod schema;
16
17use crate::index::SparseIndex;
18use crate::{Chunk, ChunkId, Document, Result};
19use rusqlite::Connection;
20use std::path::Path;
21use std::sync::Mutex;
22
/// SQLite-backed sparse index using FTS5 for BM25 search.
///
/// Unlike `BM25Index` (in-memory HashMap), this persists to disk and
/// delegates BM25 scoring to SQLite's FTS5 extension.
///
/// The `Connection` is wrapped in a `Mutex` to satisfy the `Send + Sync`
/// bounds required by `SparseIndex`. `Mutex<T>` is `Sync` when `T: Send`,
/// and `rusqlite::Connection` is `Send`. SQLite in WAL mode supports
/// concurrent readers via separate connections; this single-connection
/// design serializes access within one process.
pub struct SqliteIndex {
    /// Sole connection to the database; every operation locks this mutex.
    conn: Mutex<Connection>,
}
36
37// Mutex<Connection> is automatically Send+Sync because Connection: Send.
38// No unsafe impl needed.
39
40impl std::fmt::Debug for SqliteIndex {
41    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
42        f.debug_struct("SqliteIndex").finish_non_exhaustive()
43    }
44}
45
46/// Helper to map mutex poison errors.
47fn lock_err<T>(e: &std::sync::PoisonError<T>) -> crate::Error {
48    crate::Error::Query(format!("Mutex poisoned: {e}"))
49}
50
51impl SqliteIndex {
52    /// Open or create an index at the given path.
53    pub fn open(path: impl AsRef<Path>) -> Result<Self> {
54        let conn = Connection::open(path.as_ref())
55            .map_err(|e| crate::Error::Query(format!("Failed to open SQLite database: {e}")))?;
56        schema::initialize(&conn)?;
57        Ok(Self { conn: Mutex::new(conn) })
58    }
59
60    /// Open an in-memory index (for testing).
61    pub fn open_in_memory() -> Result<Self> {
62        let conn = Connection::open_in_memory()
63            .map_err(|e| crate::Error::Query(format!("Failed to open in-memory database: {e}")))?;
64        schema::initialize(&conn)?;
65        Ok(Self { conn: Mutex::new(conn) })
66    }
67
68    /// Get document count.
69    pub fn document_count(&self) -> Result<usize> {
70        let conn = self.conn.lock().map_err(|e| lock_err(&e))?;
71        let count: i64 = conn
72            .query_row("SELECT COUNT(*) FROM documents", [], |r| r.get(0))
73            .map_err(|e| crate::Error::Query(format!("Failed to count documents: {e}")))?;
74        Ok(count as usize)
75    }
76
77    /// Get chunk count.
78    pub fn chunk_count(&self) -> Result<usize> {
79        let conn = self.conn.lock().map_err(|e| lock_err(&e))?;
80        let count: i64 = conn
81            .query_row("SELECT COUNT(*) FROM chunks", [], |r| r.get(0))
82            .map_err(|e| crate::Error::Query(format!("Failed to count chunks: {e}")))?;
83        Ok(count as usize)
84    }
85
86    /// Check if a document needs reindexing by fingerprint.
87    pub fn needs_reindex(&self, path: &str, hash: &[u8; 32]) -> Result<bool> {
88        let conn = self.conn.lock().map_err(|e| lock_err(&e))?;
89        let stored: Option<Vec<u8>> = conn
90            .query_row("SELECT blake3_hash FROM fingerprints WHERE doc_path = ?1", [path], |row| {
91                row.get(0)
92            })
93            .ok();
94
95        match stored {
96            Some(stored_hash) => Ok(stored_hash.as_slice() != hash),
97            None => Ok(true),
98        }
99    }
100
    /// Batch-insert a document and its chunks within a transaction.
    ///
    /// Any existing document with the same `doc_id` (and its chunks) is
    /// deleted first, so this doubles as an update. If `fingerprint` is
    /// given, the `(path, blake3 hash)` pair is upserted so later
    /// `needs_reindex` calls can detect changes.
    ///
    /// Chunk positions are assigned from the slice order (0, 1, 2, …).
    ///
    /// # Errors
    ///
    /// Returns `Error::Query` if the mutex is poisoned or any SQL
    /// statement fails; the transaction is not committed in that case.
    pub fn insert_document(
        &self,
        doc_id: &str,
        title: Option<&str>,
        source: Option<&str>,
        content: &str,
        chunks: &[(String, String)],
        fingerprint: Option<(&str, &[u8; 32])>,
    ) -> Result<()> {
        let mut conn = self.conn.lock().map_err(|e| lock_err(&e))?;
        let tx = conn
            .transaction()
            .map_err(|e| crate::Error::Query(format!("Failed to begin transaction: {e}")))?;

        // Delete old document's chunks first (fires FTS5 sync triggers),
        // then delete the document itself.
        tx.execute("DELETE FROM chunks WHERE doc_id = ?1", [doc_id])
            .map_err(|e| crate::Error::Query(format!("Failed to delete old chunks: {e}")))?;
        tx.execute("DELETE FROM documents WHERE id = ?1", [doc_id])
            .map_err(|e| crate::Error::Query(format!("Failed to delete old document: {e}")))?;

        tx.execute(
            "INSERT INTO documents (id, title, source, content, chunk_count) VALUES (?1, ?2, ?3, ?4, ?5)",
            rusqlite::params![doc_id, title, source, content, chunks.len() as i64],
        )
        .map_err(|e| crate::Error::Query(format!("Failed to insert document: {e}")))?;

        // Scope the cached statement so its borrow of `tx` ends before commit.
        {
            let mut stmt = tx
                .prepare_cached(
                    "INSERT OR REPLACE INTO chunks (id, doc_id, content, position) VALUES (?1, ?2, ?3, ?4)",
                )
                .map_err(|e| crate::Error::Query(format!("Failed to prepare chunk insert: {e}")))?;

            for (i, (chunk_id, chunk_content)) in chunks.iter().enumerate() {
                stmt.execute(rusqlite::params![chunk_id, doc_id, chunk_content, i as i64])
                    .map_err(|e| crate::Error::Query(format!("Failed to insert chunk: {e}")))?;
            }
        }

        // Upsert the fingerprint used for incremental reindex detection.
        if let Some((path, hash)) = fingerprint {
            tx.execute(
                "INSERT OR REPLACE INTO fingerprints (doc_path, blake3_hash, chunk_count) VALUES (?1, ?2, ?3)",
                rusqlite::params![path, hash.as_slice(), chunks.len() as i64],
            )
            .map_err(|e| crate::Error::Query(format!("Failed to update fingerprint: {e}")))?;
        }

        tx.commit()
            .map_err(|e| crate::Error::Query(format!("Failed to commit transaction: {e}")))?;

        Ok(())
    }
155
156    /// Remove a document and its chunks.
157    ///
158    /// Explicitly deletes chunks first (which fires FTS5 sync triggers),
159    /// then deletes the document row.
160    pub fn remove_document(&self, doc_id: &str) -> Result<()> {
161        let conn = self.conn.lock().map_err(|e| lock_err(&e))?;
162        conn.execute("DELETE FROM chunks WHERE doc_id = ?1", [doc_id])
163            .map_err(|e| crate::Error::Query(format!("Failed to delete chunks: {e}")))?;
164        conn.execute("DELETE FROM documents WHERE id = ?1", [doc_id])
165            .map_err(|e| crate::Error::Query(format!("Failed to remove document: {e}")))?;
166        Ok(())
167    }
168
169    /// List all tracked fingerprints (path → blake3 hash).
170    ///
171    /// Used by incremental indexing to detect deleted or changed files.
172    pub fn list_fingerprints(&self) -> Result<Vec<(String, Vec<u8>)>> {
173        let conn = self.conn.lock().map_err(|e| lock_err(&e))?;
174        let mut stmt = conn
175            .prepare("SELECT doc_path, blake3_hash FROM fingerprints")
176            .map_err(|e| crate::Error::Query(format!("Failed to list fingerprints: {e}")))?;
177        let rows = stmt
178            .query_map([], |row| {
179                let path: String = row.get(0)?;
180                let hash: Vec<u8> = row.get(1)?;
181                Ok((path, hash))
182            })
183            .map_err(|e| crate::Error::Query(format!("Failed to query fingerprints: {e}")))?;
184        let mut results = Vec::new();
185        for row in rows {
186            results.push(
187                row.map_err(|e| crate::Error::Query(format!("Failed to read fingerprint: {e}")))?,
188            );
189        }
190        Ok(results)
191    }
192
193    /// Remove all documents (and their chunks) with a given source path.
194    ///
195    /// Used by incremental indexing to remove stale documents before re-inserting.
196    pub fn remove_by_source(&self, source: &str) -> Result<usize> {
197        let conn = self.conn.lock().map_err(|e| lock_err(&e))?;
198        // Find doc IDs with this source
199        let mut stmt = conn
200            .prepare("SELECT id FROM documents WHERE source = ?1")
201            .map_err(|e| crate::Error::Query(format!("Failed to find docs by source: {e}")))?;
202        let ids: Vec<String> = stmt
203            .query_map([source], |row| row.get(0))
204            .map_err(|e| crate::Error::Query(format!("Failed to query docs: {e}")))?
205            .filter_map(|r| r.ok())
206            .collect();
207
208        for doc_id in &ids {
209            conn.execute("DELETE FROM chunks WHERE doc_id = ?1", [doc_id])
210                .map_err(|e| crate::Error::Query(format!("Failed to delete chunks: {e}")))?;
211            conn.execute("DELETE FROM documents WHERE id = ?1", [doc_id])
212                .map_err(|e| crate::Error::Query(format!("Failed to delete document: {e}")))?;
213        }
214
215        // Remove fingerprint
216        conn.execute("DELETE FROM fingerprints WHERE doc_path = ?1", [source])
217            .map_err(|e| crate::Error::Query(format!("Failed to delete fingerprint: {e}")))?;
218
219        Ok(ids.len())
220    }
221
222    /// FTS5 BM25 search. Returns results ordered by descending relevance.
223    pub fn search_fts(&self, query: &str, k: usize) -> Result<Vec<fts::FtsResult>> {
224        let conn = self.conn.lock().map_err(|e| lock_err(&e))?;
225        fts::search(&conn, query, k)
226    }
227
228    /// Get chunk content by ID.
229    pub fn get_chunk(&self, chunk_id: &str) -> Result<Option<String>> {
230        let conn = self.conn.lock().map_err(|e| lock_err(&e))?;
231        let content: Option<String> = conn
232            .query_row("SELECT content FROM chunks WHERE id = ?1", [chunk_id], |row| row.get(0))
233            .ok();
234        Ok(content)
235    }
236
237    /// Get a metadata value by key.
238    pub fn get_metadata(&self, key: &str) -> Result<Option<String>> {
239        let conn = self.conn.lock().map_err(|e| lock_err(&e))?;
240        let value: Option<String> = conn
241            .query_row("SELECT value FROM metadata WHERE key = ?1", [key], |row| row.get(0))
242            .ok();
243        Ok(value)
244    }
245
246    /// Set a metadata key-value pair.
247    pub fn set_metadata(&self, key: &str, value: &str) -> Result<()> {
248        let conn = self.conn.lock().map_err(|e| lock_err(&e))?;
249        conn.execute("INSERT OR REPLACE INTO metadata (key, value) VALUES (?1, ?2)", [key, value])
250            .map_err(|e| crate::Error::Query(format!("Failed to set metadata: {e}")))?;
251        Ok(())
252    }
253
254    /// Vacuum and optimize the database.
255    pub fn optimize(&self) -> Result<()> {
256        let conn = self.conn.lock().map_err(|e| lock_err(&e))?;
257        fts::optimize(&conn)?;
258        conn.execute_batch("VACUUM;")
259            .map_err(|e| crate::Error::Query(format!("VACUUM failed: {e}")))?;
260        Ok(())
261    }
262}
263
264impl SparseIndex for SqliteIndex {
265    fn add(&mut self, chunk: &Chunk) {
266        let doc_id = chunk.document_id.to_string();
267        let chunk_id = chunk.id.to_string();
268        if let Ok(conn) = self.conn.lock() {
269            let _ = conn.execute(
270                "INSERT OR IGNORE INTO documents (id, content) VALUES (?1, '')",
271                [&doc_id],
272            );
273            let _ = conn.execute(
274                "INSERT OR REPLACE INTO chunks (id, doc_id, content, position) VALUES (?1, ?2, ?3, 0)",
275                rusqlite::params![chunk_id, doc_id, chunk.content],
276            );
277        }
278    }
279
280    fn add_batch(&mut self, chunks: &[Chunk]) {
281        let Ok(mut conn) = self.conn.lock() else {
282            return;
283        };
284        let Ok(tx) = conn.transaction() else {
285            return;
286        };
287
288        // Track position per document for UNIQUE(doc_id, position)
289        let mut doc_positions: std::collections::HashMap<String, i64> =
290            std::collections::HashMap::new();
291
292        for chunk in chunks {
293            let doc_id = chunk.document_id.to_string();
294            let chunk_id = chunk.id.to_string();
295            let pos = doc_positions.entry(doc_id.clone()).or_insert(0);
296            let _ = tx.execute(
297                "INSERT OR IGNORE INTO documents (id, content) VALUES (?1, '')",
298                [&doc_id],
299            );
300            let _ = tx.execute(
301                "INSERT OR REPLACE INTO chunks (id, doc_id, content, position) VALUES (?1, ?2, ?3, ?4)",
302                rusqlite::params![chunk_id, doc_id, chunk.content, *pos],
303            );
304            *pos += 1;
305        }
306
307        let _ = tx.commit();
308    }
309
310    fn search(&self, query: &str, k: usize) -> Vec<(ChunkId, f32)> {
311        let Ok(conn) = self.conn.lock() else {
312            return Vec::new();
313        };
314        let Ok(results) = fts::search(&conn, query, k) else {
315            return Vec::new();
316        };
317
318        results
319            .into_iter()
320            .filter_map(|r| {
321                uuid::Uuid::parse_str(&r.chunk_id).ok().map(|uuid| (ChunkId(uuid), r.score as f32))
322            })
323            .collect()
324    }
325
326    fn remove(&mut self, chunk_id: ChunkId) {
327        let id_str = chunk_id.to_string();
328        if let Ok(conn) = self.conn.lock() {
329            let _ = conn.execute("DELETE FROM chunks WHERE id = ?1", [&id_str]);
330        }
331    }
332
333    fn len(&self) -> usize {
334        self.chunk_count().unwrap_or(0)
335    }
336}
337
338// --- SqliteStore: convenience wrapper ---
339
/// Statistics about the SQLite store.
///
/// Produced by `SqliteStore::stats`.
#[derive(Debug, Clone)]
pub struct StoreStats {
    /// Number of documents indexed.
    pub document_count: usize,
    /// Number of chunks indexed.
    pub chunk_count: usize,
    /// Number of fingerprints tracked.
    pub fingerprint_count: usize,
    /// Database file size in bytes (0 for in-memory).
    pub db_size_bytes: u64,
}
352
/// Combined document store + BM25 index backed by SQLite.
///
/// Replaces the pattern of `BM25Index` + `VectorStore` + JSON persistence
/// for users who want disk-backed RAG without managing separate components.
pub struct SqliteStore {
    /// Underlying FTS5-backed index that owns the connection.
    index: SqliteIndex,
    /// Database file path; `None` for in-memory stores.
    path: Option<std::path::PathBuf>,
}
361
362impl std::fmt::Debug for SqliteStore {
363    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
364        f.debug_struct("SqliteStore").field("path", &self.path).finish_non_exhaustive()
365    }
366}
367
368impl SqliteStore {
369    /// Open or create a store at the given path.
370    pub fn open(path: impl AsRef<Path>) -> Result<Self> {
371        let path = path.as_ref().to_path_buf();
372        let index = SqliteIndex::open(&path)?;
373        Ok(Self { index, path: Some(path) })
374    }
375
376    /// Open an in-memory store (for testing).
377    pub fn open_in_memory() -> Result<Self> {
378        let index = SqliteIndex::open_in_memory()?;
379        Ok(Self { index, path: None })
380    }
381
382    /// Index a document with its pre-chunked content.
383    pub fn index_document(
384        &self,
385        doc: &Document,
386        chunks: &[Chunk],
387        fingerprint: Option<(&str, &[u8; 32])>,
388    ) -> Result<()> {
389        let doc_id = doc.id.to_string();
390        let chunk_pairs: Vec<(String, String)> =
391            chunks.iter().map(|c| (c.id.to_string(), c.content.clone())).collect();
392
393        self.index.insert_document(
394            &doc_id,
395            doc.title.as_deref(),
396            doc.source.as_deref(),
397            &doc.content,
398            &chunk_pairs,
399            fingerprint,
400        )
401    }
402
403    /// Search with BM25 and return results.
404    ///
405    /// **Performance contract:** Median latency 10–50 ms (spec Section 3.1).
406    pub fn search(&self, query: &str, k: usize) -> Result<Vec<fts::FtsResult>> {
407        self.index.search_fts(query, k)
408    }
409
410    /// Check if a document needs reindexing by fingerprint.
411    pub fn needs_reindex(&self, path: &str, hash: &[u8; 32]) -> Result<bool> {
412        self.index.needs_reindex(path, hash)
413    }
414
415    /// List all tracked fingerprints.
416    pub fn list_fingerprints(&self) -> Result<Vec<(String, Vec<u8>)>> {
417        self.index.list_fingerprints()
418    }
419
420    /// Remove all documents with a given source path.
421    pub fn remove_by_source(&self, source: &str) -> Result<usize> {
422        self.index.remove_by_source(source)
423    }
424
425    /// Get store statistics.
426    pub fn stats(&self) -> Result<StoreStats> {
427        let db_size_bytes = self
428            .path
429            .as_ref()
430            .and_then(|p| std::fs::metadata(p).ok())
431            .map(|m| m.len())
432            .unwrap_or(0);
433
434        Ok(StoreStats {
435            document_count: self.index.document_count()?,
436            chunk_count: self.index.chunk_count()?,
437            fingerprint_count: self.fingerprint_count()?,
438            db_size_bytes,
439        })
440    }
441
442    /// Get the number of tracked fingerprints.
443    fn fingerprint_count(&self) -> Result<usize> {
444        let conn = self.index.conn.lock().map_err(|e| lock_err(&e))?;
445        let count: i64 = conn
446            .query_row("SELECT COUNT(*) FROM fingerprints", [], |r| r.get(0))
447            .map_err(|e| crate::Error::Query(format!("Failed to count fingerprints: {e}")))?;
448        Ok(count as usize)
449    }
450
451    /// Get/set metadata.
452    pub fn get_metadata(&self, key: &str) -> Result<Option<String>> {
453        self.index.get_metadata(key)
454    }
455
456    /// Set a metadata key-value pair.
457    pub fn set_metadata(&self, key: &str, value: &str) -> Result<()> {
458        self.index.set_metadata(key, value)
459    }
460
461    /// Optimize the database (VACUUM + FTS5 merge).
462    pub fn optimize(&self) -> Result<()> {
463        self.index.optimize()
464    }
465
466    /// Get a reference to the underlying SqliteIndex.
467    pub fn as_index(&self) -> &SqliteIndex {
468        &self.index
469    }
470}
471
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{Document, DocumentId};

    /// Build a minimal document with the given content.
    fn make_doc(content: &str) -> Document {
        Document::new(content)
    }

    /// Build a chunk belonging to `doc_id` with default metadata, no
    /// embedding, and offsets spanning the whole content string.
    fn make_chunk(doc_id: DocumentId, content: &str) -> Chunk {
        Chunk {
            id: ChunkId::new(),
            document_id: doc_id,
            content: content.to_string(),
            start_offset: 0,
            end_offset: content.len(),
            metadata: crate::ChunkMetadata::default(),
            embedding: None,
        }
    }

    // --- SqliteIndex tests ---

    #[test]
    fn test_index_roundtrip() {
        let idx = SqliteIndex::open_in_memory().unwrap();
        idx.insert_document(
            "doc1",
            Some("Test Doc"),
            Some("/test.md"),
            "full content here",
            &[
                ("c1".into(), "SIMD vector operations".into()),
                ("c2".into(), "GPU kernel dispatch".into()),
            ],
            None,
        )
        .unwrap();

        assert_eq!(idx.document_count().unwrap(), 1);
        assert_eq!(idx.chunk_count().unwrap(), 2);

        // Chunk content must round-trip exactly through SQLite.
        let content = idx.get_chunk("c1").unwrap();
        assert_eq!(content.unwrap(), "SIMD vector operations");
    }

    #[test]
    fn test_index_search() {
        let idx = SqliteIndex::open_in_memory().unwrap();
        idx.insert_document(
            "doc1",
            None,
            None,
            "",
            &[
                ("c1".into(), "machine learning algorithms for classification".into()),
                ("c2".into(), "database indexing and query optimization".into()),
            ],
            None,
        )
        .unwrap();

        // Only the chunk containing both query terms should match.
        let results = idx.search_fts("machine learning", 10).unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].chunk_id, "c1");
    }

    #[test]
    fn test_index_fingerprint_reindex() {
        let idx = SqliteIndex::open_in_memory().unwrap();
        let hash1 = [1u8; 32];
        let hash2 = [2u8; 32];

        // First insert with fingerprint
        idx.insert_document(
            "doc1",
            None,
            None,
            "",
            &[("c1".into(), "content".into())],
            Some(("/test.md", &hash1)),
        )
        .unwrap();

        // Same hash should not need reindex
        assert!(!idx.needs_reindex("/test.md", &hash1).unwrap());

        // Different hash should need reindex
        assert!(idx.needs_reindex("/test.md", &hash2).unwrap());

        // Unknown path should need reindex
        assert!(idx.needs_reindex("/unknown.md", &hash1).unwrap());
    }

    #[test]
    fn test_index_remove_document() {
        let idx = SqliteIndex::open_in_memory().unwrap();
        idx.insert_document("doc1", None, None, "", &[("c1".into(), "some content".into())], None)
            .unwrap();

        assert_eq!(idx.document_count().unwrap(), 1);
        idx.remove_document("doc1").unwrap();
        // Removing the document must also remove its chunks.
        assert_eq!(idx.document_count().unwrap(), 0);
        assert_eq!(idx.chunk_count().unwrap(), 0);
    }

    #[test]
    fn test_index_metadata() {
        let idx = SqliteIndex::open_in_memory().unwrap();
        idx.set_metadata("version", "1.0.0").unwrap();
        assert_eq!(idx.get_metadata("version").unwrap(), Some("1.0.0".to_string()));
        assert_eq!(idx.get_metadata("nonexistent").unwrap(), None);
    }

    #[test]
    fn test_index_update_document() {
        let idx = SqliteIndex::open_in_memory().unwrap();
        idx.insert_document("doc1", None, None, "", &[("c1".into(), "old content".into())], None)
            .unwrap();
        idx.insert_document("doc1", None, None, "", &[("c2".into(), "new content".into())], None)
            .unwrap();

        // Old chunk should be gone, new chunk present
        assert_eq!(idx.chunk_count().unwrap(), 1);
        assert!(idx.get_chunk("c1").unwrap().is_none());
        assert_eq!(idx.get_chunk("c2").unwrap().unwrap(), "new content");
    }

    // --- SparseIndex trait tests ---

    #[test]
    fn test_sparse_index_add_and_len() {
        let mut idx = SqliteIndex::open_in_memory().unwrap();
        let doc_id = DocumentId::new();
        let chunk = make_chunk(doc_id, "sparse index test content");
        idx.add(&chunk);
        assert_eq!(idx.len(), 1);
    }

    #[test]
    fn test_sparse_index_add_batch() {
        let mut idx = SqliteIndex::open_in_memory().unwrap();
        let doc_id = DocumentId::new();
        let chunks = vec![
            make_chunk(doc_id, "first chunk content"),
            make_chunk(doc_id, "second chunk content"),
        ];
        idx.add_batch(&chunks);
        assert_eq!(idx.len(), 2);
    }

    #[test]
    fn test_sparse_index_remove() {
        let mut idx = SqliteIndex::open_in_memory().unwrap();
        let doc_id = DocumentId::new();
        let chunk = make_chunk(doc_id, "content to remove");
        let chunk_id = chunk.id;
        idx.add(&chunk);
        assert_eq!(idx.len(), 1);
        idx.remove(chunk_id);
        assert_eq!(idx.len(), 0);
    }

    // --- SqliteStore tests ---

    #[test]
    fn test_store_index_and_search() {
        let store = SqliteStore::open_in_memory().unwrap();
        let doc = make_doc("SIMD vector operations for tensor computation");
        let chunks = vec![make_chunk(doc.id, "SIMD vector operations for tensor computation")];
        store.index_document(&doc, &chunks, None).unwrap();

        let results = store.search("SIMD tensor", 10).unwrap();
        assert!(!results.is_empty());
    }

    #[test]
    fn test_store_stats() {
        let store = SqliteStore::open_in_memory().unwrap();
        let doc = make_doc("content");
        let chunks = vec![make_chunk(doc.id, "chunk one"), make_chunk(doc.id, "chunk two")];
        store.index_document(&doc, &chunks, Some(("/test.md", &[0u8; 32]))).unwrap();

        let stats = store.stats().unwrap();
        assert_eq!(stats.document_count, 1);
        assert_eq!(stats.chunk_count, 2);
        assert_eq!(stats.fingerprint_count, 1);
    }

    #[test]
    fn test_store_needs_reindex() {
        let store = SqliteStore::open_in_memory().unwrap();
        let doc = make_doc("content");
        let chunks = vec![make_chunk(doc.id, "chunk")];
        let hash = [42u8; 32];
        store.index_document(&doc, &chunks, Some(("/doc.md", &hash))).unwrap();

        assert!(!store.needs_reindex("/doc.md", &hash).unwrap());
        assert!(store.needs_reindex("/doc.md", &[0u8; 32]).unwrap());
        assert!(store.needs_reindex("/other.md", &hash).unwrap());
    }

    #[test]
    fn test_store_metadata() {
        let store = SqliteStore::open_in_memory().unwrap();
        store.set_metadata("batuta_version", "0.6.0").unwrap();
        assert_eq!(store.get_metadata("batuta_version").unwrap(), Some("0.6.0".to_string()));
    }

    #[test]
    fn test_store_optimize() {
        let store = SqliteStore::open_in_memory().unwrap();
        let doc = make_doc("content");
        let chunks = vec![make_chunk(doc.id, "some chunk content")];
        store.index_document(&doc, &chunks, None).unwrap();
        store.optimize().unwrap(); // Should not panic
    }

    #[test]
    fn test_store_large_batch() {
        let store = SqliteStore::open_in_memory().unwrap();

        // Insert 100 documents with 5 chunks each
        for i in 0..100 {
            let doc = make_doc(&format!("Document {i} about machine learning"));
            let chunks: Vec<Chunk> = (0..5)
                .map(|j| {
                    make_chunk(
                        doc.id,
                        &format!("Chunk {j} of doc {i}: machine learning algorithms topic {j}"),
                    )
                })
                .collect();
            store.index_document(&doc, &chunks, None).unwrap();
        }

        let stats = store.stats().unwrap();
        assert_eq!(stats.document_count, 100);
        assert_eq!(stats.chunk_count, 500);

        // Every chunk matches the query; k=10 caps the result count.
        let results = store.search("machine learning", 10).unwrap();
        assert_eq!(results.len(), 10);
    }

    #[test]
    fn test_search_deterministic() {
        let store = SqliteStore::open_in_memory().unwrap();
        let doc = make_doc("determinism test");
        let chunks = vec![
            make_chunk(doc.id, "alpha beta gamma delta"),
            make_chunk(doc.id, "epsilon zeta alpha alpha"),
        ];
        store.index_document(&doc, &chunks, None).unwrap();

        // Run the same query 10 times, results should be identical
        let baseline = store.search("alpha", 10).unwrap();
        for _ in 0..10 {
            let results = store.search("alpha", 10).unwrap();
            assert_eq!(results.len(), baseline.len());
            for (a, b) in baseline.iter().zip(results.iter()) {
                assert_eq!(a.chunk_id, b.chunk_id);
                assert!((a.score - b.score).abs() < f64::EPSILON);
            }
        }
    }

    // --- Incremental indexing tests ---

    #[test]
    fn test_list_fingerprints_empty() {
        let idx = SqliteIndex::open_in_memory().unwrap();
        let fps = idx.list_fingerprints().unwrap();
        assert!(fps.is_empty());
    }

    #[test]
    fn test_list_fingerprints_populated() {
        let idx = SqliteIndex::open_in_memory().unwrap();
        let hash1 = [1u8; 32];
        let hash2 = [2u8; 32];

        idx.insert_document(
            "doc1",
            None,
            Some("/a.md"),
            "",
            &[("c1".into(), "content a".into())],
            Some(("/a.md", &hash1)),
        )
        .unwrap();
        idx.insert_document(
            "doc2",
            None,
            Some("/b.md"),
            "",
            &[("c2".into(), "content b".into())],
            Some(("/b.md", &hash2)),
        )
        .unwrap();

        let fps = idx.list_fingerprints().unwrap();
        assert_eq!(fps.len(), 2);
        // Row order is not asserted, only membership.
        let paths: Vec<&str> = fps.iter().map(|(p, _)| p.as_str()).collect();
        assert!(paths.contains(&"/a.md"));
        assert!(paths.contains(&"/b.md"));
    }

    #[test]
    fn test_remove_by_source() {
        let idx = SqliteIndex::open_in_memory().unwrap();
        let hash = [1u8; 32];

        idx.insert_document(
            "doc1",
            None,
            Some("/a.md"),
            "full content",
            &[("c1".into(), "chunk 1".into()), ("c2".into(), "chunk 2".into())],
            Some(("/a.md", &hash)),
        )
        .unwrap();
        idx.insert_document(
            "doc2",
            None,
            Some("/b.md"),
            "other content",
            &[("c3".into(), "chunk 3".into())],
            Some(("/b.md", &hash)),
        )
        .unwrap();

        assert_eq!(idx.document_count().unwrap(), 2);
        assert_eq!(idx.chunk_count().unwrap(), 3);

        let removed = idx.remove_by_source("/a.md").unwrap();
        assert_eq!(removed, 1);
        assert_eq!(idx.document_count().unwrap(), 1);
        assert_eq!(idx.chunk_count().unwrap(), 1);

        // Fingerprint should also be removed
        assert!(idx.needs_reindex("/a.md", &hash).unwrap());
        // Other doc unaffected
        assert!(!idx.needs_reindex("/b.md", &hash).unwrap());
    }

    #[test]
    fn test_remove_by_source_nonexistent() {
        let idx = SqliteIndex::open_in_memory().unwrap();
        let removed = idx.remove_by_source("/nonexistent.md").unwrap();
        assert_eq!(removed, 0);
    }
}