// trueno_rag/sqlite/mod.rs
1//! SQLite+FTS5 persistent storage backend for RAG indices.
2//!
3//! Provides `SqliteIndex` (implements `SparseIndex`) and `SqliteStore`
4//! (convenience wrapper for document + chunk persistence).
5//!
6//! This module replaces in-memory HashMap-based indices with SQLite-backed
7//! storage using FTS5 for BM25 ranking (Robertson & Zaragoza, 2009).
8//!
9//! # Performance Contract
10//!
11//! Median search latency: 10–50 ms on a 5000+ document corpus with warm
12//! page cache (see sqlite-rag-integration spec, Section 3.1).
13
14pub mod fts;
15pub mod schema;
16
17use crate::index::SparseIndex;
18use crate::{Chunk, ChunkId, Document, Result};
19use rusqlite::Connection;
20use std::path::Path;
21use std::sync::Mutex;
22
/// SQLite-backed sparse index using FTS5 for BM25 search.
///
/// Unlike `BM25Index` (in-memory HashMap), this persists to disk and
/// delegates BM25 scoring to SQLite's FTS5 extension.
///
/// The `Connection` is wrapped in a `Mutex` to satisfy the `Send + Sync`
/// bounds required by `SparseIndex`. `Mutex<T>` is `Sync` when `T: Send`,
/// and `rusqlite::Connection` is `Send`. SQLite in WAL mode supports
/// concurrent readers via separate connections; this single-connection
/// design serializes access within one process.
pub struct SqliteIndex {
    /// The sole connection; every method acquires this lock, so all
    /// database access within one process is serialized.
    conn: Mutex<Connection>,
}
36
37// Mutex<Connection> is automatically Send+Sync because Connection: Send.
38// No unsafe impl needed.
39
40impl std::fmt::Debug for SqliteIndex {
41    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
42        f.debug_struct("SqliteIndex").finish_non_exhaustive()
43    }
44}
45
46/// Helper to map mutex poison errors.
47fn lock_err<T>(e: &std::sync::PoisonError<T>) -> crate::Error {
48    crate::Error::Query(format!("Mutex poisoned: {e}"))
49}
50
51impl SqliteIndex {
52    /// Open or create an index at the given path.
53    pub fn open(path: impl AsRef<Path>) -> Result<Self> {
54        let conn = Connection::open(path.as_ref())
55            .map_err(|e| crate::Error::Query(format!("Failed to open SQLite database: {e}")))?;
56        schema::initialize(&conn)?;
57        Ok(Self { conn: Mutex::new(conn) })
58    }
59
60    /// Open an in-memory index (for testing).
61    pub fn open_in_memory() -> Result<Self> {
62        let conn = Connection::open_in_memory()
63            .map_err(|e| crate::Error::Query(format!("Failed to open in-memory database: {e}")))?;
64        schema::initialize(&conn)?;
65        Ok(Self { conn: Mutex::new(conn) })
66    }
67
68    /// Get document count.
69    pub fn document_count(&self) -> Result<usize> {
70        let conn = self.conn.lock().map_err(|e| lock_err(&e))?;
71        let count: i64 = conn
72            .query_row("SELECT COUNT(*) FROM documents", [], |r| r.get(0))
73            .map_err(|e| crate::Error::Query(format!("Failed to count documents: {e}")))?;
74        Ok(count as usize)
75    }
76
77    /// Get chunk count.
78    pub fn chunk_count(&self) -> Result<usize> {
79        // Contract: configuration-v1.yaml precondition (pv codegen)
80        contract_pre_configuration!();
81        let conn = self.conn.lock().map_err(|e| lock_err(&e))?;
82        let count: i64 = conn
83            .query_row("SELECT COUNT(*) FROM chunks", [], |r| r.get(0))
84            .map_err(|e| crate::Error::Query(format!("Failed to count chunks: {e}")))?;
85        Ok(count as usize)
86    }
87
88    /// Check if a document needs reindexing by fingerprint.
89    pub fn needs_reindex(&self, path: &str, hash: &[u8; 32]) -> Result<bool> {
90        let conn = self.conn.lock().map_err(|e| lock_err(&e))?;
91        let stored: Option<Vec<u8>> = conn
92            .query_row("SELECT blake3_hash FROM fingerprints WHERE doc_path = ?1", [path], |row| {
93                row.get(0)
94            })
95            .ok();
96
97        match stored {
98            Some(stored_hash) => Ok(stored_hash.as_slice() != hash),
99            None => Ok(true),
100        }
101    }
102
103    /// Batch-insert a document and its chunks within a transaction.
104    pub fn insert_document(
105        &self,
106        doc_id: &str,
107        title: Option<&str>,
108        source: Option<&str>,
109        content: &str,
110        chunks: &[(String, String)],
111        fingerprint: Option<(&str, &[u8; 32])>,
112    ) -> Result<()> {
113        // Contract: configuration-v1.yaml precondition (pv codegen)
114        contract_pre_configuration!(doc_id.as_bytes());
115
116        let mut conn = self.conn.lock().map_err(|e| lock_err(&e))?;
117        let tx = conn
118            .transaction()
119            .map_err(|e| crate::Error::Query(format!("Failed to begin transaction: {e}")))?;
120
121        // Delete old document's chunks first (fires FTS5 sync triggers),
122        // then delete the document itself.
123        tx.execute("DELETE FROM chunks WHERE doc_id = ?1", [doc_id])
124            .map_err(|e| crate::Error::Query(format!("Failed to delete old chunks: {e}")))?;
125        tx.execute("DELETE FROM documents WHERE id = ?1", [doc_id])
126            .map_err(|e| crate::Error::Query(format!("Failed to delete old document: {e}")))?;
127
128        tx.execute(
129            "INSERT INTO documents (id, title, source, content, chunk_count) VALUES (?1, ?2, ?3, ?4, ?5)",
130            rusqlite::params![doc_id, title, source, content, chunks.len() as i64],
131        )
132        .map_err(|e| crate::Error::Query(format!("Failed to insert document: {e}")))?;
133
134        {
135            let mut stmt = tx
136                .prepare_cached(
137                    "INSERT OR REPLACE INTO chunks (id, doc_id, content, position) VALUES (?1, ?2, ?3, ?4)",
138                )
139                .map_err(|e| crate::Error::Query(format!("Failed to prepare chunk insert: {e}")))?;
140
141            for (i, (chunk_id, chunk_content)) in chunks.iter().enumerate() {
142                stmt.execute(rusqlite::params![chunk_id, doc_id, chunk_content, i as i64])
143                    .map_err(|e| crate::Error::Query(format!("Failed to insert chunk: {e}")))?;
144            }
145        }
146
147        if let Some((path, hash)) = fingerprint {
148            tx.execute(
149                "INSERT OR REPLACE INTO fingerprints (doc_path, blake3_hash, chunk_count) VALUES (?1, ?2, ?3)",
150                rusqlite::params![path, hash.as_slice(), chunks.len() as i64],
151            )
152            .map_err(|e| crate::Error::Query(format!("Failed to update fingerprint: {e}")))?;
153        }
154
155        tx.commit()
156            .map_err(|e| crate::Error::Query(format!("Failed to commit transaction: {e}")))?;
157
158        Ok(())
159    }
160
161    /// Remove a document and its chunks.
162    ///
163    /// Explicitly deletes chunks first (which fires FTS5 sync triggers),
164    /// then deletes the document row.
165    pub fn remove_document(&self, doc_id: &str) -> Result<()> {
166        let conn = self.conn.lock().map_err(|e| lock_err(&e))?;
167        conn.execute("DELETE FROM chunks WHERE doc_id = ?1", [doc_id])
168            .map_err(|e| crate::Error::Query(format!("Failed to delete chunks: {e}")))?;
169        conn.execute("DELETE FROM documents WHERE id = ?1", [doc_id])
170            .map_err(|e| crate::Error::Query(format!("Failed to remove document: {e}")))?;
171        Ok(())
172    }
173
174    /// List all tracked fingerprints (path → blake3 hash).
175    ///
176    /// Used by incremental indexing to detect deleted or changed files.
177    pub fn list_fingerprints(&self) -> Result<Vec<(String, Vec<u8>)>> {
178        let conn = self.conn.lock().map_err(|e| lock_err(&e))?;
179        let mut stmt = conn
180            .prepare("SELECT doc_path, blake3_hash FROM fingerprints")
181            .map_err(|e| crate::Error::Query(format!("Failed to list fingerprints: {e}")))?;
182        let rows = stmt
183            .query_map([], |row| {
184                let path: String = row.get(0)?;
185                let hash: Vec<u8> = row.get(1)?;
186                Ok((path, hash))
187            })
188            .map_err(|e| crate::Error::Query(format!("Failed to query fingerprints: {e}")))?;
189        let mut results = Vec::new();
190        for row in rows {
191            results.push(
192                row.map_err(|e| crate::Error::Query(format!("Failed to read fingerprint: {e}")))?,
193            );
194        }
195        Ok(results)
196    }
197
198    /// Remove all documents (and their chunks) with a given source path.
199    ///
200    /// Used by incremental indexing to remove stale documents before re-inserting.
201    pub fn remove_by_source(&self, source: &str) -> Result<usize> {
202        let conn = self.conn.lock().map_err(|e| lock_err(&e))?;
203        // Find doc IDs with this source
204        let mut stmt = conn
205            .prepare("SELECT id FROM documents WHERE source = ?1")
206            .map_err(|e| crate::Error::Query(format!("Failed to find docs by source: {e}")))?;
207        let ids: Vec<String> = stmt
208            .query_map([source], |row| row.get(0))
209            .map_err(|e| crate::Error::Query(format!("Failed to query docs: {e}")))?
210            .filter_map(|r| r.ok())
211            .collect();
212
213        for doc_id in &ids {
214            conn.execute("DELETE FROM chunks WHERE doc_id = ?1", [doc_id])
215                .map_err(|e| crate::Error::Query(format!("Failed to delete chunks: {e}")))?;
216            conn.execute("DELETE FROM documents WHERE id = ?1", [doc_id])
217                .map_err(|e| crate::Error::Query(format!("Failed to delete document: {e}")))?;
218        }
219
220        // Remove fingerprint
221        conn.execute("DELETE FROM fingerprints WHERE doc_path = ?1", [source])
222            .map_err(|e| crate::Error::Query(format!("Failed to delete fingerprint: {e}")))?;
223
224        Ok(ids.len())
225    }
226
227    /// FTS5 BM25 search. Returns results ordered by descending relevance.
228    pub fn search_fts(&self, query: &str, k: usize) -> Result<Vec<fts::FtsResult>> {
229        let conn = self.conn.lock().map_err(|e| lock_err(&e))?;
230        fts::search(&conn, query, k)
231    }
232
233    /// Get chunk content by ID.
234    pub fn get_chunk(&self, chunk_id: &str) -> Result<Option<String>> {
235        // Contract: configuration-v1.yaml precondition (pv codegen)
236        contract_pre_configuration!(chunk_id.as_bytes());
237        let conn = self.conn.lock().map_err(|e| lock_err(&e))?;
238        let content: Option<String> = conn
239            .query_row("SELECT content FROM chunks WHERE id = ?1", [chunk_id], |row| row.get(0))
240            .ok();
241        Ok(content)
242    }
243
244    /// Get a metadata value by key.
245    pub fn get_metadata(&self, key: &str) -> Result<Option<String>> {
246        let conn = self.conn.lock().map_err(|e| lock_err(&e))?;
247        let value: Option<String> = conn
248            .query_row("SELECT value FROM metadata WHERE key = ?1", [key], |row| row.get(0))
249            .ok();
250        Ok(value)
251    }
252
253    /// Set a metadata key-value pair.
254    pub fn set_metadata(&self, key: &str, value: &str) -> Result<()> {
255        let conn = self.conn.lock().map_err(|e| lock_err(&e))?;
256        conn.execute("INSERT OR REPLACE INTO metadata (key, value) VALUES (?1, ?2)", [key, value])
257            .map_err(|e| crate::Error::Query(format!("Failed to set metadata: {e}")))?;
258        Ok(())
259    }
260
261    /// Vacuum and optimize the database.
262    pub fn optimize(&self) -> Result<()> {
263        let conn = self.conn.lock().map_err(|e| lock_err(&e))?;
264        fts::optimize(&conn)?;
265        conn.execute_batch("VACUUM;")
266            .map_err(|e| crate::Error::Query(format!("VACUUM failed: {e}")))?;
267        Ok(())
268    }
269}
270
271impl SparseIndex for SqliteIndex {
272    fn add(&mut self, chunk: &Chunk) {
273        let doc_id = chunk.document_id.to_string();
274        let chunk_id = chunk.id.to_string();
275        if let Ok(conn) = self.conn.lock() {
276            let _ = conn.execute(
277                "INSERT OR IGNORE INTO documents (id, content) VALUES (?1, '')",
278                [&doc_id],
279            );
280            let _ = conn.execute(
281                "INSERT OR REPLACE INTO chunks (id, doc_id, content, position) VALUES (?1, ?2, ?3, 0)",
282                rusqlite::params![chunk_id, doc_id, chunk.content],
283            );
284        }
285    }
286
287    fn add_batch(&mut self, chunks: &[Chunk]) {
288        let Ok(mut conn) = self.conn.lock() else {
289            return;
290        };
291        let Ok(tx) = conn.transaction() else {
292            return;
293        };
294
295        // Track position per document for UNIQUE(doc_id, position)
296        let mut doc_positions: std::collections::HashMap<String, i64> =
297            std::collections::HashMap::new();
298
299        for chunk in chunks {
300            let doc_id = chunk.document_id.to_string();
301            let chunk_id = chunk.id.to_string();
302            let pos = doc_positions.entry(doc_id.clone()).or_insert(0);
303            let _ = tx.execute(
304                "INSERT OR IGNORE INTO documents (id, content) VALUES (?1, '')",
305                [&doc_id],
306            );
307            let _ = tx.execute(
308                "INSERT OR REPLACE INTO chunks (id, doc_id, content, position) VALUES (?1, ?2, ?3, ?4)",
309                rusqlite::params![chunk_id, doc_id, chunk.content, *pos],
310            );
311            *pos += 1;
312        }
313
314        let _ = tx.commit();
315    }
316
317    fn search(&self, query: &str, k: usize) -> Vec<(ChunkId, f32)> {
318        let Ok(conn) = self.conn.lock() else {
319            return Vec::new();
320        };
321        let Ok(results) = fts::search(&conn, query, k) else {
322            return Vec::new();
323        };
324
325        results
326            .into_iter()
327            .filter_map(|r| {
328                uuid::Uuid::parse_str(&r.chunk_id).ok().map(|uuid| (ChunkId(uuid), r.score as f32))
329            })
330            .collect()
331    }
332
333    fn remove(&mut self, chunk_id: ChunkId) {
334        let id_str = chunk_id.to_string();
335        if let Ok(conn) = self.conn.lock() {
336            let _ = conn.execute("DELETE FROM chunks WHERE id = ?1", [&id_str]);
337        }
338    }
339
340    fn len(&self) -> usize {
341        self.chunk_count().unwrap_or(0)
342    }
343}
344
345// --- SqliteStore: convenience wrapper ---
346
/// Statistics about the SQLite store.
///
/// Snapshot returned by [`SqliteStore::stats`]; values may be stale
/// immediately if other writers share the database file.
#[derive(Debug, Clone)]
pub struct StoreStats {
    /// Number of documents indexed.
    pub document_count: usize,
    /// Number of chunks indexed.
    pub chunk_count: usize,
    /// Number of fingerprints tracked.
    pub fingerprint_count: usize,
    /// Database file size in bytes (0 for in-memory).
    pub db_size_bytes: u64,
}
359
/// Combined document store + BM25 index backed by SQLite.
///
/// Replaces the pattern of `BM25Index` + `VectorStore` + JSON persistence
/// for users who want disk-backed RAG without managing separate components.
pub struct SqliteStore {
    /// Underlying index; all operations delegate to it.
    index: SqliteIndex,
    /// Database file path; `None` for in-memory stores.
    path: Option<std::path::PathBuf>,
}
368
369impl std::fmt::Debug for SqliteStore {
370    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
371        f.debug_struct("SqliteStore").field("path", &self.path).finish_non_exhaustive()
372    }
373}
374
375impl SqliteStore {
376    /// Open or create a store at the given path.
377    pub fn open(path: impl AsRef<Path>) -> Result<Self> {
378        let path = path.as_ref().to_path_buf();
379        let index = SqliteIndex::open(&path)?;
380        Ok(Self { index, path: Some(path) })
381    }
382
383    /// Open an in-memory store (for testing).
384    pub fn open_in_memory() -> Result<Self> {
385        let index = SqliteIndex::open_in_memory()?;
386        Ok(Self { index, path: None })
387    }
388
389    /// Index a document with its pre-chunked content.
390    pub fn index_document(
391        &self,
392        doc: &Document,
393        chunks: &[Chunk],
394        fingerprint: Option<(&str, &[u8; 32])>,
395    ) -> Result<()> {
396        let doc_id = doc.id.to_string();
397        let chunk_pairs: Vec<(String, String)> =
398            chunks.iter().map(|c| (c.id.to_string(), c.content.clone())).collect();
399
400        self.index.insert_document(
401            &doc_id,
402            doc.title.as_deref(),
403            doc.source.as_deref(),
404            &doc.content,
405            &chunk_pairs,
406            fingerprint,
407        )
408    }
409
410    /// Search with BM25 and return results.
411    ///
412    /// **Performance contract:** Median latency 10–50 ms (spec Section 3.1).
413    pub fn search(&self, query: &str, k: usize) -> Result<Vec<fts::FtsResult>> {
414        self.index.search_fts(query, k)
415    }
416
417    /// Check if a document needs reindexing by fingerprint.
418    pub fn needs_reindex(&self, path: &str, hash: &[u8; 32]) -> Result<bool> {
419        self.index.needs_reindex(path, hash)
420    }
421
422    /// List all tracked fingerprints.
423    pub fn list_fingerprints(&self) -> Result<Vec<(String, Vec<u8>)>> {
424        self.index.list_fingerprints()
425    }
426
427    /// Remove all documents with a given source path.
428    pub fn remove_by_source(&self, source: &str) -> Result<usize> {
429        self.index.remove_by_source(source)
430    }
431
432    /// Get store statistics.
433    pub fn stats(&self) -> Result<StoreStats> {
434        let db_size_bytes = self
435            .path
436            .as_ref()
437            .and_then(|p| std::fs::metadata(p).ok())
438            .map(|m| m.len())
439            .unwrap_or(0);
440
441        Ok(StoreStats {
442            document_count: self.index.document_count()?,
443            chunk_count: self.index.chunk_count()?,
444            fingerprint_count: self.fingerprint_count()?,
445            db_size_bytes,
446        })
447    }
448
449    /// Get the number of tracked fingerprints.
450    fn fingerprint_count(&self) -> Result<usize> {
451        let conn = self.index.conn.lock().map_err(|e| lock_err(&e))?;
452        let count: i64 = conn
453            .query_row("SELECT COUNT(*) FROM fingerprints", [], |r| r.get(0))
454            .map_err(|e| crate::Error::Query(format!("Failed to count fingerprints: {e}")))?;
455        Ok(count as usize)
456    }
457
458    /// Get/set metadata.
459    pub fn get_metadata(&self, key: &str) -> Result<Option<String>> {
460        self.index.get_metadata(key)
461    }
462
463    /// Set a metadata key-value pair.
464    pub fn set_metadata(&self, key: &str, value: &str) -> Result<()> {
465        self.index.set_metadata(key, value)
466    }
467
468    /// Optimize the database (VACUUM + FTS5 merge).
469    pub fn optimize(&self) -> Result<()> {
470        self.index.optimize()
471    }
472
473    /// Get a reference to the underlying SqliteIndex.
474    pub fn as_index(&self) -> &SqliteIndex {
475        &self.index
476    }
477}
478
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{Document, DocumentId};

    /// Build a minimal `Document` from raw content (default metadata).
    fn make_doc(content: &str) -> Document {
        Document::new(content)
    }

    /// Build a chunk with a fresh random `ChunkId` attached to `doc_id`.
    fn make_chunk(doc_id: DocumentId, content: &str) -> Chunk {
        Chunk {
            id: ChunkId::new(),
            document_id: doc_id,
            content: content.to_string(),
            start_offset: 0,
            end_offset: content.len(),
            metadata: crate::ChunkMetadata::default(),
            embedding: None,
        }
    }

    // --- SqliteIndex tests ---

    #[test]
    fn test_index_roundtrip() {
        let idx = SqliteIndex::open_in_memory().unwrap();
        idx.insert_document(
            "doc1",
            Some("Test Doc"),
            Some("/test.md"),
            "full content here",
            &[
                ("c1".into(), "SIMD vector operations".into()),
                ("c2".into(), "GPU kernel dispatch".into()),
            ],
            None,
        )
        .unwrap();

        assert_eq!(idx.document_count().unwrap(), 1);
        assert_eq!(idx.chunk_count().unwrap(), 2);

        let content = idx.get_chunk("c1").unwrap();
        assert_eq!(content.unwrap(), "SIMD vector operations");
    }

    #[test]
    fn test_index_search() {
        let idx = SqliteIndex::open_in_memory().unwrap();
        idx.insert_document(
            "doc1",
            None,
            None,
            "",
            &[
                ("c1".into(), "machine learning algorithms for classification".into()),
                ("c2".into(), "database indexing and query optimization".into()),
            ],
            None,
        )
        .unwrap();

        // Only the chunk containing both query terms should match.
        let results = idx.search_fts("machine learning", 10).unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].chunk_id, "c1");
    }

    #[test]
    fn test_index_fingerprint_reindex() {
        let idx = SqliteIndex::open_in_memory().unwrap();
        let hash1 = [1u8; 32];
        let hash2 = [2u8; 32];

        // First insert with fingerprint
        idx.insert_document(
            "doc1",
            None,
            None,
            "",
            &[("c1".into(), "content".into())],
            Some(("/test.md", &hash1)),
        )
        .unwrap();

        // Same hash should not need reindex
        assert!(!idx.needs_reindex("/test.md", &hash1).unwrap());

        // Different hash should need reindex
        assert!(idx.needs_reindex("/test.md", &hash2).unwrap());

        // Unknown path should need reindex
        assert!(idx.needs_reindex("/unknown.md", &hash1).unwrap());
    }

    #[test]
    fn test_index_remove_document() {
        let idx = SqliteIndex::open_in_memory().unwrap();
        idx.insert_document("doc1", None, None, "", &[("c1".into(), "some content".into())], None)
            .unwrap();

        assert_eq!(idx.document_count().unwrap(), 1);
        idx.remove_document("doc1").unwrap();
        assert_eq!(idx.document_count().unwrap(), 0);
        assert_eq!(idx.chunk_count().unwrap(), 0);
    }

    #[test]
    fn test_index_metadata() {
        let idx = SqliteIndex::open_in_memory().unwrap();
        idx.set_metadata("version", "1.0.0").unwrap();
        assert_eq!(idx.get_metadata("version").unwrap(), Some("1.0.0".to_string()));
        assert_eq!(idx.get_metadata("nonexistent").unwrap(), None);
    }

    #[test]
    fn test_index_update_document() {
        let idx = SqliteIndex::open_in_memory().unwrap();
        idx.insert_document("doc1", None, None, "", &[("c1".into(), "old content".into())], None)
            .unwrap();
        idx.insert_document("doc1", None, None, "", &[("c2".into(), "new content".into())], None)
            .unwrap();

        // Old chunk should be gone, new chunk present
        assert_eq!(idx.chunk_count().unwrap(), 1);
        assert!(idx.get_chunk("c1").unwrap().is_none());
        assert_eq!(idx.get_chunk("c2").unwrap().unwrap(), "new content");
    }

    // --- SparseIndex trait tests ---

    #[test]
    fn test_sparse_index_add_and_len() {
        let mut idx = SqliteIndex::open_in_memory().unwrap();
        let doc_id = DocumentId::new();
        let chunk = make_chunk(doc_id, "sparse index test content");
        idx.add(&chunk);
        assert_eq!(idx.len(), 1);
    }

    #[test]
    fn test_sparse_index_add_batch() {
        let mut idx = SqliteIndex::open_in_memory().unwrap();
        let doc_id = DocumentId::new();
        let chunks = vec![
            make_chunk(doc_id, "first chunk content"),
            make_chunk(doc_id, "second chunk content"),
        ];
        idx.add_batch(&chunks);
        assert_eq!(idx.len(), 2);
    }

    #[test]
    fn test_sparse_index_remove() {
        let mut idx = SqliteIndex::open_in_memory().unwrap();
        let doc_id = DocumentId::new();
        let chunk = make_chunk(doc_id, "content to remove");
        let chunk_id = chunk.id;
        idx.add(&chunk);
        assert_eq!(idx.len(), 1);
        idx.remove(chunk_id);
        assert_eq!(idx.len(), 0);
    }

    // --- SqliteStore tests ---

    #[test]
    fn test_store_index_and_search() {
        let store = SqliteStore::open_in_memory().unwrap();
        let doc = make_doc("SIMD vector operations for tensor computation");
        let chunks = vec![make_chunk(doc.id, "SIMD vector operations for tensor computation")];
        store.index_document(&doc, &chunks, None).unwrap();

        let results = store.search("SIMD tensor", 10).unwrap();
        assert!(!results.is_empty());
    }

    #[test]
    fn test_store_stats() {
        let store = SqliteStore::open_in_memory().unwrap();
        let doc = make_doc("content");
        let chunks = vec![make_chunk(doc.id, "chunk one"), make_chunk(doc.id, "chunk two")];
        store.index_document(&doc, &chunks, Some(("/test.md", &[0u8; 32]))).unwrap();

        let stats = store.stats().unwrap();
        assert_eq!(stats.document_count, 1);
        assert_eq!(stats.chunk_count, 2);
        assert_eq!(stats.fingerprint_count, 1);
    }

    #[test]
    fn test_store_needs_reindex() {
        let store = SqliteStore::open_in_memory().unwrap();
        let doc = make_doc("content");
        let chunks = vec![make_chunk(doc.id, "chunk")];
        let hash = [42u8; 32];
        store.index_document(&doc, &chunks, Some(("/doc.md", &hash))).unwrap();

        assert!(!store.needs_reindex("/doc.md", &hash).unwrap());
        assert!(store.needs_reindex("/doc.md", &[0u8; 32]).unwrap());
        assert!(store.needs_reindex("/other.md", &hash).unwrap());
    }

    #[test]
    fn test_store_metadata() {
        let store = SqliteStore::open_in_memory().unwrap();
        store.set_metadata("batuta_version", "0.6.0").unwrap();
        assert_eq!(store.get_metadata("batuta_version").unwrap(), Some("0.6.0".to_string()));
    }

    #[test]
    fn test_store_optimize() {
        let store = SqliteStore::open_in_memory().unwrap();
        let doc = make_doc("content");
        let chunks = vec![make_chunk(doc.id, "some chunk content")];
        store.index_document(&doc, &chunks, None).unwrap();
        store.optimize().unwrap(); // Should not panic
    }

    #[test]
    fn test_store_large_batch() {
        let store = SqliteStore::open_in_memory().unwrap();

        // Insert 100 documents with 5 chunks each
        for i in 0..100 {
            let doc = make_doc(&format!("Document {i} about machine learning"));
            let chunks: Vec<Chunk> = (0..5)
                .map(|j| {
                    make_chunk(
                        doc.id,
                        &format!("Chunk {j} of doc {i}: machine learning algorithms topic {j}"),
                    )
                })
                .collect();
            store.index_document(&doc, &chunks, None).unwrap();
        }

        let stats = store.stats().unwrap();
        assert_eq!(stats.document_count, 100);
        assert_eq!(stats.chunk_count, 500);

        // Every chunk matches the query; k=10 caps the result count.
        let results = store.search("machine learning", 10).unwrap();
        assert_eq!(results.len(), 10);
    }

    #[test]
    fn test_search_deterministic() {
        let store = SqliteStore::open_in_memory().unwrap();
        let doc = make_doc("determinism test");
        let chunks = vec![
            make_chunk(doc.id, "alpha beta gamma delta"),
            make_chunk(doc.id, "epsilon zeta alpha alpha"),
        ];
        store.index_document(&doc, &chunks, None).unwrap();

        // Run the same query 10 times, results should be identical
        let baseline = store.search("alpha", 10).unwrap();
        for _ in 0..10 {
            let results = store.search("alpha", 10).unwrap();
            assert_eq!(results.len(), baseline.len());
            for (a, b) in baseline.iter().zip(results.iter()) {
                assert_eq!(a.chunk_id, b.chunk_id);
                assert!((a.score - b.score).abs() < f64::EPSILON);
            }
        }
    }

    // --- Incremental indexing tests ---

    #[test]
    fn test_list_fingerprints_empty() {
        let idx = SqliteIndex::open_in_memory().unwrap();
        let fps = idx.list_fingerprints().unwrap();
        assert!(fps.is_empty());
    }

    #[test]
    fn test_list_fingerprints_populated() {
        let idx = SqliteIndex::open_in_memory().unwrap();
        let hash1 = [1u8; 32];
        let hash2 = [2u8; 32];

        idx.insert_document(
            "doc1",
            None,
            Some("/a.md"),
            "",
            &[("c1".into(), "content a".into())],
            Some(("/a.md", &hash1)),
        )
        .unwrap();
        idx.insert_document(
            "doc2",
            None,
            Some("/b.md"),
            "",
            &[("c2".into(), "content b".into())],
            Some(("/b.md", &hash2)),
        )
        .unwrap();

        let fps = idx.list_fingerprints().unwrap();
        assert_eq!(fps.len(), 2);
        let paths: Vec<&str> = fps.iter().map(|(p, _)| p.as_str()).collect();
        assert!(paths.contains(&"/a.md"));
        assert!(paths.contains(&"/b.md"));
    }

    #[test]
    fn test_remove_by_source() {
        let idx = SqliteIndex::open_in_memory().unwrap();
        let hash = [1u8; 32];

        idx.insert_document(
            "doc1",
            None,
            Some("/a.md"),
            "full content",
            &[("c1".into(), "chunk 1".into()), ("c2".into(), "chunk 2".into())],
            Some(("/a.md", &hash)),
        )
        .unwrap();
        idx.insert_document(
            "doc2",
            None,
            Some("/b.md"),
            "other content",
            &[("c3".into(), "chunk 3".into())],
            Some(("/b.md", &hash)),
        )
        .unwrap();

        assert_eq!(idx.document_count().unwrap(), 2);
        assert_eq!(idx.chunk_count().unwrap(), 3);

        let removed = idx.remove_by_source("/a.md").unwrap();
        assert_eq!(removed, 1);
        assert_eq!(idx.document_count().unwrap(), 1);
        assert_eq!(idx.chunk_count().unwrap(), 1);

        // Fingerprint should also be removed
        assert!(idx.needs_reindex("/a.md", &hash).unwrap());
        // Other doc unaffected
        assert!(!idx.needs_reindex("/b.md", &hash).unwrap());
    }

    #[test]
    fn test_remove_by_source_nonexistent() {
        let idx = SqliteIndex::open_in_memory().unwrap();
        let removed = idx.remove_by_source("/nonexistent.md").unwrap();
        assert_eq!(removed, 0);
    }
}