Skip to main content

trueno_rag/sqlite/
schema.rs

1//! SQLite schema definition and migration logic.
2//!
3//! Implements the normalized schema from the sqlite-rag-integration spec:
4//! documents, chunks, chunks_fts (FTS5 with Porter stemmer), fingerprints,
5//! and metadata tables.
6
7use crate::Result;
8use rusqlite::Connection;
9
10/// Current schema version for migration tracking.
11///
12/// v1.0.0: Initial schema with content FTS5 (stores duplicate text)
13/// v2.0.0: External content FTS5 (content=chunks) — eliminates ~135MB shadow table
14pub const SCHEMA_VERSION: &str = "2.0.0";
15
16/// Initialize the database schema and set connection pragmas.
17///
18/// Pragmas follow the sqlite-rag-integration spec Section 4.1.2:
19/// - WAL journal mode for concurrent readers (Hipp, 2014)
20/// - 64 MB page cache
21/// - 256 MB mmap for reduced syscall overhead
22///
23/// Handles migration from v1 (content FTS5) to v2 (external content FTS5).
24pub fn initialize(conn: &Connection) -> Result<()> {
25    set_pragmas(conn)?;
26    let migrated = migrate_if_needed(conn)?;
27    create_tables(conn)?;
28    if migrated {
29        rebuild_fts(conn)?;
30    }
31    set_schema_version(conn)?;
32    Ok(())
33}
34
35/// Migrate from v1 (content FTS5) to v2 (external content FTS5).
36///
37/// Detects v1 schema by checking if `chunks_fts_content` shadow table exists
38/// (content FTS5 creates it; external content FTS5 does not). Drops old FTS
39/// table + triggers, so `create_tables` can recreate with external content.
40/// After table creation, triggers handle sync; existing data is rebuilt via
41/// `INSERT INTO chunks_fts(chunks_fts) VALUES('rebuild')`.
42fn migrate_if_needed(conn: &Connection) -> Result<bool> {
43    // Check if chunks_fts_content shadow table exists (sign of v1 content FTS5)
44    let has_shadow: bool = conn
45        .query_row(
46            "SELECT COUNT(*) > 0 FROM sqlite_master WHERE type='table' AND name='chunks_fts_content'",
47            [],
48            |row| row.get(0),
49        )
50        .unwrap_or(false);
51
52    if !has_shadow {
53        return Ok(false);
54    }
55
56    // v1 → v2 migration: drop content FTS5, triggers will be recreated
57    conn.execute_batch(
58        "DROP TRIGGER IF EXISTS chunks_ai;
59         DROP TRIGGER IF EXISTS chunks_ad;
60         DROP TRIGGER IF EXISTS chunks_au;
61         DROP TABLE IF EXISTS chunks_fts;",
62    )
63    .map_err(|e| crate::Error::Query(format!("v1→v2 migration: drop old FTS failed: {e}")))?;
64
65    Ok(true)
66}
67
68fn set_pragmas(conn: &Connection) -> Result<()> {
69    conn.execute_batch(
70        "PRAGMA journal_mode = WAL;
71         PRAGMA synchronous = NORMAL;
72         PRAGMA cache_size = -65536;
73         PRAGMA mmap_size = 268435456;
74         PRAGMA foreign_keys = ON;
75         PRAGMA busy_timeout = 5000;",
76    )
77    .map_err(|e| crate::Error::Query(format!("Failed to set pragmas: {e}")))?;
78    Ok(())
79}
80
81fn create_tables(conn: &Connection) -> Result<()> {
82    conn.execute_batch(
83        "CREATE TABLE IF NOT EXISTS documents (
84            id          TEXT PRIMARY KEY,
85            title       TEXT,
86            source      TEXT,
87            content     TEXT NOT NULL,
88            metadata    TEXT,
89            chunk_count INTEGER DEFAULT 0,
90            created_at  INTEGER NOT NULL DEFAULT (unixepoch()),
91            updated_at  INTEGER NOT NULL DEFAULT (unixepoch())
92        );
93
94        CREATE TABLE IF NOT EXISTS chunks (
95            id          TEXT PRIMARY KEY,
96            doc_id      TEXT NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
97            content     TEXT NOT NULL,
98            position    INTEGER NOT NULL,
99            metadata    TEXT,
100            UNIQUE(doc_id, position)
101        );
102
103        CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
104            content,
105            content=chunks,
106            content_rowid=rowid,
107            tokenize='porter unicode61'
108        );
109
110        CREATE TRIGGER IF NOT EXISTS chunks_ai AFTER INSERT ON chunks BEGIN
111            INSERT INTO chunks_fts(rowid, content) VALUES (new.rowid, new.content);
112        END;
113
114        CREATE TRIGGER IF NOT EXISTS chunks_ad AFTER DELETE ON chunks BEGIN
115            INSERT INTO chunks_fts(chunks_fts, rowid, content) VALUES('delete', old.rowid, old.content);
116        END;
117
118        CREATE TRIGGER IF NOT EXISTS chunks_au AFTER UPDATE ON chunks BEGIN
119            INSERT INTO chunks_fts(chunks_fts, rowid, content) VALUES('delete', old.rowid, old.content);
120            INSERT INTO chunks_fts(rowid, content) VALUES (new.rowid, new.content);
121        END;
122
123        CREATE TABLE IF NOT EXISTS fingerprints (
124            doc_path    TEXT PRIMARY KEY,
125            blake3_hash BLOB NOT NULL,
126            chunk_count INTEGER NOT NULL,
127            indexed_at  INTEGER NOT NULL DEFAULT (unixepoch())
128        );
129
130        CREATE TABLE IF NOT EXISTS metadata (
131            key   TEXT PRIMARY KEY,
132            value TEXT NOT NULL
133        );",
134    )
135    .map_err(|e| crate::Error::Query(format!("Failed to create tables: {e}")))?;
136    Ok(())
137}
138
139/// Rebuild FTS5 index from the external content table (chunks).
140///
141/// Called after v1→v2 migration to populate the new external-content FTS5
142/// index from existing chunk data.
143fn rebuild_fts(conn: &Connection) -> Result<()> {
144    conn.execute("INSERT INTO chunks_fts(chunks_fts) VALUES('rebuild')", [])
145        .map_err(|e| crate::Error::Query(format!("FTS5 rebuild failed: {e}")))?;
146    Ok(())
147}
148
149fn set_schema_version(conn: &Connection) -> Result<()> {
150    conn.execute(
151        "INSERT OR REPLACE INTO metadata (key, value) VALUES ('schema_version', ?1)",
152        [SCHEMA_VERSION],
153    )
154    .map_err(|e| crate::Error::Query(format!("Failed to set schema version: {e}")))?;
155    Ok(())
156}
157
158/// Check if the schema version matches the current version.
159pub fn check_version(conn: &Connection) -> Result<bool> {
160    let version: Option<String> = conn
161        .query_row("SELECT value FROM metadata WHERE key = 'schema_version'", [], |row| row.get(0))
162        .ok();
163    Ok(version.as_deref() == Some(SCHEMA_VERSION))
164}
165
166#[cfg(test)]
167mod tests {
168    use super::*;
169
170    #[test]
171    fn test_initialize_creates_tables() {
172        let conn = Connection::open_in_memory().unwrap();
173        initialize(&conn).unwrap();
174
175        // Verify tables exist by querying them
176        let doc_count: i64 =
177            conn.query_row("SELECT COUNT(*) FROM documents", [], |r| r.get(0)).unwrap();
178        assert_eq!(doc_count, 0);
179
180        let chunk_count: i64 =
181            conn.query_row("SELECT COUNT(*) FROM chunks", [], |r| r.get(0)).unwrap();
182        assert_eq!(chunk_count, 0);
183
184        let fp_count: i64 =
185            conn.query_row("SELECT COUNT(*) FROM fingerprints", [], |r| r.get(0)).unwrap();
186        assert_eq!(fp_count, 0);
187    }
188
189    #[test]
190    fn test_schema_version_set() {
191        let conn = Connection::open_in_memory().unwrap();
192        initialize(&conn).unwrap();
193        assert!(check_version(&conn).unwrap());
194    }
195
196    #[test]
197    fn test_initialize_idempotent() {
198        let conn = Connection::open_in_memory().unwrap();
199        initialize(&conn).unwrap();
200        initialize(&conn).unwrap(); // Should not fail on second call
201        assert!(check_version(&conn).unwrap());
202    }
203
204    #[test]
205    fn test_wal_mode_enabled() {
206        let conn = Connection::open_in_memory().unwrap();
207        initialize(&conn).unwrap();
208        // In-memory databases use "memory" journal mode, not WAL
209        // but the pragma should not error
210        let mode: String = conn.query_row("PRAGMA journal_mode", [], |r| r.get(0)).unwrap();
211        assert!(!mode.is_empty());
212    }
213
214    #[test]
215    fn test_external_content_fts5_no_shadow_content_table() {
216        let conn = Connection::open_in_memory().unwrap();
217        initialize(&conn).unwrap();
218
219        // External content FTS5 should NOT create chunks_fts_content shadow table
220        let has_content_table: bool = conn
221            .query_row(
222                "SELECT COUNT(*) > 0 FROM sqlite_master WHERE type='table' AND name='chunks_fts_content'",
223                [],
224                |row| row.get(0),
225            )
226            .unwrap();
227        assert!(!has_content_table, "External content FTS5 should not create chunks_fts_content");
228    }
229
230    #[test]
231    fn test_v1_to_v2_migration() {
232        let conn = Connection::open_in_memory().unwrap();
233
234        // Simulate v1 schema: create content FTS5 (with shadow table)
235        set_pragmas(&conn).unwrap();
236        conn.execute_batch(
237            "CREATE TABLE IF NOT EXISTS documents (
238                id TEXT PRIMARY KEY, title TEXT, source TEXT, content TEXT NOT NULL,
239                metadata TEXT, chunk_count INTEGER DEFAULT 0,
240                created_at INTEGER NOT NULL DEFAULT (unixepoch()),
241                updated_at INTEGER NOT NULL DEFAULT (unixepoch())
242            );
243            CREATE TABLE IF NOT EXISTS chunks (
244                id TEXT PRIMARY KEY,
245                doc_id TEXT NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
246                content TEXT NOT NULL, position INTEGER NOT NULL, metadata TEXT,
247                UNIQUE(doc_id, position)
248            );
249            CREATE VIRTUAL TABLE chunks_fts USING fts5(content, tokenize='porter unicode61');
250            CREATE TRIGGER chunks_ai AFTER INSERT ON chunks BEGIN
251                INSERT INTO chunks_fts(rowid, content) VALUES (new.rowid, new.content);
252            END;
253            CREATE TRIGGER chunks_ad AFTER DELETE ON chunks BEGIN
254                DELETE FROM chunks_fts WHERE rowid = old.rowid;
255            END;
256            CREATE TRIGGER chunks_au AFTER UPDATE ON chunks BEGIN
257                DELETE FROM chunks_fts WHERE rowid = old.rowid;
258                INSERT INTO chunks_fts(rowid, content) VALUES (new.rowid, new.content);
259            END;
260            CREATE TABLE IF NOT EXISTS fingerprints (
261                doc_path TEXT PRIMARY KEY, blake3_hash BLOB NOT NULL,
262                chunk_count INTEGER NOT NULL, indexed_at INTEGER NOT NULL DEFAULT (unixepoch())
263            );
264            CREATE TABLE IF NOT EXISTS metadata (key TEXT PRIMARY KEY, value TEXT NOT NULL);
265            INSERT OR REPLACE INTO metadata (key, value) VALUES ('schema_version', '1.0.0');",
266        )
267        .unwrap();
268
269        // Insert test data under v1 schema
270        conn.execute("INSERT INTO documents (id, content) VALUES ('doc1', 'test document')", [])
271            .unwrap();
272        conn.execute(
273            "INSERT INTO chunks (id, doc_id, content, position) VALUES ('c1', 'doc1', 'SIMD vector operations', 0)",
274            [],
275        )
276        .unwrap();
277
278        // Verify v1 has chunks_fts_content shadow table
279        let has_shadow: bool = conn
280            .query_row(
281                "SELECT COUNT(*) > 0 FROM sqlite_master WHERE type='table' AND name='chunks_fts_content'",
282                [],
283                |row| row.get(0),
284            )
285            .unwrap();
286        assert!(has_shadow, "v1 should have chunks_fts_content shadow table");
287
288        // Run initialize — should migrate v1→v2
289        initialize(&conn).unwrap();
290
291        // Verify shadow table is gone
292        let has_shadow_after: bool = conn
293            .query_row(
294                "SELECT COUNT(*) > 0 FROM sqlite_master WHERE type='table' AND name='chunks_fts_content'",
295                [],
296                |row| row.get(0),
297            )
298            .unwrap();
299        assert!(!has_shadow_after, "v2 should not have chunks_fts_content");
300
301        // Verify version is 2.0.0
302        assert!(check_version(&conn).unwrap());
303
304        // Verify FTS search still works (data was rebuilt)
305        let count: i64 = conn
306            .query_row("SELECT COUNT(*) FROM chunks_fts WHERE chunks_fts MATCH 'SIMD'", [], |r| {
307                r.get(0)
308            })
309            .unwrap();
310        assert_eq!(count, 1, "FTS search should find migrated data");
311    }
312
313    #[test]
314    fn test_v2_fresh_install_no_migration() {
315        let conn = Connection::open_in_memory().unwrap();
316        let migrated = migrate_if_needed(&conn).unwrap();
317        assert!(!migrated, "Fresh install should not trigger migration");
318    }
319}