Skip to main content

sqlite_graphrag/storage/
connection.rs

1//! SQLite connection setup with PRAGMAs and 0600 permissions.
2//!
3//! v1.0.76: opens (or creates) the database file. The `sqlite-vec` extension
4//! was REMOVED; vector similarity is now computed in pure Rust over the
5//! `memory_embeddings(memory_id, embedding BLOB, source)` table. WAL/journal
6//! PRAGMAs and 0600 file permissions on Unix are unchanged.
7
8use crate::errors::AppError;
9use crate::paths::AppPaths;
10use crate::pragmas::{apply_connection_pragmas, apply_init_pragmas, ensure_wal_mode};
11use rusqlite::Connection;
12use std::path::Path;
13
/// Legacy hook from the `sqlite-vec` era; intentionally does nothing since v1.0.76.
///
/// Callers on the auto-init path still invoke `register_vec_extension()`, so
/// the symbol is kept for source compatibility. There is no extension left to
/// register: the LLM-only build does not need any vector extension.
pub fn register_vec_extension() {
    // Deliberately empty: kept only so existing call sites keep compiling.
}
19
20pub fn open_rw(path: &Path) -> Result<Connection, AppError> {
21    let conn = Connection::open(path)?;
22    apply_connection_pragmas(&conn)?;
23    apply_secure_permissions(path);
24    adopt_embedding_dim(&conn);
25    Ok(conn)
26}
27
28/// G42/S1 follow-up (G43): adopts the dimensionality recorded in
29/// `schema_meta.dim` for this process, so EVERY command that opens the
30/// database — not only the `ensure_db_ready` auto-init path — produces
31/// and queries vectors of the database dimensionality. Pre-G43 the
32/// adoption only ran in `ensure_db_ready`, which `remember` / `edit` /
33/// `recall` / `hybrid-search` never call; those commands silently used
34/// the compiled default (64) against pre-v1.0.79 384-dim databases,
35/// writing mixed-dim embeddings that cosine-score 0.0 against each
36/// other.
37///
38/// Read-only and best-effort by design: a virgin database without
39/// `schema_meta` is a no-op (the table is created and persisted later
40/// by `ensure_schema` / `ensure_db_ready`). The env/flag override
41/// always wins and is handled inside `constants::embedding_dim`.
42fn adopt_embedding_dim(conn: &Connection) {
43    if crate::constants::embedding_dim_from_env().is_some() {
44        return;
45    }
46    if let Ok(value) = conn.query_row(
47        "SELECT value FROM schema_meta WHERE key = 'dim'",
48        [],
49        |row| row.get::<_, String>(0),
50    ) {
51        if let Ok(dim) = value.parse::<usize>() {
52            crate::constants::set_active_embedding_dim(dim);
53        }
54    }
55}
56
57pub fn ensure_schema(conn: &mut Connection) -> Result<(), AppError> {
58    crate::migrations::runner()
59        .run(conn)
60        .map_err(|e| AppError::Internal(anyhow::anyhow!("migration failed: {e}")))?;
61    conn.execute_batch(&format!(
62        "PRAGMA user_version = {};",
63        crate::constants::SCHEMA_USER_VERSION
64    ))?;
65    Ok(())
66}
67
68/// Ensures the database file exists and the schema is at the current version.
69///
70/// Behavior:
71/// - DB does not exist: creates the file, applies init PRAGMAs, runs all migrations,
72///   sets `PRAGMA user_version`, and populates `schema_meta` with default values.
73///   Emits `tracing::info!` on creation.
74/// - DB exists with `user_version` below `SCHEMA_USER_VERSION`: runs the remaining
75///   migrations and updates `user_version`. Emits `tracing::warn!` on auto-migration.
76/// - DB exists with `user_version` equal to `SCHEMA_USER_VERSION`: no-op.
77///
78/// This helper unifies the auto-init contract across CRUD handlers so users can run
79/// any subcommand on a fresh directory without invoking `init` first. Idempotent
80/// and safe to call before every handler that needs a ready database.
81pub fn ensure_db_ready(paths: &AppPaths) -> Result<(), AppError> {
82    register_vec_extension();
83    paths.ensure_dirs()?;
84
85    let db_existed = paths.db.exists();
86
87    if !db_existed {
88        tracing::info!(target: "storage",
89            path = %paths.db.display(),
90            schema_version = crate::constants::CURRENT_SCHEMA_VERSION,
91            "creating database (auto-init)"
92        );
93    }
94
95    let mut conn = open_rw(&paths.db)?;
96
97    if !db_existed {
98        apply_init_pragmas(&conn)?;
99    }
100
101    let current_user_version: i64 = conn
102        .query_row("PRAGMA user_version", [], |row| row.get(0))
103        .unwrap_or(0);
104    let target_user_version = crate::constants::SCHEMA_USER_VERSION;
105
106    if current_user_version < target_user_version {
107        if db_existed {
108            tracing::warn!(target: "storage",
109                from = current_user_version,
110                to = target_user_version,
111                path = %paths.db.display(),
112                "auto-migrating database schema"
113            );
114        }
115        crate::migrations::runner()
116            .run(&mut conn)
117            .map_err(|e| AppError::Internal(anyhow::anyhow!("auto-migration failed: {e}")))?;
118        conn.execute_batch(&format!("PRAGMA user_version = {target_user_version};"))?;
119
120        if !db_existed {
121            insert_default_schema_meta(&conn)?;
122        }
123
124        // Defensive re-assertion: refinery's migration runner may open internal
125        // handles that revert journal_mode to delete on some platforms. Re-apply
126        // WAL after migrations to guarantee the documented contract holds for
127        // every command that goes through the auto-init path.
128        ensure_wal_mode(&conn)?;
129    }
130
131    // G41 repair: if V013 is in history but embedding tables are missing,
132    // execute V013 SQL directly. Runs unconditionally because databases
133    // corrupted by G41 already have user_version=50 and skip the block above.
134    crate::commands::migrate::ensure_v013_tables_exist(&conn)?;
135
136    // G42/S1 (v1.0.79): synchronise the active embedding dimensionality
137    // with the database. Existing databases keep their recorded `dim`
138    // (e.g. 384 from pre-v1.0.79); an explicit env/flag override is
139    // persisted back so `health --json` reports the truth. This is an
140    // UPDATE of an existing `schema_meta` key — ZERO schema change.
141    sync_embedding_dim_meta(&conn)?;
142
143    Ok(())
144}
145
146/// G42/S1: two-way sync between `schema_meta.dim` and the process-wide
147/// active embedding dimensionality.
148///
149/// - env/flag override set → persist it into `schema_meta.dim`;
150/// - no override → adopt the database value via
151///   [`crate::constants::set_active_embedding_dim`] so old 384-dim
152///   databases keep producing and querying 384-dim vectors;
153/// - key missing (legacy/corrupt meta) → write the resolved default.
154fn sync_embedding_dim_meta(conn: &Connection) -> Result<(), AppError> {
155    let db_dim: Option<usize> = conn
156        .query_row(
157            "SELECT value FROM schema_meta WHERE key = 'dim'",
158            [],
159            |row| row.get::<_, String>(0),
160        )
161        .ok()
162        .and_then(|v| v.parse::<usize>().ok());
163
164    if let Some(env_dim) = crate::constants::embedding_dim_from_env() {
165        if db_dim != Some(env_dim) {
166            conn.execute(
167                "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('dim', ?1)",
168                rusqlite::params![env_dim.to_string()],
169            )?;
170        }
171        return Ok(());
172    }
173
174    match db_dim {
175        Some(dim) => crate::constants::set_active_embedding_dim(dim),
176        None => {
177            conn.execute(
178                "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('dim', ?1)",
179                rusqlite::params![crate::constants::embedding_dim().to_string()],
180            )?;
181        }
182    }
183    Ok(())
184}
185
186fn insert_default_schema_meta(conn: &Connection) -> Result<(), AppError> {
187    conn.execute(
188        "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('schema_version', ?1)",
189        rusqlite::params![crate::constants::CURRENT_SCHEMA_VERSION.to_string()],
190    )?;
191    conn.execute(
192        "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('model', 'multilingual-e5-small')",
193        [],
194    )?;
195    conn.execute(
196        "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('dim', ?1)",
197        rusqlite::params![crate::constants::embedding_dim().to_string()],
198    )?;
199    conn.execute(
200        "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('created_at', CAST(unixepoch() AS TEXT))",
201        [],
202    )?;
203    conn.execute(
204        "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('sqlite-graphrag_version', ?1)",
205        rusqlite::params![crate::constants::SQLITE_GRAPHRAG_VERSION],
206    )?;
207    Ok(())
208}
209
/// Restricts the SQLite file and its WAL/SHM companions to mode `0600`
/// (owner read/write only) on Unix, to avoid leaking private memories in
/// shared directories (e.g. multi-user /tmp, Dropbox, NFS).
///
/// The companion files are located by appending `-wal` / `-shm` to the FULL
/// database file name, which is how SQLite names them. They are therefore
/// found for any database name, including names without an extension
/// (`memories` → `memories-wal`) or with a non-UTF-8 extension.
///
/// Files that do not exist are skipped. Failures are deliberately silent so
/// the operation is not blocked when the process does not own the file
/// (e.g. read-only mount).
///
/// On Windows no mode is set; a debug event records the skip, on the premise
/// that the NTFS DACL default is private-to-user.
#[allow(unused_variables)] // `path` is unused on targets that are neither Unix nor Windows.
fn apply_secure_permissions(path: &Path) {
    #[cfg(unix)]
    {
        use std::os::unix::fs::PermissionsExt;

        // Append to the whole file name instead of rewriting the extension:
        // `with_extension` would turn `memories` into `memories.sqlite-wal`,
        // a file SQLite never creates.
        let companion = |suffix: &str| {
            let mut name = path.as_os_str().to_os_string();
            name.push(suffix);
            std::path::PathBuf::from(name)
        };

        let candidates = [path.to_path_buf(), companion("-wal"), companion("-shm")];
        for file in &candidates {
            // `metadata` failing covers the "file is absent" case, so no
            // separate existence probe is needed.
            if let Ok(meta) = std::fs::metadata(file) {
                let mut perms = meta.permissions();
                perms.set_mode(0o600);
                // Best-effort by design: a failed chmod must not fail the open.
                let _ = std::fs::set_permissions(file, perms);
            }
        }
    }
    #[cfg(windows)]
    {
        tracing::debug!(target: "storage",
            path = %path.display(),
            "skipping Unix mode 0o600 on Windows; NTFS DACL default is private-to-user"
        );
    }
}
254
255pub fn open_ro(path: &Path) -> Result<Connection, AppError> {
256    let conn = Connection::open_with_flags(
257        path,
258        rusqlite::OpenFlags::SQLITE_OPEN_READ_ONLY | rusqlite::OpenFlags::SQLITE_OPEN_URI,
259    )?;
260    conn.execute_batch("PRAGMA foreign_keys = ON;")?;
261    // G43: read-only commands (`recall`, `hybrid-search`) embed the QUERY
262    // text, so they must adopt the database dimensionality too.
263    adopt_embedding_dim(&conn);
264    Ok(conn)
265}
266
#[cfg(test)]
mod tests {
    use super::*;

    /// Environment variable the tests set and clear as the dimensionality
    /// override.
    const DIM_ENV: &str = "SQLITE_GRAPHRAG_EMBEDDING_DIM";

    /// Restores the process-wide dimensionality state when dropped.
    ///
    /// Cleanup lives in `Drop` so it also runs when a test panics (a failed
    /// `expect` or assertion). Otherwise an adopted dim or a leftover env
    /// override would leak into tests that run later in the same process.
    struct DimStateReset;

    impl Drop for DimStateReset {
        fn drop(&mut self) {
            std::env::remove_var(DIM_ENV);
            crate::constants::set_active_embedding_dim(crate::constants::DEFAULT_EMBEDDING_DIM);
        }
    }

    /// Creates a temporary database holding only a `schema_meta` table whose
    /// `dim` row is `dim`. The seeding connection is closed before returning.
    ///
    /// The `TempDir` is returned alongside the path because dropping it
    /// deletes the directory; callers must keep it alive for the whole test.
    fn seed_db_with_dim(file_name: &str, dim: usize) -> (tempfile::TempDir, std::path::PathBuf) {
        let dir = tempfile::tempdir().expect("tempdir");
        let db = dir.path().join(file_name);
        let conn = Connection::open(&db).expect("create seed db");
        conn.execute_batch(&format!(
            "CREATE TABLE schema_meta (key TEXT PRIMARY KEY, value TEXT);
             INSERT INTO schema_meta VALUES ('dim', '{dim}');"
        ))
        .expect("seed schema_meta");
        (dir, db)
    }

    /// G43 regression: `open_rw` must adopt `schema_meta.dim` so EVERY
    /// command (not only the `ensure_db_ready` auto-init path) produces
    /// vectors of the database dimensionality. Pre-G43, `remember` /
    /// `edit` / `recall` / `hybrid-search` used the compiled default
    /// against pre-v1.0.79 384-dim databases, silently writing
    /// mixed-dim embeddings that cosine-score 0.0 against each other.
    #[test]
    #[serial_test::serial(env)]
    fn open_rw_adopts_schema_meta_dim() {
        let (_dir, db) = seed_db_with_dim("g43.sqlite", 128);
        let _reset = DimStateReset;
        std::env::remove_var(DIM_ENV);

        let _conn = open_rw(&db).expect("open_rw");

        assert_eq!(
            crate::constants::embedding_dim(),
            128,
            "open_rw must adopt the recorded db dim (G43)"
        );
    }

    /// G43 regression: `open_ro` (used by `recall` / `hybrid-search` to
    /// embed the QUERY text) must adopt the database dim too.
    #[test]
    #[serial_test::serial(env)]
    fn open_ro_adopts_schema_meta_dim() {
        let (_dir, db) = seed_db_with_dim("g43-ro.sqlite", 256);
        let _reset = DimStateReset;
        std::env::remove_var(DIM_ENV);

        let _conn = open_ro(&db).expect("open_ro");

        assert_eq!(
            crate::constants::embedding_dim(),
            256,
            "open_ro must adopt the recorded db dim (G43)"
        );
    }

    /// G43: the env override always wins over the recorded database dim
    /// (precedence contract of `constants::embedding_dim`).
    #[test]
    #[serial_test::serial(env)]
    fn env_override_wins_over_schema_meta_dim() {
        let (_dir, db) = seed_db_with_dim("g43-env.sqlite", 128);
        // The guard is armed BEFORE the override is set, so the variable is
        // removed even if `open_rw` panics.
        let _reset = DimStateReset;
        std::env::set_var(DIM_ENV, "96");

        let _conn = open_rw(&db).expect("open_rw");

        assert_eq!(
            crate::constants::embedding_dim(),
            96,
            "env override must win over the db dim (G43)"
        );
    }

    /// G43: a virgin database without `schema_meta` must open cleanly
    /// (best-effort adoption is a no-op, never an error).
    #[test]
    #[serial_test::serial(env)]
    fn open_rw_on_virgin_db_is_a_noop() {
        let dir = tempfile::tempdir().expect("tempdir");
        let db = dir.path().join("g43-virgin.sqlite");
        let _reset = DimStateReset;
        std::env::remove_var(DIM_ENV);
        // Start from a known baseline so the assertion below is meaningful.
        crate::constants::set_active_embedding_dim(crate::constants::DEFAULT_EMBEDDING_DIM);

        let _conn = open_rw(&db).expect("open_rw on virgin db must not fail");

        assert_eq!(
            crate::constants::embedding_dim(),
            crate::constants::DEFAULT_EMBEDDING_DIM,
            "virgin db must keep the compiled default (G43)"
        );
    }
}
360}