solo-storage 0.5.0

Solo: SQLite + SQLCipher persistence layer
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
// SPDX-License-Identifier: Apache-2.0

//! Online SQLCipher backup.
//!
//! Solo derives its 32-byte SQLCipher key on the fly via Argon2id from the
//! user's passphrase + the persisted salt in `solo.config.toml`. SQLCipher's
//! standard CLI `.backup` command uses PBKDF2 to turn a passphrase into a
//! key, which produces a different value than Solo's Argon2id derivation —
//! so the obvious `sqlcipher … PRAGMA key = 'passphrase'; .backup target.db`
//! recipe fails with "file is not a database" against a Solo data dir.
//!
//! This module exposes [`backup_database`] — a programmatic equivalent that
//! threads the raw key through SQLite's online backup API. Both source and
//! destination are opened with `PRAGMA key = "x'<hex>'"` (raw form), so the
//! resulting backup file is encrypted with the same key as the source and
//! restores cleanly when paired with a copy of `solo.config.toml`.
//!
//! ## What this is not
//!
//! [`backup_database`] is not a "hot backup against a running daemon."
//! Callers must hold the `solo.lock` lockfile around the call (i.e., no
//! other Solo process can be touching the data dir). The daemon-side
//! hot-backup path (`WriteCommand::Backup`) goes through
//! [`backup_from_connection`] instead, which runs against the writer's
//! existing connection.

use std::path::Path;

use rusqlite::Connection;
use rusqlite::backup::Backup;

use crate::init::open_sqlcipher;
use crate::key_material::KeyMaterial;
use solo_core::{Error, Result};

/// Default page-step size for the backup loop. SQLCipher pages are 4 KiB by
/// default, so 100 pages = 400 KiB per step. Small enough that a SIGINT
/// during backup tears down quickly; large enough that the per-step
/// overhead is negligible for typical (single-digit GB) corpora.
///
/// [`backup_from_connection`] passes this value as the pages-per-step
/// argument of `Backup::run_to_completion`.
pub const DEFAULT_BACKUP_PAGES_PER_STEP: i32 = 100;

/// Run an online SQLCipher backup of `src_path` to `dest_path`, encrypting
/// the destination with the same raw key.
///
/// Both source and destination are opened with `PRAGMA key = "x'<hex>'"`
/// (raw key form). The destination file is created if missing; if it
/// already exists, its contents are overwritten by the backup.
///
/// Returns `Err(Conflict)` if the source can't be opened with the supplied
/// key (typically a wrong passphrase / wrong salt — the source isn't
/// actually decryptable).
///
/// ## Lockfile responsibility
///
/// Callers must hold `solo.lock` around this call. The function does not
/// acquire it itself — that's a one-shot-vs-daemon coordination concern
/// best left to the caller.
pub fn backup_database(
    src_path: &Path,
    dest_path: &Path,
    key: &KeyMaterial,
) -> Result<()> {
    // Source: full Solo-style open (PRAGMA key + WAL + foreign_keys +
    // busy_timeout). open_sqlcipher's `PRAGMA journal_mode = wal` query
    // forces decryption — a wrong key surfaces here, before we touch
    // the destination.
    let src = open_sqlcipher(src_path, key)?;
    let result = backup_from_connection(&src, dest_path, key);
    // Close the source explicitly so any deferred error (e.g. WAL
    // checkpoint failure) surfaces here rather than on Drop.
    if let Err((_, e)) = src.close() {
        return Err(Error::storage(format!("close source after backup: {e}")));
    }
    result
}

/// Compare two paths for "do they refer to the same file on disk."
///
/// The source is canonicalised. The destination is resolved in two steps:
///
/// 1. If the destination already exists it is canonicalised in full, so a
///    destination that is a symlink to the source is recognised as the
///    same file.
/// 2. Otherwise (the destination may not exist yet) its parent directory
///    is canonicalised and the filename reattached.
///
/// Returns `false` for any path that can't be canonicalised at all (don't
/// infer equality from a missing source). Hard links are not detected:
/// the comparison is purely on resolved paths.
///
/// Callers (CLI `solo backup`, HTTP `POST /backup`, the daemon's
/// `WriteCommand::Backup`) use this to refuse a backup that would
/// destroy the live source database. The `--force` flag's
/// `remove_file(dest)` step is destructive when `dest == source`, so
/// the check MUST run before that step — see the v0.3.4 release notes
/// for what happens when this guard is missing.
pub fn paths_refer_to_same_file(src: &Path, dest: &Path) -> bool {
    let Ok(src_canon) = std::fs::canonicalize(src) else {
        return false;
    };

    // An existing destination can be resolved completely. This is the only
    // way to notice that `dest` is a symlink whose target is the source;
    // canonicalising just the parent would leave the link name in place.
    if let Ok(dest_canon) = std::fs::canonicalize(dest) {
        if dest_canon == src_canon {
            return true;
        }
    }

    // Fallback for a destination that doesn't exist yet (or couldn't be
    // resolved): canonicalise the parent and reattach the filename.
    //
    // `Path::parent` returns `Some("")` for a bare filename like
    // `solo.db`. Treat that as the current directory so
    // canonicalisation succeeds.
    let dest_parent = match dest.parent() {
        Some(p) if !p.as_os_str().is_empty() => p,
        _ => Path::new("."),
    };
    let (Ok(dest_parent_canon), Some(dest_file)) =
        (std::fs::canonicalize(dest_parent), dest.file_name())
    else {
        return false;
    };
    dest_parent_canon.join(dest_file) == src_canon
}

/// Run an online SQLCipher backup using an already-open source connection.
///
/// The daemon-side hot-backup path uses this: the writer's existing
/// connection is the source (so the backup runs against live in-flight
/// writer state via SQLite's page-level snapshot), and we open + key the
/// destination fresh. Callers that don't have an open connection can use
/// [`backup_database`] instead.
///
/// `key` is the same raw `KeyMaterial` the source connection was opened
/// with — used to encrypt the destination so it restores under the same
/// passphrase + salt.
pub fn backup_from_connection(
    src: &Connection,
    dest_path: &Path,
    key: &KeyMaterial,
) -> Result<()> {
    // Defense-in-depth: refuse if dest is the same file as src. SQLite's
    // online backup is undefined behavior when source and destination
    // are the same database. Note: the CLI / HTTP layers check this
    // BEFORE any destructive `remove_file(dest)` for `--force`. By the
    // time we reach this function, that pre-flight has already passed
    // (or there was no `--force`); this is the second line of defense.
    if let Some(src_str) = src.path() {
        if paths_refer_to_same_file(Path::new(src_str), dest_path) {
            return Err(Error::invalid_input(format!(
                "backup destination {} is the same file as the source database; \
                 refusing to overwrite (would corrupt the live database)",
                dest_path.display()
            )));
        }
    }

    // Destination: minimal open. We don't run startup pragmas; the
    // backup overwrites the entire database (header + pages), so any
    // pragma we set here would be discarded. We DO need PRAGMA key
    // upfront so SQLCipher writes encrypted pages.
    let mut dst = Connection::open(dest_path).map_err(|e| {
        Error::storage(format!(
            "open backup destination {}: {e}",
            dest_path.display()
        ))
    })?;
    let key_pragma = {
        let hex = key.as_hex();
        format!("PRAGMA key = \"x'{}'\"", &*hex)
    };
    dst.execute_batch(&key_pragma)
        .map_err(|e| Error::storage(format!("PRAGMA key on backup destination: {e}")))?;

    // SQLite's online backup. `Backup::new` borrows both connections;
    // `run_to_completion` drives the page-copy loop in-process. SQLite
    // takes a page-level snapshot of `src`, so concurrent writes on
    // the source are safe — the backup sees a consistent view as of
    // `Backup::new` time. The `pause_between_pages_ms = 0` argument
    // means "no throttle" — for a personal-scale corpus the backup
    // finishes in well under a second per GB of source.
    let backup = Backup::new(src, &mut dst)
        .map_err(|e| Error::storage(format!("Backup::new: {e}")))?;
    backup
        .run_to_completion(
            DEFAULT_BACKUP_PAGES_PER_STEP,
            std::time::Duration::from_millis(0),
            None,
        )
        .map_err(|e| Error::storage(format!("Backup::run_to_completion: {e}")))?;

    // Drop the backup struct first (releases its borrows on src + dst),
    // then close the destination explicitly so any deferred error
    // surfaces here rather than on Drop.
    drop(backup);
    dst.close()
        .map_err(|(_, e)| Error::storage(format!("close destination after backup: {e}")))?;

    Ok(())
}

// Tests for the backup entry points. Every test below is `#[ignore]`d
// because it needs a real SQLCipher build; see the ignore reason on
// `backup_round_trip_preserves_database` for how to run them.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::{EmbedderConfig, SoloConfig};
    use crate::init::{InitParams, init};
    use tempfile::TempDir;
    use zeroize::Zeroizing;

    /// Initialise a fresh Solo data dir at `dir` with `passphrase` (via
    /// `init`) and return the config read back from the path `init`
    /// reported. Panics if either step fails.
    fn fresh_init(dir: &Path, passphrase: &str) -> SoloConfig {
        let outcome = init(InitParams {
            data_dir: dir.to_path_buf(),
            passphrase: Zeroizing::new(passphrase.to_string()),
            force: false,
            embedder: EmbedderConfig {
                name: "BAAI/bge-m3".into(),
                version: "v1".into(),
                dim: 1024,
                dtype: "f32".into(),
            },
        })
        .expect("init");
        SoloConfig::read(&outcome.config_path).expect("read config")
    }

    /// `backup_database` produces a file that opens with the same key,
    /// contains the row inserted into the source, and does not open with
    /// a key derived from a different passphrase.
    #[test]
    #[ignore = "requires SQLCipher: under plain bundled SQLite, PRAGMA key is a no-op so wrong keys silently succeed. Run with the workspace's bundled-sqlcipher-vendored-openssl feature: `cargo test -p solo-storage -- --include-ignored`"]
    fn backup_round_trip_preserves_database() {
        let src_dir = TempDir::new().unwrap();
        let dest_dir = TempDir::new().unwrap();
        let passphrase = "round-trip test passphrase";

        let cfg = fresh_init(src_dir.path(), passphrase);
        let salt = cfg.salt_bytes().unwrap();
        let key = KeyMaterial::derive(passphrase, &salt).unwrap();

        // Insert a sentinel row so we can verify the backup carried
        // it across.
        {
            let conn = open_sqlcipher(&src_dir.path().join("solo.db"), &key).unwrap();
            conn.execute(
                "INSERT INTO episodes (memory_id, ts_ms, source_type, content,
                                       encoding_context_json, status, tier,
                                       confidence, strength, salience,
                                       created_at_ms, updated_at_ms)
                 VALUES (?, ?, 'test', 'sentinel', '{}', 'active', 'hot',
                         0.9, 0.5, 0.5, ?, ?)",
                rusqlite::params![
                    "01900000-0000-7000-8000-000000000001",
                    0i64,
                    0i64,
                    0i64
                ],
            )
            .expect("insert sentinel");
        }

        // Run the backup.
        let dest_path = dest_dir.path().join("solo-backup.db");
        backup_database(&src_dir.path().join("solo.db"), &dest_path, &key)
            .expect("backup_database");

        // Open the backup with the SAME key — should succeed and the
        // sentinel row should be present.
        let dst = open_sqlcipher(&dest_path, &key).expect("open backup with same key");
        let row_count: i64 = dst
            .query_row(
                "SELECT COUNT(*) FROM episodes WHERE memory_id = ?",
                rusqlite::params!["01900000-0000-7000-8000-000000000001"],
                |row| row.get(0),
            )
            .expect("query backup");
        assert_eq!(row_count, 1, "sentinel row should be present in backup");

        // Opening with a DIFFERENT key should fail (wrong-key →
        // SQLCipher refuses to decrypt the header).
        let bad_key = KeyMaterial::derive("WRONG PASSPHRASE", &salt).unwrap();
        let bad_open = open_sqlcipher(&dest_path, &bad_key);
        assert!(
            bad_open.is_err(),
            "opening backup with wrong key should fail"
        );
    }

    /// A backup requested through a live writer's handle produces a file
    /// that opens with the same key and contains the sentinel row.
    #[test]
    #[ignore = "requires SQLCipher (see backup_round_trip_preserves_database)"]
    fn hot_backup_via_writer_round_trip() {
        // Daemon-side hot backup path: writer is alive, backup runs
        // through `WriteHandle::backup` against the writer's existing
        // connection.
        use crate::vector_index::HnswIndex;
        use crate::writer::{WriterActor, WriterSpawn};
        use crate::embedder::StubEmbedder;
        use crate::embedder_registry::get_or_insert_embedder_id;
        use std::sync::Arc;

        let src_dir = TempDir::new().unwrap();
        let dest_dir = TempDir::new().unwrap();
        let passphrase = "hot-backup test passphrase";

        let cfg = fresh_init(src_dir.path(), passphrase);
        let salt = cfg.salt_bytes().unwrap();
        let key = KeyMaterial::derive(passphrase, &salt).unwrap();

        // Insert a sentinel so we can verify it traveled.
        {
            let conn = open_sqlcipher(&src_dir.path().join("solo.db"), &key).unwrap();
            conn.execute(
                "INSERT INTO episodes (memory_id, ts_ms, source_type, content,
                                       encoding_context_json, status, tier,
                                       confidence, strength, salience,
                                       created_at_ms, updated_at_ms)
                 VALUES (?, ?, 'test', 'hot-sentinel', '{}', 'active', 'hot',
                         0.9, 0.5, 0.5, ?, ?)",
                rusqlite::params![
                    "01900000-0000-7000-8000-000000000002",
                    0i64,
                    0i64,
                    0i64
                ],
            )
            .unwrap();
        }

        // Spawn a key-aware writer.
        let runtime = tokio::runtime::Builder::new_multi_thread()
            .worker_threads(1)
            .enable_all()
            .build()
            .unwrap();

        runtime.block_on(async {
            // `conn` is handed to the writer; `conn_for_id` is a separate,
            // short-lived connection used only to look up / register the
            // embedder id, and is dropped before the writer is spawned.
            let conn = open_sqlcipher(&src_dir.path().join("solo.db"), &key).unwrap();
            let mut conn_for_id = open_sqlcipher(&src_dir.path().join("solo.db"), &key).unwrap();
            let identity = crate::embedder_registry::EmbedderIdentity {
                name: cfg.embedder.name.clone(),
                version: cfg.embedder.version.clone(),
                dim: cfg.embedder.dim,
                dtype: cfg.embedder.dtype.clone(),
            };
            let embedder_id = get_or_insert_embedder_id(&mut conn_for_id, &identity).unwrap();
            drop(conn_for_id);
            let hnsw = Arc::new(HnswIndex::new(
                cfg.embedder.dim as usize,
                crate::vector_index::HnswParams::default(),
            ));
            let embedder: Arc<dyn solo_core::Embedder> = Arc::new(StubEmbedder::new(
                &cfg.embedder.name,
                &cfg.embedder.version,
                cfg.embedder.dim as usize,
            ));

            let WriterSpawn { handle, join } =
                WriterActor::spawn_full_with_key_and_optional_steward(
                    conn,
                    hnsw,
                    src_dir.path().to_path_buf(),
                    embedder_id,
                    embedder,
                    None,
                    key.clone(),
                );

            let dest_path = dest_dir.path().join("solo-hot-backup.db");
            handle.backup(dest_path.clone()).await.expect("hot backup");

            // Drop handle, wait for writer thread to settle.
            // The join is a blocking call, so it is moved onto the
            // blocking pool; its outcome is deliberately ignored.
            drop(handle);
            tokio::task::spawn_blocking(move || join.join().ok()).await.ok();

            // Open backup with the same key and verify the sentinel.
            let dst = open_sqlcipher(&dest_path, &key).unwrap();
            let n: i64 = dst
                .query_row(
                    "SELECT COUNT(*) FROM episodes WHERE memory_id = ?",
                    rusqlite::params!["01900000-0000-7000-8000-000000000002"],
                    |row| row.get(0),
                )
                .unwrap();
            assert_eq!(n, 1, "hot-backup sentinel should be present");
        });
    }

    /// `backup_database` refuses a destination that is the source file
    /// itself, both for the identical path and for the same path spelled
    /// with a redundant `./` segment.
    #[test]
    #[ignore = "requires SQLCipher (see backup_round_trip_preserves_database)"]
    fn backup_to_same_file_as_source_refused() {
        // Pre-flight check: if `to` resolves to the same file as the
        // live `solo.db`, refuse with InvalidInput (HTTP-layer 400).
        // SQLite's online backup is undefined behavior in this case —
        // the safety check exists so a careless config doesn't corrupt
        // the source.
        let src_dir = TempDir::new().unwrap();
        let passphrase = "same-file refusal test";

        let cfg = fresh_init(src_dir.path(), passphrase);
        let salt = cfg.salt_bytes().unwrap();
        let key = KeyMaterial::derive(passphrase, &salt).unwrap();

        let live_db = src_dir.path().join("solo.db");
        let result = backup_database(&live_db, &live_db, &key);
        let err = result.expect_err("must refuse same-file backup");
        // The assertion is on the message text only; the error variant
        // is not inspected here.
        let msg = err.to_string();
        assert!(
            msg.contains("same file") && msg.contains("refusing"),
            "error should explain why: got `{msg}`"
        );

        // Also catches the Path-equivalence case with redundant
        // separators / `.` segments. Canonicalisation handles this.
        let live_db_alt = src_dir.path().join("./solo.db");
        let result2 = backup_database(&live_db, &live_db_alt, &key);
        assert!(
            result2.is_err(),
            "redundant ./ in dest path should still be caught"
        );
    }

    /// `backup_database` returns an error when the supplied key was
    /// derived from a passphrase other than the one the source was
    /// initialised with.
    #[test]
    #[ignore = "requires SQLCipher (see backup_round_trip_preserves_database)"]
    fn backup_with_wrong_source_key_fails() {
        let src_dir = TempDir::new().unwrap();
        let dest_dir = TempDir::new().unwrap();
        let passphrase = "real passphrase";

        let cfg = fresh_init(src_dir.path(), passphrase);
        let salt = cfg.salt_bytes().unwrap();
        // Same salt, different passphrase → a different derived key.
        let wrong_key = KeyMaterial::derive("not the real one", &salt).unwrap();

        let dest_path = dest_dir.path().join("solo-backup.db");
        let result =
            backup_database(&src_dir.path().join("solo.db"), &dest_path, &wrong_key);
        assert!(
            result.is_err(),
            "backup with wrong source key should fail at open"
        );
    }
}