solo-storage 0.3.3

Solo: SQLite + SQLCipher persistence layer
Documentation
// SPDX-License-Identifier: Apache-2.0

//! Online SQLCipher backup.
//!
//! Solo derives its 32-byte SQLCipher key on the fly via Argon2id from the
//! user's passphrase + the persisted salt in `solo.config.toml`. SQLCipher's
//! standard CLI `.backup` command uses PBKDF2 to turn a passphrase into a
//! key, which produces a different value than Solo's Argon2id derivation —
//! so the obvious `sqlcipher … PRAGMA key = 'passphrase'; .backup target.db`
//! recipe fails with "file is not a database" against a Solo data dir.
//!
//! This module exposes [`backup_database`] — a programmatic equivalent that
//! threads the raw key through SQLite's online backup API. Both source and
//! destination are opened with `PRAGMA key = "x'<hex>'"` (raw form), so the
//! resulting backup file is encrypted with the same key as the source and
//! restores cleanly when paired with a copy of `solo.config.toml`.
//!
//! ## What this is not
//!
//! [`backup_database`] is not a "hot backup against a running daemon."
//! Callers must hold the `solo.lock` lockfile around the call (i.e., no
//! other Solo process can be touching the data dir). For the daemon-side
//! hot-backup path, use [`backup_from_connection`], which runs against
//! the writer's existing connection instead of opening the source itself.

use std::path::Path;

use rusqlite::Connection;
use rusqlite::backup::Backup;

use crate::init::open_sqlcipher;
use crate::key_material::KeyMaterial;
use solo_core::{Error, Result};

/// Number of pages copied per step of the backup loop.
///
/// With SQLCipher's default 4 KiB page size this works out to 400 KiB per
/// step: a small enough unit that an interrupted backup (e.g. SIGINT)
/// tears down promptly, yet large enough that per-step overhead stays
/// negligible for typical (single-digit GB) corpora.
pub const DEFAULT_BACKUP_PAGES_PER_STEP: i32 = 100;

/// Run an online SQLCipher backup of `src_path` to `dest_path`, encrypting
/// the destination with the same raw key.
///
/// Both source and destination are opened with `PRAGMA key = "x'<hex>'"`
/// (raw key form). The destination file is created if missing; if it
/// already exists, its contents are overwritten by the backup.
///
/// Returns `Err(Conflict)` if the source can't be opened with the supplied
/// key (typically a wrong passphrase / wrong salt — the source isn't
/// actually decryptable).
///
/// ## Lockfile responsibility
///
/// Callers must hold `solo.lock` around this call. The function does not
/// acquire it itself — that's a one-shot-vs-daemon coordination concern
/// best left to the caller.
pub fn backup_database(
    src_path: &Path,
    dest_path: &Path,
    key: &KeyMaterial,
) -> Result<()> {
    // Source: full Solo-style open (PRAGMA key + WAL + foreign_keys +
    // busy_timeout). open_sqlcipher's `PRAGMA journal_mode = wal` query
    // forces decryption — a wrong key surfaces here, before we touch
    // the destination.
    let src = open_sqlcipher(src_path, key)?;
    let result = backup_from_connection(&src, dest_path, key);
    // Close the source explicitly so any deferred error (e.g. WAL
    // checkpoint failure) surfaces here rather than on Drop.
    if let Err((_, e)) = src.close() {
        return Err(Error::storage(format!("close source after backup: {e}")));
    }
    result
}

/// Run an online SQLCipher backup using an already-open source connection.
///
/// The daemon-side hot-backup path uses this: the writer's existing
/// connection is the source, and the destination is opened and keyed
/// fresh here. Callers that don't have an open connection can use
/// [`backup_database`] instead.
///
/// `key` is the same raw `KeyMaterial` the source connection was opened
/// with — used to encrypt the destination so it restores under the same
/// passphrase + salt.
///
/// # Errors
///
/// * An invalid-input error if `dest_path` resolves to the same file as
///   the source database (including via a symlink at `dest_path`). The
///   check is best-effort: it is skipped when the source connection has
///   no resolvable on-disk path.
/// * A storage error if the destination can't be opened or keyed, if the
///   backup can't be initialised or fails part-way, or if closing the
///   destination fails. A failed backup may leave a partially written
///   destination file behind.
pub fn backup_from_connection(
    src: &Connection,
    dest_path: &Path,
    key: &KeyMaterial,
) -> Result<()> {
    // Pre-flight: refuse if dest is the same file as src. Running the
    // online backup with source and destination on the same database
    // file could silently corrupt the live file, so bail out before the
    // destination is even opened.
    if let Some(src_str) = src.path() {
        if dest_aliases_source(Path::new(src_str), dest_path) {
            return Err(Error::invalid_input(format!(
                "backup destination {} is the same file as the source database; \
                 refusing to overwrite (would corrupt the live database)",
                dest_path.display()
            )));
        }
    }

    // Destination: minimal open. We don't run startup pragmas; the
    // backup overwrites the entire database (header + pages), so any
    // pragma we set here would be discarded. We DO need PRAGMA key
    // upfront so SQLCipher writes encrypted pages.
    let mut dst = Connection::open(dest_path).map_err(|e| {
        Error::storage(format!(
            "open backup destination {}: {e}",
            dest_path.display()
        ))
    })?;
    // NOTE(review): the formatted pragma is a plain `String` containing
    // the hex key and is not wiped on drop — consider a zeroizing wrapper.
    let key_pragma = {
        let hex = key.as_hex();
        format!("PRAGMA key = \"x'{}'\"", &*hex)
    };
    dst.execute_batch(&key_pragma)
        .map_err(|e| Error::storage(format!("PRAGMA key on backup destination: {e}")))?;

    // SQLite's online backup. `Backup::new` borrows both connections;
    // `run_to_completion` drives the page-copy loop in-process, copying
    // `DEFAULT_BACKUP_PAGES_PER_STEP` pages per step.
    //
    // Consistency: per SQLite's backup documentation, writes made through
    // `src` itself while the backup is running are mirrored into the
    // destination, and writes made through any *other* connection cause
    // the copy to restart on the next step. Either way the finished
    // destination is a consistent image of the source as of completion.
    //
    // The zero pause between steps means "no throttle".
    // NOTE(review): with a zero pause, busy/locked steps are presumably
    // retried without sleeping — confirm that is acceptable for callers.
    let backup = Backup::new(src, &mut dst)
        .map_err(|e| Error::storage(format!("Backup::new: {e}")))?;
    backup
        .run_to_completion(
            DEFAULT_BACKUP_PAGES_PER_STEP,
            std::time::Duration::from_millis(0),
            None,
        )
        .map_err(|e| Error::storage(format!("Backup::run_to_completion: {e}")))?;

    // Drop the backup struct first (releases its borrows on src + dst),
    // then close the destination explicitly so any deferred error
    // surfaces here rather than on Drop.
    drop(backup);
    dst.close()
        .map_err(|(_, e)| Error::storage(format!("close destination after backup: {e}")))?;

    Ok(())
}

/// Best-effort test for "`dest_path` names the same file as `src_path`".
///
/// Returns `false` whenever the comparison cannot be made (for example
/// the source path cannot be canonicalised), so an unresolvable path
/// never blocks a backup on its own.
fn dest_aliases_source(src_path: &Path, dest_path: &Path) -> bool {
    let src_canon = match std::fs::canonicalize(src_path) {
        Ok(p) => p,
        Err(_) => return false,
    };

    // If the destination already exists, resolve it in full. This follows
    // a symlink sitting at `dest_path` itself, which the parent-only
    // resolution below cannot see.
    if let Ok(dest_canon) = std::fs::canonicalize(dest_path) {
        if dest_canon == src_canon {
            return true;
        }
    }

    // The destination may not exist yet, so canonicalise its parent and
    // reattach the filename. `Path::parent` returns `Some("")` for a bare
    // filename like `solo.db`; treat that as the current directory so
    // canonicalisation succeeds.
    let dest_parent = match dest_path.parent() {
        Some(p) if !p.as_os_str().is_empty() => p,
        _ => Path::new("."),
    };
    match (std::fs::canonicalize(dest_parent), dest_path.file_name()) {
        (Ok(parent_canon), Some(file_name)) => parent_canon.join(file_name) == src_canon,
        _ => false,
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::{EmbedderConfig, SoloConfig};
    use crate::init::{InitParams, init};
    use tempfile::TempDir;
    use zeroize::Zeroizing;

    /// Initialise a brand-new Solo data dir at `dir` under `passphrase`
    /// and return the config that `init` wrote, re-read from disk.
    fn fresh_init(dir: &Path, passphrase: &str) -> SoloConfig {
        let embedder = EmbedderConfig {
            name: "BAAI/bge-m3".into(),
            version: "v1".into(),
            dim: 1024,
            dtype: "f32".into(),
        };
        let params = InitParams {
            data_dir: dir.to_path_buf(),
            passphrase: Zeroizing::new(passphrase.to_owned()),
            force: false,
            embedder,
        };

        let outcome = init(params).expect("init");
        SoloConfig::read(&outcome.config_path).expect("read config")
    }

    /// End-to-end check of [`backup_database`]: a row written to the
    /// source must be readable from the backup under the same key, and
    /// the backup must refuse to open under a different key.
    #[test]
    #[ignore = "requires SQLCipher: under plain bundled SQLite, PRAGMA key is a no-op so wrong keys silently succeed. Run with the workspace's bundled-sqlcipher-vendored-openssl feature: `cargo test -p solo-storage -- --include-ignored`"]
    fn backup_round_trip_preserves_database() {
        const SENTINEL_ID: &str = "01900000-0000-7000-8000-000000000001";
        let passphrase = "round-trip test passphrase";

        let src_dir = TempDir::new().unwrap();
        let dest_dir = TempDir::new().unwrap();
        let src_db = src_dir.path().join("solo.db");
        let dest_path = dest_dir.path().join("solo-backup.db");

        let cfg = fresh_init(src_dir.path(), passphrase);
        let salt = cfg.salt_bytes().unwrap();
        let key = KeyMaterial::derive(passphrase, &salt).unwrap();

        // Seed the source with one recognisable row; the connection is
        // scoped so it is gone before the backup starts.
        {
            let seed = open_sqlcipher(&src_db, &key).unwrap();
            seed.execute(
                "INSERT INTO episodes (memory_id, ts_ms, source_type, content,
                                       encoding_context_json, status, tier,
                                       confidence, strength, salience,
                                       created_at_ms, updated_at_ms)
                 VALUES (?, ?, 'test', 'sentinel', '{}', 'active', 'hot',
                         0.9, 0.5, 0.5, ?, ?)",
                rusqlite::params![SENTINEL_ID, 0i64, 0i64, 0i64],
            )
            .expect("insert sentinel");
        }

        backup_database(&src_db, &dest_path, &key).expect("backup_database");

        // Same key: the backup opens and carries the sentinel.
        let restored = open_sqlcipher(&dest_path, &key).expect("open backup with same key");
        let matches: i64 = restored
            .query_row(
                "SELECT COUNT(*) FROM episodes WHERE memory_id = ?",
                rusqlite::params![SENTINEL_ID],
                |row| row.get(0),
            )
            .expect("query backup");
        assert_eq!(matches, 1, "sentinel row should be present in backup");

        // Different key: SQLCipher must refuse to decrypt the header.
        let bad_key = KeyMaterial::derive("WRONG PASSPHRASE", &salt).unwrap();
        assert!(
            open_sqlcipher(&dest_path, &bad_key).is_err(),
            "opening backup with wrong key should fail"
        );
    }

    /// Daemon-side hot backup: with a writer alive, the backup is driven
    /// through the writer handle's `backup` call against the writer's
    /// existing connection, and the result must contain the sentinel row.
    #[test]
    #[ignore = "requires SQLCipher (see backup_round_trip_preserves_database)"]
    fn hot_backup_via_writer_round_trip() {
        use crate::embedder::StubEmbedder;
        use crate::embedder_registry::{EmbedderIdentity, get_or_insert_embedder_id};
        use crate::vector_index::{HnswIndex, HnswParams};
        use crate::writer::{WriterActor, WriterSpawn};
        use std::sync::Arc;

        const SENTINEL_ID: &str = "01900000-0000-7000-8000-000000000002";
        let passphrase = "hot-backup test passphrase";

        let src_dir = TempDir::new().unwrap();
        let dest_dir = TempDir::new().unwrap();
        let src_db = src_dir.path().join("solo.db");

        let cfg = fresh_init(src_dir.path(), passphrase);
        let salt = cfg.salt_bytes().unwrap();
        let key = KeyMaterial::derive(passphrase, &salt).unwrap();

        // Seed a sentinel row through a short-lived connection.
        {
            let seed = open_sqlcipher(&src_db, &key).unwrap();
            seed.execute(
                "INSERT INTO episodes (memory_id, ts_ms, source_type, content,
                                       encoding_context_json, status, tier,
                                       confidence, strength, salience,
                                       created_at_ms, updated_at_ms)
                 VALUES (?, ?, 'test', 'hot-sentinel', '{}', 'active', 'hot',
                         0.9, 0.5, 0.5, ?, ?)",
                rusqlite::params![SENTINEL_ID, 0i64, 0i64, 0i64],
            )
            .unwrap();
        }

        // Single-worker runtime to host the key-aware writer.
        let runtime = tokio::runtime::Builder::new_multi_thread()
            .worker_threads(1)
            .enable_all()
            .build()
            .unwrap();

        runtime.block_on(async {
            // The writer's own connection is opened first; a second,
            // short-lived connection resolves the embedder id.
            let writer_conn = open_sqlcipher(&src_db, &key).unwrap();
            let embedder_id = {
                let mut registry_conn = open_sqlcipher(&src_db, &key).unwrap();
                let identity = EmbedderIdentity {
                    name: cfg.embedder.name.clone(),
                    version: cfg.embedder.version.clone(),
                    dim: cfg.embedder.dim,
                    dtype: cfg.embedder.dtype.clone(),
                };
                get_or_insert_embedder_id(&mut registry_conn, &identity).unwrap()
            };

            let dim = cfg.embedder.dim as usize;
            let hnsw = Arc::new(HnswIndex::new(dim, HnswParams::default()));
            let embedder: Arc<dyn solo_core::Embedder> = Arc::new(StubEmbedder::new(
                &cfg.embedder.name,
                &cfg.embedder.version,
                dim,
            ));

            let WriterSpawn { handle, join } =
                WriterActor::spawn_full_with_key_and_optional_steward(
                    writer_conn,
                    hnsw,
                    src_dir.path().to_path_buf(),
                    embedder_id,
                    embedder,
                    None,
                    key.clone(),
                );

            let dest_path = dest_dir.path().join("solo-hot-backup.db");
            handle.backup(dest_path.clone()).await.expect("hot backup");

            // Release the handle, then wait for the writer thread to
            // settle before inspecting the backup.
            drop(handle);
            tokio::task::spawn_blocking(move || join.join().ok()).await.ok();

            // The backup must open under the same key and hold the row.
            let restored = open_sqlcipher(&dest_path, &key).unwrap();
            let matches: i64 = restored
                .query_row(
                    "SELECT COUNT(*) FROM episodes WHERE memory_id = ?",
                    rusqlite::params![SENTINEL_ID],
                    |row| row.get(0),
                )
                .unwrap();
            assert_eq!(matches, 1, "hot-backup sentinel should be present");
        });
    }

    /// Pre-flight check: if the destination resolves to the same file as
    /// the live `solo.db`, the backup must be refused with an explanatory
    /// error, and the source must be left usable.
    #[test]
    #[ignore = "requires SQLCipher (see backup_round_trip_preserves_database)"]
    fn backup_to_same_file_as_source_refused() {
        // Running the online backup onto its own source could corrupt
        // the live file — the safety check exists so a careless config
        // doesn't do that.
        let src_dir = TempDir::new().unwrap();
        let passphrase = "same-file refusal test";

        let cfg = fresh_init(src_dir.path(), passphrase);
        let salt = cfg.salt_bytes().unwrap();
        let key = KeyMaterial::derive(passphrase, &salt).unwrap();

        let live_db = src_dir.path().join("solo.db");
        let err = backup_database(&live_db, &live_db, &key)
            .expect_err("must refuse same-file backup");
        let msg = err.to_string();
        assert!(
            msg.contains("same file") && msg.contains("refusing"),
            "error should explain why: got `{msg}`"
        );

        // Path-equivalence case with a redundant `.` segment.
        // Canonicalisation handles this. Check the message too, so an
        // unrelated failure can't satisfy the assertion.
        let live_db_alt = src_dir.path().join("./solo.db");
        let err_alt = backup_database(&live_db, &live_db_alt, &key)
            .expect_err("redundant ./ in dest path should still be caught");
        let msg_alt = err_alt.to_string();
        assert!(
            msg_alt.contains("same file") && msg_alt.contains("refusing"),
            "redundant ./ in dest path should be refused as same-file: got `{msg_alt}`"
        );

        // The refusal happens before the destination is opened, so the
        // live database must still open cleanly afterwards.
        open_sqlcipher(&live_db, &key)
            .expect("source database must still open after a refused backup");
    }

    /// A key derived from the wrong passphrase must make the backup fail
    /// while opening the source — before any destination file is created.
    #[test]
    #[ignore = "requires SQLCipher (see backup_round_trip_preserves_database)"]
    fn backup_with_wrong_source_key_fails() {
        let src_dir = TempDir::new().unwrap();
        let dest_dir = TempDir::new().unwrap();
        let passphrase = "real passphrase";

        let cfg = fresh_init(src_dir.path(), passphrase);
        let salt = cfg.salt_bytes().unwrap();
        let wrong_key = KeyMaterial::derive("not the real one", &salt).unwrap();

        let dest_path = dest_dir.path().join("solo-backup.db");
        let result =
            backup_database(&src_dir.path().join("solo.db"), &dest_path, &wrong_key);
        assert!(
            result.is_err(),
            "backup with wrong source key should fail at open"
        );
        // `backup_database` opens the source first, so a wrong key must
        // surface before the destination is touched.
        assert!(
            !dest_path.exists(),
            "destination must not be created when the source key is wrong"
        );
    }
}