solo-storage 0.7.0

// SPDX-License-Identifier: Apache-2.0

//! `solo init`: create a fresh Solo data directory.
//!
//! The orchestrator wires together every primitive in this crate:
//!
//!   1. `path_validation::validate_data_dir` — refuse cloud-sync folders.
//!   2. Detect existing init state. Refuse with a clear error unless caller
//!      passes `force = true`, in which case wipe the directory contents
//!      that Solo owns (NOT the directory itself, in case the user keeps
//!      other stuff there — though `~/.solo` should be Solo-owned).
//!   3. Create the data directory.
//!   4. Acquire `solo.lock` (RAII) so a parallel `solo init` or `solo daemon`
//!      can't race us.
//!   5. Generate a fresh 16-byte salt, derive the SQLCipher key via Argon2id.
//!   6. Open the SQLCipher database, bind `PRAGMA key`, set WAL + foreign_keys.
//!   7. Run all migrations, then close and reopen the database with the same
//!      key to verify the cipher round trip.
//!   8. Write `solo.config.toml` (salt + embedder identity).
//!   9. Drop the lockfile (RAII — also runs on any error path between steps).
//!
//! On any error after the data dir is created, the partial state on disk is
//! left for inspection. The caller can re-run with `--force` to wipe and
//! retry.

use rusqlite::Connection;
use solo_core::{Embedder, Error, Result};
use std::path::{Path, PathBuf};
use zeroize::Zeroizing;

use crate::{
    config::{EmbedderConfig, SoloConfig},
    key_material::KeyMaterial,
    lockfile::Lockfile,
    migration,
    path_validation::validate_data_dir,
};

/// Returns the default Solo data directory: `.solo` inside the user's home
/// directory.
///
/// The home directory is whatever `dirs::home_dir` resolves for the current
/// platform. Returns `None` when no home directory can be determined.
pub fn default_data_dir() -> Option<PathBuf> {
    let home = dirs::home_dir()?;
    Some(home.join(".solo"))
}

/// File names that Solo owns inside the data dir. `--force` removes these.
/// Anything else in the dir is left untouched.
///
/// Entries are bare file names; `wipe_solo_owned_files` joins each one onto
/// the data dir and removes it in the order listed here.
///
/// HNSW snapshot filenames are derived from the basenames in
/// `crate::snapshot` (`hnsw_episodes`, `hnsw_episodes_bak`, `hnsw_episodes_tmp`)
/// plus the suffixes hnsw_rs's `file_dump` writes (`.hnsw.data`, `.hnsw.graph`).
/// Keep this list in sync with `snapshot::{LIVE_BASENAME, BAK_BASENAME,
/// TMP_BASENAME}` if those ever change.
const SOLO_OWNED_FILES: &[&str] = &[
    // Database file and the sidecar files SQLite keeps next to it in WAL
    // mode (`open_sqlcipher` switches the journal mode to WAL).
    "solo.db",
    "solo.db-wal",
    "solo.db-shm",
    // Config file. NOTE(review): the `.tmp` entry is presumably the scratch
    // file used while writing the config — confirm against `SoloConfig::write`.
    "solo.config.toml",
    "solo.config.toml.tmp",
    // Lock file taken by `init` (same path as `lock_path` there).
    "solo.lock",
    // Live snapshot pair
    "hnsw_episodes.hnsw.data",
    "hnsw_episodes.hnsw.graph",
    // Backup snapshot pair
    "hnsw_episodes_bak.hnsw.data",
    "hnsw_episodes_bak.hnsw.graph",
    // In-flight tmp pair (cleaned up by snapshot::save's first step, but
    // catch any orphans here for `--force`).
    "hnsw_episodes_tmp.hnsw.data",
    "hnsw_episodes_tmp.hnsw.graph",
];

/// `solo init` parameters. Built by the CLI layer.
///
/// `Debug` is implemented by hand rather than derived: `Zeroizing<String>`
/// forwards `Debug` to the inner `String`, so a derived impl would print the
/// passphrase in plaintext whenever this struct is logged with `{:?}`.
#[derive(Clone)]
pub struct InitParams {
    /// Where to put the data dir. Created if missing.
    pub data_dir: PathBuf,
    /// Resolved passphrase, wrapped in `Zeroizing` so the buffer is wiped
    /// when this struct drops. CLI layer reads it via prompt or env var.
    pub passphrase: Zeroizing<String>,
    /// If true, wipe Solo-owned files in `data_dir` before initializing.
    pub force: bool,
    /// Embedder identity to record in `solo.config.toml`. Supplied by the
    /// caller; `default_embedder()` provides the stub identity for test
    /// fixtures and other callers that have not probed a real backend.
    pub embedder: EmbedderConfig,
}

impl std::fmt::Debug for InitParams {
    /// Formats every field except `passphrase`, which is always rendered as
    /// `<redacted>` so the secret cannot leak through debug output.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("InitParams")
            .field("data_dir", &self.data_dir)
            .field("passphrase", &format_args!("<redacted>"))
            .field("force", &self.force)
            .field("embedder", &self.embedder)
            .finish()
    }
}

/// Embedder identity written to `solo.config.toml` when the caller has not
/// probed a real backend through
/// [`crate::embedder::probe_embedder_config_from_env`].
///
/// Production `solo init` always goes through
/// `probe_embedder_config_from_env`, which chooses between Ollama (probing
/// the real dimension) and Stub (32-dim, deterministic). This parameterless
/// variant is for test fixtures and downstream first-init flows: it mirrors
/// `StubEmbedder::default_stub()` (name=`stub`, version=`v1`, dim=32) and
/// records the dtype as `f32`.
///
/// History: this used to return the BGE-M3 identity (BAAI/bge-m3, 1024-dim).
/// BGE-M3 was removed in v0.6.0 — see `docs/dev-log/0071-v0.5.x-roadmap.md`
/// Priority 9. Tests that need a deterministic non-stub identity should
/// build an `EmbedderConfig` literal directly.
pub fn default_embedder() -> EmbedderConfig {
    let stub = crate::embedder::StubEmbedder::default_stub();

    // Read the identity off the stub itself so this cannot drift from it.
    let name = stub.name().to_string();
    let version = stub.version().to_string();
    let dim = stub.dim() as u32;

    EmbedderConfig {
        name,
        version,
        dim,
        dtype: "f32".into(),
    }
}

/// Outcome reported back to the CLI layer for human-readable success output.
///
/// Only produced by a successful `init`; every path is derived from the
/// `data_dir` the caller supplied.
#[derive(Debug)]
pub struct InitOutcome {
    /// The data directory that was initialized (as passed in by the caller).
    pub data_dir: PathBuf,
    /// Path of the database file: `solo.db` inside `data_dir`.
    pub db_path: PathBuf,
    /// Path of the config file: `solo.config.toml` inside `data_dir`.
    pub config_path: PathBuf,
    /// Schema version reported by the migration run and confirmed by
    /// re-reading `schema_migrations` after reopening the database.
    pub schema_version: u32,
}

/// Run `solo init`. See module docstring for the step list.
///
/// Consumes `params` and, on success, returns the paths that were created
/// plus the schema version the fresh database ended up at.
///
/// `force` semantics: when `force` is true the Solo-owned files in the data
/// dir are wiped unconditionally — also when neither `solo.db` nor
/// `solo.config.toml` exists — so orphaned WAL/SHM files, HNSW snapshots or
/// a leftover lock can never end up next to the fresh database.
///
/// # Errors
///
/// - `Error::invalid_input` if the passphrase is empty.
/// - Whatever `validate_data_dir` reports for a rejected path.
/// - `Error::conflict` if `solo.db` or `solo.config.toml` already exists and
///   `force` is false.
/// - `Error::storage` if the data dir cannot be created or the cipher
///   round-trip check fails.
/// - Any error propagated from the wipe, lock acquisition, salt/key
///   derivation, database open, migrations, or config write.
pub fn init(params: InitParams) -> Result<InitOutcome> {
    let InitParams {
        data_dir,
        passphrase,
        force,
        embedder,
    } = params;

    if passphrase.is_empty() {
        return Err(Error::invalid_input(
            "passphrase must not be empty (Solo uses it to derive the SQLCipher key)",
        ));
    }

    validate_data_dir(&data_dir)?;

    // Step 2-3: existing state + dir creation.
    let db_path = data_dir.join("solo.db");
    let config_path = data_dir.join("solo.config.toml");
    let lock_path = data_dir.join("solo.lock");

    let already_initialized = db_path.exists() || config_path.exists();
    if already_initialized && !force {
        return Err(Error::conflict(format!(
            "data directory is already initialized: {}\n\
             Re-run with --force to wipe and re-initialize \
             (DESTRUCTIVE — all stored memories will be lost).",
            data_dir.display()
        )));
    }
    if force {
        // Wipe regardless of `already_initialized`: a directory holding only
        // leftovers (no db, no config) must still be cleaned, otherwise the
        // stale files would sit beside the new database.
        //
        // NOTE(review): this runs before the lock is acquired and the wipe
        // list contains `solo.lock` itself — confirm that forcing against a
        // directory with a live lock holder is acceptable.
        wipe_solo_owned_files(&data_dir)?;
    }

    std::fs::create_dir_all(&data_dir).map_err(|e| {
        Error::storage(format!("create data dir {}: {e}", data_dir.display()))
    })?;

    // Step 4: lockfile. Held in `_lock` until this function returns, on the
    // success path and on every `?` below.
    let _lock = Lockfile::acquire(&lock_path)?;

    // Step 5: salt + key.
    let salt = KeyMaterial::fresh_salt()?;
    let key = KeyMaterial::derive(&passphrase, &salt)?;

    // Step 6: SQLCipher DB open + PRAGMAs.
    let mut conn = open_sqlcipher(&db_path, &key)?;

    // Step 7: schema migrations.
    let schema_version = migration::run_migrations(&mut conn)?;

    // Smoke-test the cipher round trip: close the connection, reopen with the
    // same key, and read schema_migrations. If the key is wrong (e.g., the
    // passphrase wasn't actually the one used to write), the second open
    // surfaces the failure here, not silently at first read time.
    drop(conn);
    let conn2 = open_sqlcipher(&db_path, &key)?;
    let highest: u32 = conn2
        .query_row(
            "SELECT MAX(version) FROM schema_migrations",
            [],
            |row| row.get(0),
        )
        .map_err(|e| Error::storage(format!("verify cipher round-trip: {e}")))?;
    drop(conn2);
    if highest != schema_version {
        return Err(Error::storage(format!(
            "cipher round-trip read drift: wrote {schema_version}, read {highest}"
        )));
    }

    // Step 8: persist config. Written last, so a config file on disk implies
    // the database was created and verified first.
    let cfg = SoloConfig::new(salt, embedder);
    cfg.write(&config_path)?;

    // Step 9: lockfile drops here.
    Ok(InitOutcome {
        data_dir,
        db_path,
        config_path,
        schema_version,
    })
}

/// Open a SQLCipher database, bind the raw key, and set the journal-mode +
/// foreign-keys pragmas. Used by `init` and exposed for tests.
pub fn open_sqlcipher(db_path: &Path, key: &KeyMaterial) -> Result<Connection> {
    let conn = Connection::open(db_path)
        .map_err(|e| Error::storage(format!("open {}: {e}", db_path.display())))?;
    // PRAGMA key MUST be the first statement on a fresh connection.
    // `as_hex()` returns Zeroizing<String>; deref + format! into a
    // throwaway PRAGMA. The PRAGMA buffer itself isn't zeroize-tracked
    // (a String allocated by format!) — best effort for now.
    let key_pragma = {
        let hex = key.as_hex();
        format!("PRAGMA key = \"x'{}'\"", &*hex)
    };
    conn.execute_batch(&key_pragma)
        .map_err(|e| Error::storage(format!("PRAGMA key: {e}")))?;
    // Standard pragmas. journal_mode=wal returns the new mode as a row, so we
    // use query_row; the others execute fine via execute_batch.
    let mode: String = conn
        .query_row("PRAGMA journal_mode = wal", [], |row| row.get(0))
        .map_err(|e| Error::storage(format!("set journal_mode=wal: {e}")))?;
    if mode.to_lowercase() != "wal" {
        return Err(Error::storage(format!(
            "expected WAL journal mode, got {mode}"
        )));
    }
    conn.execute_batch(
        "PRAGMA foreign_keys = ON;
         PRAGMA busy_timeout = 5000;
         PRAGMA synchronous = NORMAL;",
    )
    .map_err(|e| Error::storage(format!("set startup pragmas: {e}")))?;
    Ok(conn)
}

fn wipe_solo_owned_files(data_dir: &Path) -> Result<()> {
    if !data_dir.exists() {
        return Ok(());
    }
    for name in SOLO_OWNED_FILES {
        let p = data_dir.join(name);
        if p.exists() {
            std::fs::remove_file(&p)
                .map_err(|e| Error::storage(format!("remove {}: {e}", p.display())))?;
        }
    }
    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Passphrase used by every fixture in this module.
    const PASSPHRASE: &str = "correct horse battery staple";

    /// Query that reads the highest applied migration version.
    const MAX_VERSION_SQL: &str = "SELECT MAX(version) FROM schema_migrations";

    /// Snapshot file names planted by the `--force` regression test.
    const PLANTED_SNAPSHOTS: [&str; 6] = [
        "hnsw_episodes.hnsw.data",
        "hnsw_episodes.hnsw.graph",
        "hnsw_episodes_bak.hnsw.data",
        "hnsw_episodes_bak.hnsw.graph",
        "hnsw_episodes_tmp.hnsw.data",
        "hnsw_episodes_tmp.hnsw.graph",
    ];

    /// Non-forcing init parameters aimed at `dir`, using the stub embedder.
    fn fixture_params(dir: &Path) -> InitParams {
        InitParams {
            data_dir: dir.to_path_buf(),
            passphrase: Zeroizing::new(PASSPHRASE.to_owned()),
            force: false,
            embedder: default_embedder(),
        }
    }

    /// Same as `fixture_params`, but with `force` switched on.
    fn forced_params(dir: &Path) -> InitParams {
        InitParams {
            force: true,
            ..fixture_params(dir)
        }
    }

    /// A temp dir guard plus a not-yet-created `solo-data` path inside it.
    /// Keep the guard alive for as long as the path is in use.
    fn scratch_dir() -> (TempDir, PathBuf) {
        let guard = TempDir::new().unwrap();
        let data_dir = guard.path().join("solo-data");
        (guard, data_dir)
    }

    /// Highest migration version recorded in an opened database.
    fn read_max_version(conn: &Connection) -> rusqlite::Result<u32> {
        conn.query_row(MAX_VERSION_SQL, [], |row| row.get(0))
    }

    #[test]
    fn happy_path_creates_db_and_config() {
        let (_guard, dir) = scratch_dir();
        let outcome = init(fixture_params(&dir)).expect("init should succeed");

        assert_eq!(outcome.data_dir, dir);
        assert!(outcome.db_path.exists(), "solo.db must exist");
        assert!(outcome.config_path.exists(), "solo.config.toml must exist");
        assert_eq!(outcome.schema_version, 3);

        // The lock guard is dropped when `init` returns, so no lockfile may
        // be left behind on the success path.
        let lock_path = dir.join("solo.lock");
        assert!(!lock_path.exists(), "lockfile must be removed");
    }

    #[test]
    fn config_round_trips_salt_correctly() {
        let (_guard, dir) = scratch_dir();
        let outcome = init(fixture_params(&dir)).unwrap();

        // Persisted salt + same passphrase must yield a key that opens the
        // database and can read the migration table.
        let salt = SoloConfig::read(&outcome.config_path)
            .unwrap()
            .salt_bytes()
            .unwrap();
        let key = KeyMaterial::derive(PASSPHRASE, &salt).unwrap();
        let conn = open_sqlcipher(&outcome.db_path, &key).unwrap();

        assert_eq!(read_max_version(&conn).unwrap(), 3);
    }

    #[test]
    #[ignore = "requires SQLCipher: under plain bundled SQLite, PRAGMA key is a no-op so wrong keys silently succeed. Run with the workspace's bundled-sqlcipher-vendored-openssl feature: `cargo test -p solo-storage -- --include-ignored`"]
    fn wrong_passphrase_fails_to_open() {
        let (_guard, dir) = scratch_dir();
        let outcome = init(fixture_params(&dir)).unwrap();

        let salt = SoloConfig::read(&outcome.config_path)
            .unwrap()
            .salt_bytes()
            .unwrap();
        let bad_key = KeyMaterial::derive("WRONG PASSPHRASE", &salt).unwrap();

        // Failing already at the pragma stage is acceptable; if the open
        // succeeds, the first real query must be the one that fails.
        if let Ok(conn) = open_sqlcipher(&outcome.db_path, &bad_key) {
            let res = read_max_version(&conn);
            assert!(res.is_err(), "wrong passphrase must fail to read");
        }
    }

    #[test]
    fn second_init_without_force_refuses() {
        let (_guard, dir) = scratch_dir();
        init(fixture_params(&dir)).unwrap();

        let err = init(fixture_params(&dir)).unwrap_err();

        assert!(
            matches!(err, Error::Conflict(_)),
            "expected Conflict, got {err:?}"
        );
        assert!(err.to_string().contains("already initialized"));
    }

    #[test]
    fn force_wipes_and_re_inits() {
        let (_guard, dir) = scratch_dir();

        let first = init(fixture_params(&dir)).unwrap();
        let first_cfg = SoloConfig::read(&first.config_path).unwrap();

        let second = init(forced_params(&dir)).unwrap();
        let second_cfg = SoloConfig::read(&second.config_path).unwrap();

        // Re-init generates a new salt, so the same passphrase now derives a
        // different key. Comparing the salts is enough to show that.
        assert_ne!(first_cfg.salt_hex, second_cfg.salt_hex);
    }

    /// Regression: SOLO_OWNED_FILES once listed stale HNSW filenames
    /// (`hnsw_episodes.bin`, `.graph`, `.data`, etc.) that did not match
    /// what the snapshot module writes today (`.hnsw.data` / `.hnsw.graph`
    /// on the live/_bak/_tmp basenames), so `solo init --force` left stale
    /// snapshot data behind. Plant files under the current names and check
    /// that --force removes every one of them.
    #[test]
    fn force_wipes_current_hnsw_snapshot_files() {
        let (_guard, dir) = scratch_dir();
        init(fixture_params(&dir)).unwrap();

        // Plant snapshot files using the names snapshot.rs actually writes.
        for name in PLANTED_SNAPSHOTS.iter() {
            std::fs::write(dir.join(name), b"stale snapshot data").unwrap();
        }

        init(forced_params(&dir)).unwrap();

        // None of the planted files may survive the forced re-init.
        for name in PLANTED_SNAPSHOTS.iter() {
            let p = dir.join(name);
            assert!(
                !p.exists(),
                "{} should have been wiped by --force",
                p.display()
            );
        }
    }

    #[test]
    fn empty_passphrase_rejected() {
        let guard = TempDir::new().unwrap();
        let params = InitParams {
            passphrase: Zeroizing::new(String::new()),
            ..fixture_params(guard.path())
        };

        let err = init(params).unwrap_err();

        assert!(matches!(err, Error::InvalidInput(_)), "got: {err:?}");
    }

    #[test]
    fn cloud_sync_path_rejected() {
        // Nothing is created on disk: validate_data_dir runs before any
        // filesystem work. The path must be absolute on both Unix and
        // Windows so the absolute-path check does not short-circuit before
        // the cloud-sync check this test is meant to exercise.
        let cloud = if cfg!(windows) {
            PathBuf::from(r"C:\Users\x\Dropbox\solo")
        } else {
            PathBuf::from("/Users/x/Dropbox/solo")
        };

        let err = init(fixture_params(&cloud)).unwrap_err();

        assert!(err.to_string().contains("cloud-sync"), "got: {err}");
    }
}