// solo-storage 0.5.1
//
// Solo: SQLite + SQLCipher persistence layer
// Documentation
// SPDX-License-Identifier: Apache-2.0

//! `solo.config.toml` reader/writer.
//!
//! The config file lives alongside `solo.db` and stores everything Solo needs
//! to re-open the database on startup but does NOT need to keep secret. The
//! Argon2 salt is the load-bearing field — without it, the same passphrase
//! produces a different key, so the SQLCipher database becomes unreadable.
//!
//! Layout (TOML):
//! ```toml
//! schema_version = 1
//! salt_hex       = "0123456789abcdef0123456789abcdef"   # 16 bytes -> 32 hex
//!
//! [embedder]
//! name    = "BAAI/bge-m3"      # matches `crate::embedder::bge_m3::BGE_M3_NAME`
//! version = "v1"               # `BGE_M3_VERSION`; bump on any vector-shifting change
//! dim     = 1024               # `BGE_M3_DIM`
//! dtype   = "f32"
//!
//! [identity]                   # optional; a missing table means no aliases
//! user_aliases = ["alex"]      # names treated as the canonical "user" subject
//! ```
//!
//! Why TOML: human-readable for debugging + recovery. The whole file is small;
//! we don't need a more compact format.

use serde::{Deserialize, Serialize};
use solo_core::{Error, Result};
use std::path::Path;

use crate::key_material::SALT_LEN;

/// Current config schema version. Bump on any incompatible field change.
///
/// `SoloConfig::new` stamps this value into every fresh config, and
/// `SoloConfig::read` rejects any file whose `schema_version` differs from
/// it, so bumping it makes existing config files unreadable by this binary
/// unless a migration is added.
pub const CONFIG_SCHEMA_VERSION: u32 = 1;

/// Top-level config struct, serialized as TOML to `solo.config.toml`.
///
/// Holds the non-secret state needed to re-open the database: the config
/// schema version, the Argon2 salt (hex-encoded), the embedder identity and
/// the optional user-identity settings. Field names are the TOML keys, so
/// renaming a field is a file-format change.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct SoloConfig {
    /// Version of the config schema itself (NOT the database schema). Bumping
    /// this lets future Solo versions migrate old config files in-place.
    /// `SoloConfig::read` currently rejects any value other than
    /// `CONFIG_SCHEMA_VERSION`.
    pub schema_version: u32,
    /// Hex string of the 16-byte Argon2 salt (32 hex characters).
    /// `SoloConfig::new` produces it with `hex::encode`; `salt_bytes`
    /// decodes it with `hex::decode` and checks the decoded length against
    /// `SALT_LEN`.
    pub salt_hex: String,
    /// Embedder identity: name, version, dim, dtype. The database holds
    /// embeddings tied to a specific `(name, version)`; if those change, the
    /// daemon refuses to start until `solo reembed` rebuilds them.
    pub embedder: EmbedderConfig,
    /// User-identity settings for the read-path. Default empty; backward-
    /// compatible with configs that don't declare an `[identity]` block.
    /// Today this carries `user_aliases` so `facts_about` can resolve a
    /// queried alias against historical triples whose `subject_id` was
    /// normalised to the canonical `"user"`. v0.5.0 Priority 1, sub-step
    /// 1C — see `docs/dev-log/0071-v0.5.x-roadmap.md`.
    // `serde(default)` is what lets a file without `[identity]` deserialize:
    // the field falls back to `IdentityConfig::default()`.
    #[serde(default)]
    pub identity: IdentityConfig,
}

/// User-identity settings persisted under `[identity]` in `solo.config.toml`.
///
/// `user_aliases` lets a user query `facts_about(subject = "alex")` and have
/// the read path also surface rows that were extracted historically with the
/// canonical `subject_id = "user"` (or vice-versa). The forward-going
/// extraction pipeline (Priority 1 sub-steps 1A + 1B) prefers named entities
/// over `"user"`, but historical triples written before 1A still use
/// `"user"` — read-side alias expansion bridges the two without rewriting
/// any data.
///
/// Default = empty — zero behaviour change for existing configs.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
pub struct IdentityConfig {
    /// Names that should be treated as equivalent to the canonical `"user"`
    /// subject when querying `facts_about`. Lets a user query "facts about
    /// alex" and get rows that were historically extracted with
    /// `subject_id = "user"`. Case-sensitive — match the casing in the
    /// triples table.
    ///
    /// This module only stores the list; it does not trim, deduplicate or
    /// otherwise validate the entries.
    // `serde(default)` lets an `[identity]` table that omits `user_aliases`
    // deserialize to an empty list.
    #[serde(default)]
    pub user_aliases: Vec<String>,
}

/// Embedder identity persisted to disk so startup can detect drift.
///
/// Serialized as the `[embedder]` table of `solo.config.toml`. All four
/// fields are required: a file without this table fails to parse in
/// `SoloConfig::read`. Nothing in this module validates the field values;
/// they are stored and returned as written.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct EmbedderConfig {
    /// Embedder model name (the module-level layout example uses
    /// `"BAAI/bge-m3"`).
    pub name: String,
    /// Embedder version tag (e.g. `"v1"` in the module-level layout example).
    pub version: String,
    /// Embedding vector dimensionality (e.g. `1024` in the layout example).
    pub dim: u32,
    /// Serialized form of `solo_core::EmbeddingDtype`: "f32" | "f16" | "i8" | "binary".
    /// Kept as a plain string here; this module does not check it against
    /// that list.
    pub dtype: String,
}

impl SoloConfig {
    /// Build a fresh config for first-run setup. Caller supplies the salt
    /// (typically `KeyMaterial::fresh_salt()`). `identity` defaults to
    /// empty — `solo init` does not seed `user_aliases`; users opt in by
    /// editing `solo.config.toml`.
    pub fn new(salt: [u8; SALT_LEN], embedder: EmbedderConfig) -> Self {
        Self {
            schema_version: CONFIG_SCHEMA_VERSION,
            salt_hex: hex::encode(salt),
            embedder,
            identity: IdentityConfig::default(),
        }
    }

    /// Decode the persisted salt back to its 16-byte form.
    pub fn salt_bytes(&self) -> Result<[u8; SALT_LEN]> {
        let bytes = hex::decode(&self.salt_hex)
            .map_err(|e| Error::storage(format!("config salt_hex is not valid hex: {e}")))?;
        if bytes.len() != SALT_LEN {
            return Err(Error::storage(format!(
                "config salt_hex must decode to {} bytes, got {}",
                SALT_LEN,
                bytes.len()
            )));
        }
        let mut out = [0u8; SALT_LEN];
        out.copy_from_slice(&bytes);
        Ok(out)
    }

    /// Serialize to `solo.config.toml` at the given path. Atomic-writes via a
    /// `<path>.tmp` file + rename so a crash mid-write can't leave a partial
    /// config. Refuses to overwrite an existing file (caller must handle the
    /// already-initialized case).
    ///
    /// Durability ordering: write tmp → fsync tmp → rename → fsync parent dir
    /// (Unix only; Windows relies on NTFS's metadata journal). The salt
    /// stored here is the only path back into the SQLCipher database — a
    /// partial-write corruption locks the user out forever, so we pay the
    /// fsync cost (~1 ms) without compromise.
    pub fn write(&self, path: &Path) -> Result<()> {
        if path.exists() {
            return Err(Error::conflict(format!(
                "config already exists: {}",
                path.display()
            )));
        }
        let tmp_path = path.with_extension("toml.tmp");
        let body = toml::to_string_pretty(self)
            .map_err(|e| Error::storage(format!("toml serialize: {e}")))?;

        // Open + write + fsync the tmp file before exposing it via rename.
        {
            let mut tmp_file = std::fs::OpenOptions::new()
                .write(true)
                .create_new(true)
                .open(&tmp_path)
                .map_err(|e| Error::storage(format!("open tmp {}: {e}", tmp_path.display())))?;
            std::io::Write::write_all(&mut tmp_file, body.as_bytes())
                .map_err(|e| Error::storage(format!("write {}: {e}", tmp_path.display())))?;
            tmp_file
                .sync_all()
                .map_err(|e| Error::storage(format!("fsync tmp {}: {e}", tmp_path.display())))?;
        }

        std::fs::rename(&tmp_path, path)
            .map_err(|e| Error::storage(format!("rename to {}: {e}", path.display())))?;

        // fsync the parent directory so the rename persists across a crash.
        // No-op on Windows — opening a directory for FlushFileBuffers requires
        // FILE_FLAG_BACKUP_SEMANTICS; NTFS's metadata journal handles this case.
        #[cfg(unix)]
        {
            if let Some(parent) = path.parent() {
                if let Ok(d) = std::fs::OpenOptions::new().read(true).open(parent) {
                    let _ = d.sync_all();
                }
            }
        }

        Ok(())
    }

    /// Read + parse from `solo.config.toml`. Validates schema_version.
    pub fn read(path: &Path) -> Result<Self> {
        let body = std::fs::read_to_string(path)
            .map_err(|e| Error::storage(format!("read {}: {e}", path.display())))?;
        let cfg: Self = toml::from_str(&body)
            .map_err(|e| Error::storage(format!("toml parse {}: {e}", path.display())))?;
        if cfg.schema_version != CONFIG_SCHEMA_VERSION {
            return Err(Error::storage(format!(
                "config schema_version mismatch: file is v{}, this binary expects v{}",
                cfg.schema_version, CONFIG_SCHEMA_VERSION
            )));
        }
        // Validate salt_hex shape eagerly so callers see the error here, not
        // later at key-derive time.
        let _ = cfg.salt_bytes()?;
        Ok(cfg)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// `[embedder]` table shared by the hand-written TOML fixtures; mirrors
    /// `fixture_embedder()`.
    const EMBEDDER_TOML: &str = r#"
[embedder]
name = "bge-m3"
version = "v1.0"
dim = 1024
dtype = "f32"
"#;

    /// Embedder identity used by tests that build a config in memory.
    fn fixture_embedder() -> EmbedderConfig {
        EmbedderConfig {
            name: "bge-m3".into(),
            version: "v1.0".into(),
            dim: 1024,
            dtype: "f32".into(),
        }
    }

    /// Hex encoding of an all-zero salt of the right length.
    fn zero_salt_hex() -> String {
        "00".repeat(SALT_LEN)
    }

    /// Location of the config file inside a test's temp directory.
    fn config_path(dir: &TempDir) -> std::path::PathBuf {
        dir.path().join("solo.config.toml")
    }

    /// Assemble a config document from its top-level keys plus any trailing
    /// tables (`[embedder]`, `[identity]`, ...).
    fn config_toml(schema_version: u32, salt_hex: &str, tables: &str) -> String {
        format!("\nschema_version = {schema_version}\nsalt_hex = \"{salt_hex}\"\n{tables}")
    }

    /// Write `body` to a fresh temp dir and run it through `SoloConfig::read`.
    fn read_from_toml(body: &str) -> Result<SoloConfig> {
        let tmp = TempDir::new().unwrap();
        let path = config_path(&tmp);
        std::fs::write(&path, body).unwrap();
        SoloConfig::read(&path)
    }

    #[test]
    fn roundtrip_via_disk() {
        let tmp = TempDir::new().unwrap();
        let path = config_path(&tmp);

        let salt = [7u8; SALT_LEN];
        let cfg = SoloConfig::new(salt, fixture_embedder());
        cfg.write(&path).unwrap();

        let read_back = SoloConfig::read(&path).unwrap();
        assert_eq!(cfg, read_back);
        assert_eq!(read_back.salt_bytes().unwrap(), salt);
    }

    #[test]
    fn write_leaves_only_the_config_file() {
        // A successful write must not leave the staging file behind.
        let tmp = TempDir::new().unwrap();
        let path = config_path(&tmp);
        SoloConfig::new([1; SALT_LEN], fixture_embedder())
            .write(&path)
            .unwrap();

        let entries: Vec<_> = std::fs::read_dir(tmp.path())
            .unwrap()
            .map(|entry| entry.unwrap().path())
            .collect();
        assert_eq!(entries, vec![path]);
    }

    #[test]
    fn write_refuses_overwrite() {
        let tmp = TempDir::new().unwrap();
        let path = config_path(&tmp);
        let cfg = SoloConfig::new([0; SALT_LEN], fixture_embedder());
        cfg.write(&path).unwrap();
        let err = cfg.write(&path).unwrap_err();
        assert!(err.to_string().contains("already exists"), "got: {err}");
    }

    #[test]
    fn read_rejects_wrong_schema_version() {
        let body = config_toml(99, &zero_salt_hex(), EMBEDDER_TOML);
        let err = read_from_toml(&body).unwrap_err();
        assert!(err.to_string().contains("schema_version mismatch"), "got: {err}");
    }

    #[test]
    fn read_rejects_non_hex_salt() {
        let body = config_toml(CONFIG_SCHEMA_VERSION, &"ZZ".repeat(SALT_LEN), EMBEDDER_TOML);
        let err = read_from_toml(&body).unwrap_err();
        // hex::decode fails on non-hex chars → "not valid hex".
        assert!(err.to_string().contains("salt_hex"), "got: {err}");
    }

    #[test]
    fn read_rejects_short_salt_hex() {
        let body = config_toml(CONFIG_SCHEMA_VERSION, "deadbeef", EMBEDDER_TOML);
        let err = read_from_toml(&body).unwrap_err();
        assert!(err.to_string().contains("salt_hex"), "got: {err}");
    }

    #[test]
    fn read_rejects_missing_embedder_block() {
        let body = config_toml(CONFIG_SCHEMA_VERSION, &zero_salt_hex(), "");
        let err = read_from_toml(&body).unwrap_err();
        // serde error for missing field
        let msg = err.to_string();
        assert!(
            msg.to_lowercase().contains("embedder") || msg.contains("missing"),
            "got: {err}"
        );
    }

    #[test]
    fn read_loads_user_aliases_from_identity_block() {
        let tables = format!("{EMBEDDER_TOML}\n[identity]\nuser_aliases = [\"alex\", \"alice\"]\n");
        let body = config_toml(CONFIG_SCHEMA_VERSION, &zero_salt_hex(), &tables);
        let cfg = read_from_toml(&body).expect("read ok");
        assert_eq!(
            cfg.identity.user_aliases,
            vec!["alex".to_string(), "alice".to_string()]
        );
    }

    #[test]
    fn read_defaults_identity_when_block_absent() {
        // Backward compat: existing configs (pre-v0.5.0) have no
        // [identity] block. They must still deserialize cleanly, with
        // `user_aliases` defaulting to empty.
        let body = config_toml(CONFIG_SCHEMA_VERSION, &zero_salt_hex(), EMBEDDER_TOML);
        let cfg = read_from_toml(&body).expect("read ok");
        assert!(cfg.identity.user_aliases.is_empty());
    }

    #[test]
    fn read_defaults_user_aliases_when_identity_block_empty() {
        let tables = format!("{EMBEDDER_TOML}\n[identity]\n");
        let body = config_toml(CONFIG_SCHEMA_VERSION, &zero_salt_hex(), &tables);
        let cfg = read_from_toml(&body).expect("read ok");
        assert!(cfg.identity.user_aliases.is_empty());
    }
}