solo-storage 0.3.5

Solo: SQLite + SQLCipher persistence layer
Documentation
// SPDX-License-Identifier: Apache-2.0

//! `solo.config.toml` reader/writer.
//!
//! The config file lives alongside `solo.db` and stores everything Solo needs
//! to re-open the database on startup but does NOT need to keep secret. The
//! Argon2 salt is the load-bearing field — without it, the same passphrase
//! produces a different key, so the SQLCipher database becomes unreadable.
//!
//! Layout (TOML):
//! ```toml
//! schema_version = 1
//! salt_hex       = "0123456789abcdef0123456789abcdef"   # 16 bytes -> 32 hex
//!
//! [embedder]
//! name    = "BAAI/bge-m3"      # matches `crate::embedder::bge_m3::BGE_M3_NAME`
//! version = "v1"               # `BGE_M3_VERSION`; bump on any vector-shifting change
//! dim     = 1024               # `BGE_M3_DIM`
//! dtype   = "f32"
//! ```
//!
//! Why TOML: human-readable for debugging + recovery. The whole file is small;
//! we don't need a more compact format.

use serde::{Deserialize, Serialize};
use solo_core::{Error, Result};
use std::path::Path;

use crate::key_material::SALT_LEN;

/// Current config schema version. Bump on any incompatible field change.
///
/// `SoloConfig::new` stamps this value into fresh configs, and
/// `SoloConfig::read` rejects any file whose `schema_version` differs from it.
pub const CONFIG_SCHEMA_VERSION: u32 = 1;

/// Top-level config struct, serialized as TOML to `solo.config.toml`.
///
/// Holds the non-secret state needed to re-open the database: the config
/// schema version, the hex-encoded Argon2 salt, and the embedder identity.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct SoloConfig {
    /// Version of the config schema itself (NOT the database schema). Bumping
    /// this lets future Solo versions migrate old config files in-place.
    /// `SoloConfig::read` currently rejects any value other than
    /// `CONFIG_SCHEMA_VERSION`.
    pub schema_version: u32,
    /// Hex encoding of the `SALT_LEN`-byte Argon2 salt (32 hex characters for
    /// a 16-byte salt). Decode it with `SoloConfig::salt_bytes`.
    pub salt_hex: String,
    /// Embedder identity: name, version, dim, dtype. The database holds
    /// embeddings tied to a specific `(name, version)`; if those change, the
    /// daemon refuses to start until `solo reembed` rebuilds them.
    pub embedder: EmbedderConfig,
}

/// Embedder identity persisted to disk so startup can detect drift.
///
/// Serialized as the `[embedder]` table of `solo.config.toml`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct EmbedderConfig {
    /// Embedder model name, e.g. `"BAAI/bge-m3"`.
    pub name: String,
    /// Embedder version tag, e.g. `"v1"`; bumped on any vector-shifting change.
    pub version: String,
    /// Embedding dimension, e.g. `1024` for bge-m3.
    pub dim: u32,
    /// Serialized form of `solo_core::EmbeddingDtype`: "f32" | "f16" | "i8" | "binary".
    pub dtype: String,
}

impl SoloConfig {
    /// Build a fresh config for first-run setup. Caller supplies the salt
    /// (typically `KeyMaterial::fresh_salt()`).
    pub fn new(salt: [u8; SALT_LEN], embedder: EmbedderConfig) -> Self {
        Self {
            schema_version: CONFIG_SCHEMA_VERSION,
            salt_hex: hex::encode(salt),
            embedder,
        }
    }

    /// Decode the persisted salt back to its 16-byte form.
    pub fn salt_bytes(&self) -> Result<[u8; SALT_LEN]> {
        let bytes = hex::decode(&self.salt_hex)
            .map_err(|e| Error::storage(format!("config salt_hex is not valid hex: {e}")))?;
        if bytes.len() != SALT_LEN {
            return Err(Error::storage(format!(
                "config salt_hex must decode to {} bytes, got {}",
                SALT_LEN,
                bytes.len()
            )));
        }
        let mut out = [0u8; SALT_LEN];
        out.copy_from_slice(&bytes);
        Ok(out)
    }

    /// Serialize to `solo.config.toml` at the given path. Atomic-writes via a
    /// `<path>.tmp` file + rename so a crash mid-write can't leave a partial
    /// config. Refuses to overwrite an existing file (caller must handle the
    /// already-initialized case).
    ///
    /// Durability ordering: write tmp → fsync tmp → rename → fsync parent dir
    /// (Unix only; Windows relies on NTFS's metadata journal). The salt
    /// stored here is the only path back into the SQLCipher database — a
    /// partial-write corruption locks the user out forever, so we pay the
    /// fsync cost (~1 ms) without compromise.
    pub fn write(&self, path: &Path) -> Result<()> {
        if path.exists() {
            return Err(Error::conflict(format!(
                "config already exists: {}",
                path.display()
            )));
        }
        let tmp_path = path.with_extension("toml.tmp");
        let body = toml::to_string_pretty(self)
            .map_err(|e| Error::storage(format!("toml serialize: {e}")))?;

        // Open + write + fsync the tmp file before exposing it via rename.
        {
            let mut tmp_file = std::fs::OpenOptions::new()
                .write(true)
                .create_new(true)
                .open(&tmp_path)
                .map_err(|e| Error::storage(format!("open tmp {}: {e}", tmp_path.display())))?;
            std::io::Write::write_all(&mut tmp_file, body.as_bytes())
                .map_err(|e| Error::storage(format!("write {}: {e}", tmp_path.display())))?;
            tmp_file
                .sync_all()
                .map_err(|e| Error::storage(format!("fsync tmp {}: {e}", tmp_path.display())))?;
        }

        std::fs::rename(&tmp_path, path)
            .map_err(|e| Error::storage(format!("rename to {}: {e}", path.display())))?;

        // fsync the parent directory so the rename persists across a crash.
        // No-op on Windows — opening a directory for FlushFileBuffers requires
        // FILE_FLAG_BACKUP_SEMANTICS; NTFS's metadata journal handles this case.
        #[cfg(unix)]
        {
            if let Some(parent) = path.parent() {
                if let Ok(d) = std::fs::OpenOptions::new().read(true).open(parent) {
                    let _ = d.sync_all();
                }
            }
        }

        Ok(())
    }

    /// Read + parse from `solo.config.toml`. Validates schema_version.
    pub fn read(path: &Path) -> Result<Self> {
        let body = std::fs::read_to_string(path)
            .map_err(|e| Error::storage(format!("read {}: {e}", path.display())))?;
        let cfg: Self = toml::from_str(&body)
            .map_err(|e| Error::storage(format!("toml parse {}: {e}", path.display())))?;
        if cfg.schema_version != CONFIG_SCHEMA_VERSION {
            return Err(Error::storage(format!(
                "config schema_version mismatch: file is v{}, this binary expects v{}",
                cfg.schema_version, CONFIG_SCHEMA_VERSION
            )));
        }
        // Validate salt_hex shape eagerly so callers see the error here, not
        // later at key-derive time.
        let _ = cfg.salt_bytes()?;
        Ok(cfg)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// File name used for every on-disk fixture.
    const CONFIG_FILE: &str = "solo.config.toml";

    /// Well-formed all-zero salt: 32 hex characters.
    const ZERO_SALT_HEX: &str = "00000000000000000000000000000000";

    /// `[embedder]` table shared by the hand-written TOML fixtures.
    const EMBEDDER_TABLE: &str = r#"
[embedder]
name = "bge-m3"
version = "v1.0"
dim = 1024
dtype = "f32"
"#;

    fn fixture_embedder() -> EmbedderConfig {
        EmbedderConfig {
            name: "bge-m3".into(),
            version: "v1.0".into(),
            dim: 1024,
            dtype: "f32".into(),
        }
    }

    /// Hand-build a config body so tests can inject invalid field values.
    fn raw_config(schema_version: u32, salt_hex: &str, with_embedder: bool) -> String {
        let mut body = format!("schema_version = {schema_version}\nsalt_hex = \"{salt_hex}\"\n");
        if with_embedder {
            body.push_str(EMBEDDER_TABLE);
        }
        body
    }

    /// Write `body` to a throwaway config file and return the error message
    /// `SoloConfig::read` produces for it. Panics if the read succeeds.
    fn read_error_for(body: &str) -> String {
        let dir = TempDir::new().unwrap();
        let path = dir.path().join(CONFIG_FILE);
        std::fs::write(&path, body).unwrap();
        SoloConfig::read(&path).unwrap_err().to_string()
    }

    #[test]
    fn roundtrip_via_disk() {
        let tmp = TempDir::new().unwrap();
        let path = tmp.path().join(CONFIG_FILE);

        let salt = [7u8; SALT_LEN];
        let cfg = SoloConfig::new(salt, fixture_embedder());
        cfg.write(&path).unwrap();

        let read_back = SoloConfig::read(&path).unwrap();
        assert_eq!(cfg, read_back);
        assert_eq!(read_back.salt_bytes().unwrap(), salt);
    }

    #[test]
    fn write_leaves_only_the_config_file() {
        let tmp = TempDir::new().unwrap();
        let path = tmp.path().join(CONFIG_FILE);
        SoloConfig::new([1; SALT_LEN], fixture_embedder())
            .write(&path)
            .unwrap();

        // The staging file must be gone once the write has succeeded.
        let entries = std::fs::read_dir(tmp.path()).unwrap().count();
        assert_eq!(entries, 1, "unexpected leftover files next to the config");
        assert!(path.exists());
    }

    #[test]
    fn write_refuses_overwrite() {
        let tmp = TempDir::new().unwrap();
        let path = tmp.path().join(CONFIG_FILE);
        let cfg = SoloConfig::new([0; SALT_LEN], fixture_embedder());
        cfg.write(&path).unwrap();
        let err = cfg.write(&path).unwrap_err();
        assert!(err.to_string().contains("already exists"), "got: {err}");
    }

    #[test]
    fn read_rejects_missing_file() {
        let tmp = TempDir::new().unwrap();
        let path = tmp.path().join(CONFIG_FILE);
        assert!(SoloConfig::read(&path).is_err());
    }

    #[test]
    fn read_rejects_wrong_schema_version() {
        let err = read_error_for(&raw_config(99, ZERO_SALT_HEX, true));
        assert!(err.contains("schema_version mismatch"), "got: {err}");
    }

    #[test]
    fn read_rejects_non_hex_salt() {
        let body = raw_config(
            CONFIG_SCHEMA_VERSION,
            "ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ",
            true,
        );
        let err = read_error_for(&body);
        // Right length, wrong alphabet: the hex-decode branch must fire.
        assert!(err.contains("salt_hex is not valid hex"), "got: {err}");
    }

    #[test]
    fn read_rejects_missing_embedder_block() {
        let err = read_error_for(&raw_config(CONFIG_SCHEMA_VERSION, ZERO_SALT_HEX, false));
        // serde error for missing field
        assert!(
            err.to_lowercase().contains("embedder") || err.contains("missing"),
            "got: {err}"
        );
    }

    #[test]
    fn read_rejects_short_salt_hex() {
        let err = read_error_for(&raw_config(CONFIG_SCHEMA_VERSION, "deadbeef", true));
        // Valid hex, wrong length: the length branch must fire.
        assert!(err.contains("salt_hex must decode to"), "got: {err}");
    }
}