solo-storage 0.8.1

// SPDX-License-Identifier: Apache-2.0

//! Per-tenant handle: writer-actor + reader pool + HNSW + embedder bundled
//! into a single resource. v0.8.0 P2.
//!
//! ## Design
//!
//! Each tenant in the data dir has:
//!
//!   * Its own SQLCipher DB at `<data_dir>/tenants/<tenant_id>.db` (P1 layout).
//!   * Its own writer-actor on a dedicated OS thread (ADR-0003 model
//!     preserved per-tenant; see ADR-0004 for the multi-tenant invariants
//!     it adds on top of ADR-0003).
//!   * Its own reader pool (default size 2).
//!   * Its own HNSW index loaded from per-tenant snapshot files.
//!   * Its own resolved `embedder_id` for the persisted embedder identity.
//!   * A shared `Arc<dyn Embedder>` (a single embedder backend instance is
//!     re-used across tenants — embedders are stateless, no per-tenant
//!     state required).
//!
//! `TenantHandle::open` runs the full per-tenant startup chain (open DB,
//! migrate schema, load HNSW snapshot with fallbacks, rebuild from SQL on
//! empty snapshot, rebuild tombstones, replay `pending_index`, spawn the
//! writer-actor, build the reader pool). On shutdown it saves a final
//! snapshot, drains the writer thread, and drops the pool.
//!
//! ## HNSW snapshot layout
//!
//! Per-tenant **subdir** layout: `<data_dir>/tenants/<tenant_id>/<basename>.hnsw.{data,graph}`.
//! Per-tenant DB stays as a flat-file in `<data_dir>/tenants/<tenant_id>.db`
//! (same shape as P1 left it). The subdir per tenant cleanly isolates the
//! snapshot files; a future `solo tenants backup <id>` can tarball
//! `<data_dir>/tenants/<id>/` plus `<data_dir>/tenants/<id>.db` (and
//! sidecars) without globbing by prefix.
//!
//! For the `default` tenant migrated from v0.7.1, the P1 helper placed
//! snapshots flat in `<data_dir>/tenants/`. `TenantHandle::open` for
//! `default` upgrades that flat layout to `<data_dir>/tenants/default/`
//! lazily on first open (renames up to six HNSW files — the data/graph
//! pairs for the live, `.bak`, and tmp basenames — into the subdir).
//! Idempotent.

use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::sync::Mutex;
use tokio::runtime::Handle as TokioHandle;

use rusqlite::Connection;
use solo_core::{Embedder, Error, Result, TenantId, VectorIndex, VectorIndexFactory};

use crate::audit::{AuditWriter, AuditWriterShutdown, purge_older_than};
use crate::embedder_registry::{EmbedderIdentity, get_or_insert_embedder_id};
use crate::hnsw_id::episode_hnsw_id;
use crate::init::open_sqlcipher;
use crate::key_material::KeyMaterial;
use crate::migration;
use crate::reader::ReaderPool;
use crate::recovery::{
    DriftReport, RebuildReport, ReplayReport, detect_drift, rebuild_hnsw_from_sql,
    replay_pending_index,
};
use crate::snapshot::{self, BAK_BASENAME, LIVE_BASENAME, TMP_BASENAME};
use crate::tenants::{TENANTS_SUBDIR, TenantsIndex};
use crate::vector_index::{HnswFactory, HnswIndex, HnswParams};
use crate::writer::{WriteHandle, WriterActor, WriterSpawn};

/// File suffix of the HNSW snapshot data file. Mirrors
/// `snapshot::DATA_SUFFIX`, which is private to that module.
const HNSW_DATA_SUFFIX: &str = ".hnsw.data";
/// File suffix of the HNSW snapshot graph file. Mirrors
/// `snapshot::GRAPH_SUFFIX`, which is private to that module.
const HNSW_GRAPH_SUFFIX: &str = ".hnsw.graph";

/// Per-tenant handle. Cheap to clone via `Arc<TenantHandle>` (the registry
/// owns the Arc; callers borrow `&TenantHandle`).
///
/// Bundles everything one tenant needs: its DB path, HNSW index, embedder,
/// writer-actor handle, reader pool, audit writer, and the advisory reports
/// produced while opening it.
pub struct TenantHandle {
    /// Identifier of the tenant this handle serves.
    tenant_id: TenantId,
    /// Config held by this handle. `open` reads it from
    /// `<data_dir>/solo.config.toml`.
    config: crate::config::SoloConfig,
    /// Path of the tenant's DB file.
    db_path: PathBuf,
    /// Directory holding this tenant's HNSW snapshot files.
    snapshot_dir: PathBuf,
    /// Id of the configured embedder identity, as resolved by `open` through
    /// `get_or_insert_embedder_id` against this tenant's DB.
    embedder_id: i64,
    /// The tenant's vector index. `open` hands clones of this `Arc` to the
    /// reader pool and the writer-actor.
    hnsw: Arc<dyn VectorIndex + Send + Sync>,
    /// Embedder backend supplied by the caller when the handle was built.
    embedder: Arc<dyn Embedder>,
    // Writer side: hold the WriteHandle (clone-cheap) and the OS-thread join
    // handle. On shutdown, drop the handle then join the thread.
    write: WriteHandle,
    /// Only `Some` until shutdown. `take()`-d by `TenantHandle::shutdown`
    /// so the join runs to completion. After shutdown the TenantHandle is
    /// consumed.
    writer_join: Option<std::thread::JoinHandle<()>>,
    /// Reader pool for this tenant's DB.
    read: ReaderPool,
    /// v0.8.0 P4: async audit writer for the query path. Cheap to clone
    /// (mpsc sender). The drainer's join handle lives in
    /// `audit_shutdown`.
    audit: AuditWriter,
    /// v0.8.0 P4: shutdown handle for the audit drainer task. `take()`-d
    /// during shutdown so we can `.join()` it after dropping every
    /// `AuditWriter` clone (i.e., closing the channel).
    audit_shutdown: Mutex<Option<AuditWriterShutdown>>,
    /// v0.8.0 P4: optional background retention-sweep task. Spawned only
    /// when both `[audit] retention_days` AND `[audit] purge_interval_secs`
    /// are set in `solo.config.toml`. Aborted on shutdown.
    audit_sweep_handle: Mutex<Option<tokio::task::JoinHandle<()>>>,
    /// Replay statistics from open (advisory; surfaced for logging).
    replay: ReplayReport,
    /// Drift report from open (advisory).
    drift: DriftReport,
    /// `true` when `open` loaded the index from the `.bak` snapshot because
    /// the live snapshot could not be loaded.
    used_bak_snapshot: bool,
    /// `true` when no snapshot was loadable and `open` started from a fresh
    /// empty index.
    started_fresh: bool,
    /// Report of the SQL rebuild `open` runs when `started_fresh` is set;
    /// the default report otherwise.
    rebuild: RebuildReport,
}

/// Resolve the directory that holds one tenant's HNSW snapshot files:
/// `<data_dir>/tenants/<tenant_id>/`.
///
/// Since v0.8.0 P2 every tenant has its own subdir for the snapshot pairs.
/// This function only computes the path; `TenantHandle::open` is what
/// creates the directory and (for the `default` tenant migrated from
/// v0.7.1) moves flat-layout snapshots into it.
fn per_tenant_snapshot_dir(data_dir: &Path, tenant_id: &TenantId) -> PathBuf {
    let mut snapshot_dir = data_dir.join(TENANTS_SUBDIR);
    snapshot_dir.push(tenant_id.as_str());
    snapshot_dir
}

/// Resolve the path of a tenant's DB file: `<data_dir>/tenants/<db_filename>`.
fn per_tenant_db_path(data_dir: &Path, db_filename: &str) -> PathBuf {
    let mut db_path = data_dir.join(TENANTS_SUBDIR);
    db_path.push(db_filename);
    db_path
}

/// Migrate v0.7.1 flat-tenants/ HNSW snapshots into the per-tenant subdir.
///
/// P1's `migrate_v071_to_v080` left snapshots flat in `<data_dir>/tenants/`
/// (e.g. `tenants/hnsw_episodes.hnsw.data`). P2 moves them into
/// `<data_dir>/tenants/default/` so per-tenant backup/restore can tarball
/// the entire subdir without globbing by prefix.
///
/// Only runs against the `default` tenant — other tenants are created by
/// `solo tenants create` (P6/P7) which writes their snapshots straight to
/// the subdir layout, so the upgrade is unnecessary for them.
///
/// Idempotent: if a snapshot is already in the subdir (or absent
/// altogether), no-op.
fn upgrade_flat_default_snapshots_to_subdir(
    data_dir: &Path,
    tenant_id: &TenantId,
) -> Result<()> {
    if tenant_id.as_str() != "default" {
        return Ok(());
    }
    let flat_dir = data_dir.join(TENANTS_SUBDIR);
    let subdir = flat_dir.join(tenant_id.as_str());
    std::fs::create_dir_all(&subdir).map_err(|e| {
        Error::storage(format!(
            "create per-tenant snapshot subdir {}: {e}",
            subdir.display()
        ))
    })?;

    for basename in [LIVE_BASENAME, BAK_BASENAME, TMP_BASENAME] {
        for suffix in [HNSW_DATA_SUFFIX, HNSW_GRAPH_SUFFIX] {
            let filename = format!("{basename}{suffix}");
            let src = flat_dir.join(&filename);
            let dst = subdir.join(&filename);
            if !src.is_file() {
                continue;
            }
            if dst.is_file() {
                // Both exist (operator surgery state). Prefer the subdir
                // copy; remove the flat one to clear ambiguity.
                std::fs::remove_file(&src).map_err(|e| {
                    Error::storage(format!(
                        "remove flat-layout snapshot duplicate {}: {e}",
                        src.display()
                    ))
                })?;
                continue;
            }
            std::fs::rename(&src, &dst).map_err(|e| {
                Error::storage(format!(
                    "promote flat-layout snapshot {} → {}: {e}",
                    src.display(),
                    dst.display()
                ))
            })?;
            tracing::info!(
                src = %src.display(),
                dst = %dst.display(),
                tenant = %tenant_id,
                "P2: promoted flat-tenants/ HNSW snapshot into per-tenant subdir"
            );
        }
    }
    Ok(())
}

/// Parameters for opening a tenant. Built by the registry from its shared
/// bootstrap state plus the tenant-specific row from `tenants_index.db`.
///
/// Consumed by value in `TenantHandle::open`.
pub struct TenantOpenParams {
    /// Root data directory. `open` reads `solo.config.toml` from it and
    /// resolves the tenant DB and snapshot dir under its `tenants/` subdir.
    pub data_dir: PathBuf,
    /// Key material used to open the tenant's SQLCipher DB. `open` also
    /// clones it for the reader pool, writer-actor, audit writer and
    /// retention sweep.
    pub key: KeyMaterial,
    /// File name of the tenant DB, resolved relative to
    /// `<data_dir>/tenants/`.
    pub db_filename: String,
    /// Embedder backend. Passed to the writer-actor (when a runtime handle
    /// is available) and stored on the resulting `TenantHandle`.
    pub embedder: Arc<dyn Embedder>,
    /// Parameters for the `HnswFactory` that `open` uses when it has to
    /// create a fresh empty index.
    pub hnsw_params: HnswParams,
    /// Optional Steward (LLM-driven consolidation). Wired only when the
    /// daemon/CLI was started with a real `LlmClient` configured.
    pub steward: Option<Arc<solo_steward::Steward>>,
    /// Optional tokio runtime handle. Required when `embedder` is wired
    /// (the writer-actor's blocking thread `block_on`s embedder calls
    /// during reembed). For pure-storage tests that don't spawn a runtime
    /// inside the writer, this can be `None`; `open` then uses the reduced
    /// `WriterActor::spawn_full` variant and spawns no retention sweep.
    pub runtime_handle: Option<TokioHandle>,
    /// v0.8.1 P3: per-tenant byte quota from `tenants_index.tenants.
    /// quota_bytes`. `None` means unlimited (default for v0.8.0
    /// tenants and any tenant created without `--quota-bytes`). Cached
    /// in the writer-actor for enforcement at write time.
    // NOTE(review): `field_reassign_with_default` is reported where a value
    // is built with `Default::default()` and then has fields reassigned, so
    // this field-level allow probably has no effect here — confirm whether
    // it was meant for a call site instead.
    #[allow(clippy::field_reassign_with_default)]
    pub quota_bytes: Option<u64>,
}

impl TenantHandle {
    /// Open a tenant and run its full startup chain.
    ///
    /// Steps, in order:
    ///
    /// 1. Read `<data_dir>/solo.config.toml` and reject `embedder.dim == 0`.
    /// 2. Promote flat-layout HNSW snapshots into the per-tenant subdir
    ///    (`default` tenant only) and make sure the snapshot dir exists.
    /// 3. Open the per-tenant SQLCipher DB and apply migrations.
    /// 4. Resolve `embedder_id` for the configured embedder identity.
    /// 5. Load the HNSW snapshot (live, then `.bak`, then a fresh empty
    ///    index that is rebuilt from SQL).
    /// 6. Rebuild tombstones (skipped when the index started fresh), replay
    ///    `pending_index`, and run drift detection.
    /// 7. Build the reader pool and redaction registry, then spawn the
    ///    writer-actor, the audit drainer and the optional retention sweep.
    ///
    /// The returned `TenantHandle` is ready for read + write requests.
    ///
    /// # Errors
    ///
    /// Returns an error when the config cannot be read or records
    /// `embedder.dim = 0`, when the snapshot dir cannot be created, when the
    /// tenant DB file is missing, when a loaded snapshot's dimension differs
    /// from the configured one, or when any of the DB / migration /
    /// recovery / reader-pool / redaction steps fails.
    ///
    /// # Panics
    ///
    /// Panics if no snapshot is loadable and creating the empty fallback
    /// index fails (`load_hnsw_with_fallback` uses `expect`).
    pub fn open(tenant_id: TenantId, params: TenantOpenParams) -> Result<Self> {
        let TenantOpenParams {
            data_dir,
            key,
            db_filename,
            embedder,
            hnsw_params,
            steward,
            runtime_handle,
            quota_bytes,
        } = params;

        // Read the canonical config from `<data_dir>/solo.config.toml`.
        // v0.8.0 P2: one config per data dir, not per tenant. The embedder
        // identity in the config is the deployment-wide identity; per-tenant
        // embedder swaps are a v0.8.1+ concern.
        let config_path = data_dir.join("solo.config.toml");
        let config = crate::config::SoloConfig::read(&config_path)?;
        let dim = config.embedder.dim as usize;
        // A zero dimension cannot describe a usable index; fail before
        // touching the DB or snapshots.
        if dim == 0 {
            return Err(Error::storage(format!(
                "solo.config.toml records embedder.dim=0 — corrupt config? at {config_path:?}"
            )));
        }

        // Upgrade v0.7.1-flat snapshot layout to per-tenant-subdir layout
        // before locating the snapshot dir.
        upgrade_flat_default_snapshots_to_subdir(&data_dir, &tenant_id)?;

        let db_path = per_tenant_db_path(&data_dir, &db_filename);
        let snapshot_dir = per_tenant_snapshot_dir(&data_dir, &tenant_id);
        std::fs::create_dir_all(&snapshot_dir).map_err(|e| {
            Error::storage(format!(
                "create per-tenant snapshot dir {}: {e}",
                snapshot_dir.display()
            ))
        })?;

        // Per-tenant DB must exist — TenantRegistry::get_or_open is the
        // sole caller, and it ensures the row is in `tenants_index` with
        // status='active' before this runs. A missing DB file at this
        // point is a corruption signal, not a "first time" case.
        if !db_path.is_file() {
            return Err(Error::not_found(format!(
                "per-tenant DB not found at {}; the tenants_index row \
                 references this file but it is missing. Operator action \
                 required (restore from backup or remove the orphan registry row).",
                db_path.display()
            )));
        }

        // Open the init connection used for migrations + startup chain.
        let mut conn: Connection = open_sqlcipher(&db_path, &key)?;

        // Run per-tenant migrations idempotently.
        let _schema_version = migration::run_migrations(&mut conn)?;

        // Resolve embedder_id from the persisted config. The embedder
        // identity is the same across every tenant in v0.8.0 P2 (one
        // config per data dir), so this row gets the same id from each
        // tenant's `embedders` table.
        let embedder_identity = EmbedderIdentity {
            name: config.embedder.name.clone(),
            version: config.embedder.version.clone(),
            dim: config.embedder.dim,
            dtype: config.embedder.dtype.clone(),
        };
        let embedder_id = get_or_insert_embedder_id(&conn, &embedder_identity)?;

        // Load HNSW snapshot with the same three-way fallback as the
        // single-tenant startup chain.
        let factory = HnswFactory::with_params(hnsw_params);
        let (hnsw_index, used_bak_snapshot, started_fresh) =
            load_hnsw_with_fallback(&snapshot_dir, &factory, dim);

        // Only a loaded snapshot can disagree with the config; a fresh
        // index was created with `dim` already.
        if !started_fresh && hnsw_index.dim() != dim {
            return Err(Error::storage(format!(
                "tenant {tenant_id}: HNSW snapshot dim ({}) does not match \
                 solo.config.toml embedder.dim ({dim}). Embedder identity has \
                 shifted under the daemon. Run `solo reembed` to rebuild.",
                hnsw_index.dim()
            )));
        }

        // Rebuild from SQL when no snapshot was loadable.
        let rebuild = if started_fresh {
            let started = std::time::Instant::now();
            let r = rebuild_hnsw_from_sql(&conn, &hnsw_index, embedder_id)?;
            if r.rows_seen > 0 {
                tracing::info!(
                    tenant = %tenant_id,
                    rows_seen = r.rows_seen,
                    rows_added = r.rows_added,
                    rows_skipped = r.rows_skipped,
                    elapsed_ms = started.elapsed().as_millis() as u64,
                    "tenant: rebuilt HNSW from embeddings after empty-snapshot fallback"
                );
            }
            r
        } else {
            RebuildReport::default()
        };

        let hnsw: Arc<dyn VectorIndex + Send + Sync> = Arc::new(hnsw_index);

        // Tombstones are only rebuilt on top of a loaded snapshot.
        // NOTE(review): presumably the SQL rebuild above already leaves
        // forgotten episodes out, making this unnecessary after a fresh
        // start — confirm against `rebuild_hnsw_from_sql`.
        let forgotten = if started_fresh {
            0
        } else {
            rebuild_tombstones_from_sql(&conn, hnsw.as_ref())?
        };
        if forgotten > 0 {
            tracing::info!(
                tenant = %tenant_id,
                forgotten,
                "tenant: rebuilt HNSW tombstones from forgotten episodes"
            );
        }

        let replay = replay_pending_index(&mut conn, hnsw.as_ref())?;
        let drift = detect_drift(&conn, hnsw.as_ref())?;
        // The init connection is done; the reader pool, writer and audit
        // paths below each open their own.
        drop(conn);

        // Build the reader pool.
        let pool = ReaderPool::new(&db_path, Some(key.clone()), hnsw.clone())?;

        // v0.8.0 P5: build the redaction registry from the per-data-dir
        // config. Disabled by default; the writer-actor's per-write
        // path short-circuits via `RedactionRegistry::is_enabled`. Invalid
        // custom regexes here surface as `TenantHandle::open` errors so
        // operators see the problem at startup, not at first write.
        let redactor = Arc::new(crate::redaction::RedactionRegistry::from_config(
            &config.redaction,
        )?);

        // Spawn the writer-actor. We always wire embedder + (optional)
        // steward + key + runtime handle when one is available. For pure
        // tests that pass `runtime_handle: None`, we fall back to the
        // simpler spawn variant that doesn't try to capture a runtime.
        let writer_conn = open_sqlcipher(&db_path, &key)?;

        let WriterSpawn {
            handle: write,
            join,
        } = if let Some(rt) = runtime_handle.clone() {
            // v0.8.1 P3: pass the cached quota + db_path so the writer
            // can enforce per-write. When quota_bytes is None (the
            // common case for v0.8.0 tenants), the writer's per-write
            // check short-circuits on the QuotaDecision::Unlimited
            // branch in one Option compare.
            WriterActor::spawn_full_with_quota(
                writer_conn,
                hnsw.clone(),
                snapshot_dir.clone(),
                embedder_id,
                embedder.clone(),
                steward,
                key.clone(),
                rt,
                redactor,
                quota_bytes,
                db_path.clone(),
            )
        } else {
            // No runtime handle: this variant is not given the embedder,
            // steward, key, redactor or quota_bytes.
            WriterActor::spawn_full(
                writer_conn,
                hnsw.clone(),
                snapshot_dir.clone(),
                embedder_id,
            )
        };

        // v0.8.0 P4: spawn the async audit drainer. Uses the same key as
        // the writer; opens its own SQLCipher connection lazily on first
        // event. Requires a tokio runtime to be live when this is called
        // (every prod path goes through `TenantRegistry::get_or_open`
        // which calls this inside `spawn_blocking` on a runtime).
        let (audit, audit_shutdown) =
            AuditWriter::spawn(db_path.clone(), Some(key.clone()));

        // v0.8.0 P4: optional background retention sweep. Spawned only if
        // both `retention_days` and `purge_interval_secs` are configured.
        let audit_sweep_handle = spawn_audit_sweep(
            &tenant_id,
            &db_path,
            &key,
            &config.audit,
            runtime_handle.clone(),
        );

        Ok(TenantHandle {
            tenant_id,
            config,
            db_path,
            snapshot_dir,
            embedder_id,
            hnsw,
            embedder,
            write,
            writer_join: Some(join),
            read: pool,
            audit,
            audit_shutdown: Mutex::new(Some(audit_shutdown)),
            audit_sweep_handle: Mutex::new(audit_sweep_handle),
            replay,
            drift,
            used_bak_snapshot,
            started_fresh,
            rebuild,
        })
    }

    /// Identifier of the tenant this handle serves.
    pub fn tenant_id(&self) -> &TenantId {
        &self.tenant_id
    }
    /// Config held by this handle (read from `solo.config.toml` by `open`).
    pub fn config(&self) -> &crate::config::SoloConfig {
        &self.config
    }
    /// Path of the tenant's DB file.
    pub fn db_path(&self) -> &Path {
        &self.db_path
    }
    /// Directory holding this tenant's HNSW snapshot files.
    pub fn snapshot_dir(&self) -> &Path {
        &self.snapshot_dir
    }
    /// Id of the embedder identity resolved for this tenant.
    pub fn embedder_id(&self) -> i64 {
        self.embedder_id
    }
    /// Handle for sending requests to the tenant's writer-actor.
    pub fn write(&self) -> &WriteHandle {
        &self.write
    }
    /// The tenant's reader pool.
    pub fn read(&self) -> &ReaderPool {
        &self.read
    }
    /// The tenant's vector index.
    pub fn hnsw(&self) -> &Arc<dyn VectorIndex + Send + Sync> {
        &self.hnsw
    }
    /// The embedder backend attached to this handle.
    pub fn embedder(&self) -> &Arc<dyn Embedder> {
        &self.embedder
    }
    /// Replay statistics recorded when the handle was opened (advisory).
    pub fn replay(&self) -> &ReplayReport {
        &self.replay
    }
    /// Drift report recorded when the handle was opened (advisory).
    pub fn drift(&self) -> &DriftReport {
        &self.drift
    }
    /// Whether `open` fell back to the `.bak` snapshot.
    pub fn used_bak_snapshot(&self) -> bool {
        self.used_bak_snapshot
    }
    /// Whether `open` found no loadable snapshot and started from an empty
    /// index.
    pub fn started_fresh(&self) -> bool {
        self.started_fresh
    }
    /// Report of the SQL rebuild run at open (default when none ran).
    pub fn rebuild(&self) -> &RebuildReport {
        &self.rebuild
    }
    /// v0.8.0 P4: cloneable async audit writer for query paths.
    pub fn audit(&self) -> &AuditWriter {
        &self.audit
    }

    /// Build a `TenantHandle` directly from pre-made components, bypassing
    /// the `TenantHandle::open` startup chain.
    ///
    /// Intended for test harnesses (in solo-api / solo-query) that wire a
    /// writer, reader pool and HNSW by hand against a non-SQLCipher test
    /// DB, which `open` cannot handle because it assumes a real
    /// SQLCipher-encrypted file.
    ///
    /// The resulting handle carries a no-op audit writer with no drainer
    /// shutdown handle and no retention sweep. Its replay, drift and rebuild
    /// reports are defaults, `used_bak_snapshot` is `false` and
    /// `started_fresh` is `true`.
    ///
    /// Production callers MUST go through `TenantHandle::open` via
    /// `TenantRegistry::get_or_open`.
    #[cfg(any(test, feature = "test-support"))]
    #[allow(clippy::too_many_arguments)]
    pub fn from_parts_for_tests(
        tenant_id: TenantId,
        config: crate::config::SoloConfig,
        db_path: PathBuf,
        snapshot_dir: PathBuf,
        embedder_id: i64,
        hnsw: Arc<dyn VectorIndex + Send + Sync>,
        embedder: Arc<dyn Embedder>,
        write: WriteHandle,
        writer_join: std::thread::JoinHandle<()>,
        read: ReaderPool,
    ) -> Self {
        // A hand-assembled handle ran no startup chain, so the advisory
        // reports are all defaults.
        let replay = ReplayReport::default();
        let drift = DriftReport::default();
        let rebuild = RebuildReport::default();

        TenantHandle {
            tenant_id,
            config,
            db_path,
            snapshot_dir,
            embedder_id,
            hnsw,
            embedder,
            write,
            writer_join: Some(writer_join),
            read,
            // v0.8.0 P4: harnesses get a no-op audit writer by default, so
            // there is no drainer to join and no sweep task to abort.
            audit: AuditWriter::noop(),
            audit_shutdown: Mutex::new(None),
            audit_sweep_handle: Mutex::new(None),
            replay,
            drift,
            used_bak_snapshot: false,
            started_fresh: true,
            rebuild,
        }
    }

    /// Graceful shutdown. Consumes the handle and runs, in order:
    ///
    /// 1. Save a final HNSW snapshot (best-effort; logged on failure).
    ///    Skipped when `save_snapshot` is `false` (used by `solo reembed`,
    ///    which deliberately wipes snapshots).
    /// 2. Abort the background audit retention sweep, if one was spawned.
    /// 3. Drop this handle's `AuditWriter` and wait for the audit drainer
    ///    to exit.
    /// 4. Drop the `WriteHandle` to close the mpsc channel.
    /// 5. Join the writer-actor's OS thread so it completes
    ///    `wal_checkpoint(TRUNCATE)` before this returns.
    /// 6. Drop the reader pool (must happen inside a tokio runtime).
    ///
    /// Always returns `Ok(())`: every failure along the way is logged and
    /// the remaining steps still run, so a problem in one stage cannot
    /// prevent the writer thread from being joined.
    pub async fn shutdown(mut self, save_snapshot: bool) -> Result<()> {
        if save_snapshot
            && let Err(e) = self.write.save_snapshot().await
        {
            tracing::warn!(
                tenant = %self.tenant_id,
                error = %e,
                "tenant shutdown: final snapshot save failed (continuing)"
            );
        }

        // v0.8.0 P4: abort the background retention sweep task (if any).
        // We own `self`, so `get_mut` reaches the slot without locking, and
        // recovering from poisoning means a panic elsewhere can never turn
        // shutdown itself into a panic that skips the writer join below.
        let sweep = self
            .audit_sweep_handle
            .get_mut()
            .unwrap_or_else(std::sync::PoisonError::into_inner)
            .take();
        if let Some(handle) = sweep {
            handle.abort();
        }

        // Drop the AuditWriter and wait for the drainer to flush + exit.
        // Order matters: drop the writer first (so the mpsc channel
        // closes after the in-flight events drain), then join the drainer.
        let audit_shutdown = self
            .audit_shutdown
            .get_mut()
            .unwrap_or_else(std::sync::PoisonError::into_inner)
            .take();
        // The handle's own audit clone would be dropped when `self` drops,
        // but the drainer has to see the close-signal before the join is
        // awaited. Swap in a noop so the field stays valid for the rest of
        // `self`'s drop sequence, and drop the real writer right now.
        drop(std::mem::replace(&mut self.audit, AuditWriter::noop()));
        if let Some(shutdown) = audit_shutdown {
            shutdown.join().await;
        }

        // Drop the handle so the actor exits.
        drop(self.write);
        if let Some(join) = self.writer_join.take() {
            // Join on a blocking task so we don't hold the tokio runtime
            // off its workers while the OS thread is closing files.
            let joined = tokio::task::spawn_blocking(move || {
                if let Err(panic) = join.join() {
                    tracing::error!(?panic, "tenant: writer thread panicked on shutdown");
                }
            })
            .await;
            // A failed join task means the writer thread may not have been
            // waited on; surface that instead of discarding it silently.
            if let Err(e) = joined {
                tracing::warn!(
                    tenant = %self.tenant_id,
                    error = %e,
                    "tenant shutdown: writer join task did not complete"
                );
            }
        }
        // ReaderPool would drop when `self` drops; the explicit drop
        // documents the intended point in the sequence.
        drop(self.read);
        Ok(())
    }
}

/// Walk `episodes WHERE status='forgotten'` and tombstone each rowid in the
/// HNSW. See `startup::rebuild_tombstones_from_sql` for the long
/// explanation; this is the per-tenant copy.
fn rebuild_tombstones_from_sql(
    conn: &Connection,
    hnsw: &dyn VectorIndex,
) -> Result<usize> {
    let mut stmt = conn
        .prepare("SELECT rowid FROM episodes WHERE status = 'forgotten'")
        .map_err(|e| Error::storage(format!("prepare forgotten select: {e}")))?;
    let rows = stmt
        .query_map([], |row| row.get::<_, i64>(0))
        .map_err(|e| Error::storage(format!("query_map forgotten: {e}")))?;
    let mut count = 0usize;
    for r in rows {
        let rowid = r.map_err(|e| Error::storage(format!("forgotten row decode: {e}")))?;
        hnsw.remove(episode_hnsw_id(rowid))?;
        count += 1;
    }
    Ok(count)
}

/// Load a tenant's HNSW index, falling back in three stages: the live
/// snapshot, then the `.bak` snapshot, then a fresh empty index from
/// `factory`. Per-tenant copy of `startup::load_hnsw_with_fallback` with
/// the same logic.
///
/// Returns `(index, used_bak_snapshot, started_fresh)`:
///
///   * live snapshot loaded: `(idx, false, false)`
///   * `.bak` snapshot loaded: `(idx, true, false)`
///   * neither loadable: `(empty, false, true)`
///
/// # Panics
///
/// Panics if both snapshots fail to load and `factory.create(dim)` returns
/// an error.
fn load_hnsw_with_fallback(
    snapshot_dir: &Path,
    factory: &HnswFactory,
    dim: usize,
) -> (HnswIndex, bool, bool) {
    // Stage 1: the live snapshot pair.
    let primary_err = match snapshot::load(snapshot_dir) {
        Ok(live) => {
            tracing::info!(
                snapshot_kind = "live",
                dim = live.dim(),
                len = live.len(),
                "tenant HNSW loaded from live snapshot"
            );
            return (live, false, false);
        }
        Err(e) => e,
    };
    tracing::warn!(error = %primary_err, "tenant: live HNSW snapshot failed; trying .bak");

    // Stage 2: the backup snapshot pair.
    let bak_err = match snapshot::load_bak(snapshot_dir) {
        Ok(backup) => {
            tracing::warn!(
                snapshot_kind = "bak",
                dim = backup.dim(),
                len = backup.len(),
                "tenant HNSW loaded from backup snapshot — investigate the live pair"
            );
            return (backup, true, false);
        }
        Err(e) => e,
    };

    // Stage 3: nothing loadable; hand back an empty index of the right dim.
    tracing::warn!(
        primary = %primary_err,
        bak = %bak_err,
        dim,
        "tenant: no HNSW snapshot available; starting fresh empty index"
    );
    let fresh = factory
        .create(dim)
        .expect("HnswFactory::create with valid dim must succeed");
    (fresh, false, true)
}

/// v0.8.0 P4: spawn a per-tenant background retention sweep, gated on
/// `[audit] retention_days` + `[audit] purge_interval_secs` both being
/// set. Returns `None` when:
///
///   * Either knob is `None` (sweep is opt-in).
///   * `runtime_handle` is `None` (test harnesses without a tokio runtime).
///   * `purge_interval_secs` is `0`. `tokio::time::interval` panics on a
///     zero period, which would kill the task on its first poll; the
///     misconfiguration is logged as a warning instead.
///
/// The spawned task wakes every `purge_interval_secs`, opens its own
/// SQLCipher connection (separate from the writer-actor's), and calls
/// `purge_older_than(now - retention_days * 86400_000)`. Failures are
/// logged + retried at the next tick; we don't crash the task on a
/// transient SQLite error.
///
/// Aborted by `TenantHandle::shutdown`.
fn spawn_audit_sweep(
    tenant_id: &TenantId,
    db_path: &Path,
    key: &KeyMaterial,
    audit_cfg: &crate::config::AuditSettings,
    runtime_handle: Option<TokioHandle>,
) -> Option<tokio::task::JoinHandle<()>> {
    let retention_days = audit_cfg.retention_days?;
    let interval_secs = audit_cfg.purge_interval_secs?;
    let rt = runtime_handle?;

    if interval_secs == 0 {
        tracing::warn!(
            tenant = %tenant_id,
            "audit retention sweep disabled: [audit] purge_interval_secs must be greater than 0"
        );
        return None;
    }

    let tenant = tenant_id.clone();
    let path = db_path.to_path_buf();
    let key = key.clone();
    let interval = std::time::Duration::from_secs(interval_secs);

    Some(rt.spawn(async move {
        let mut ticker = tokio::time::interval(interval);
        // First tick fires immediately; consume + discard so the first
        // real sweep happens AFTER `interval` (not at-startup, when there
        // typically isn't anything past retention yet).
        ticker.tick().await;
        loop {
            ticker.tick().await;
            let cutoff_ms = chrono::Utc::now().timestamp_millis()
                - i64::from(retention_days) * 86_400_000;
            // The blocking closure must own its inputs, so it gets fresh
            // clones each tick; `tenant` is only borrowed for logging.
            let path = path.clone();
            let key = key.clone();
            let outcome = tokio::task::spawn_blocking(move || {
                let mut conn = open_sqlcipher(&path, &key)?;
                purge_older_than(&mut conn, cutoff_ms)
            })
            .await;
            match outcome {
                Ok(Ok(deleted)) if deleted > 0 => tracing::info!(
                    tenant = %tenant,
                    deleted,
                    cutoff_ms,
                    "audit retention sweep purged rows"
                ),
                Ok(Ok(_)) => tracing::debug!(
                    tenant = %tenant,
                    "audit retention sweep ran (nothing to purge)"
                ),
                Ok(Err(e)) => tracing::warn!(
                    tenant = %tenant,
                    error = %e,
                    "audit retention sweep failed (will retry next interval)"
                ),
                Err(e) => tracing::warn!(
                    tenant = %tenant,
                    error = %e,
                    "audit retention sweep join failed"
                ),
            }
        }
    }))
}

/// Look up a tenant's `db_filename` in `tenants_index.db`. Helper for the
/// registry; refuses to open a tenant whose row is missing or whose status
/// is not `'active'`.
///
/// v0.8.1 P3 supersedes this helper inside the registry — `get_or_open`
/// now reads the full record so it can capture `quota_bytes` alongside
/// the filename. Kept here for any future caller that just wants the
/// filename.
#[allow(dead_code)]
pub(crate) fn lookup_tenant_db_filename(
    index: &TenantsIndex,
    tenant_id: &TenantId,
) -> Result<String> {
    let rec = index.lookup(tenant_id)?.ok_or_else(|| {
        Error::not_found(format!("tenant `{tenant_id}` not found in tenants_index"))
    })?;
    if rec.status != crate::tenants::TenantStatus::Active {
        return Err(Error::conflict(format!(
            "tenant `{tenant_id}` has status `{}`; refusing to open",
            rec.status.as_sql_str()
        )));
    }
    Ok(rec.db_filename)
}