kiromi-ai-memory 0.2.2

// SPDX-License-Identifier: Apache-2.0 OR MIT
//! Plan 12 phases D + E + G + H — regeneration operations on
//! [`crate::Memory`].
//!
//! Each operation runs under the tenant write lock. Storage blobs are
//! append-only; only `gc()` deletes. Plan 18 dispatch 3 rewrote
//! `regenerate_embeddings` and `reindex` to flow through the
//! [`crate::index::VectorIndex`] + [`crate::index::LexicalIndex`] traits
//! against the catalog SQLite — no parquet shards, no usearch artefacts.
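//! Atomic guarantee: in `regenerate_embeddings`, each per-row update is a
//! single SQL transaction (catalog row + `memory_vec` row + audit), so a
//! partial run leaves the catalog in a coherent state per memory. `reindex`
//! upserts index rows outside any catalog transaction and can simply be
//! re-run after an interruption.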

use std::collections::BTreeSet;
use std::time::{Duration, Instant};

use crate::audit::AuditOp;
use crate::embedder::{EmbedRole, Embedder};
use crate::error::{Error, Result};
use crate::handle::Memory;
use crate::memory::MemoryId;
use crate::metadata::AuditEntry;
use crate::regen::{GcOpts, GcReport, RegenReport, ReindexReport};
use crate::storage::{StorageEntry, StorageKey};
use crate::summary::Scope;

/// Best-effort utf8 decode of a body blob.
fn body_to_string(bytes: bytes::Bytes) -> Result<String> {
    String::from_utf8(bytes.to_vec()).map_err(|e| Error::storage("non-utf8 body", e))
}

impl Memory {
    /// Plan 12 — repopulate the live vector + lexical indices for everything
    /// under `scope`, treating the catalog SQL state as the source of truth.
    /// Idempotent. Intended for recovery after corruption, an out-of-band
    /// edit, or a backup restore that left the index virtual tables empty.
    ///
    /// Plan 18: for every `memory` row under `scope`, the `memory_vec` entry
    /// is re-upserted from `embedding_blob` (only when `schema_meta` records
    /// a dimension and the row has an embedding) and the `memory_fts` entry
    /// is re-upserted from the body fetched from object storage. Every
    /// non-tombstoned `summary` row under `scope` then gets its
    /// `summary_fts` entry rebuilt from its body and, when an embedder is
    /// configured, its `summary_vec` entry rebuilt from a fresh embedding of
    /// that body. No on-disk shard files are touched — vec0 + FTS5 are the
    /// only indices.
    ///
    /// **Atomicity:** the per-row index upserts go through the
    /// [`crate::index::VectorIndex`] / [`crate::index::LexicalIndex`]
    /// traits **outside** of any catalog transaction. `reindex` never
    /// rewrites the catalog rows themselves, so an interrupted run only
    /// leaves the index rows partially rebuilt and the next `reindex`
    /// finishes the job. The tenant lock is held for the whole call.
    ///
    /// **Errors:** the first storage, decoding, embedding, or index failure
    /// aborts the run and is returned to the caller.
    ///
    /// ```no_run
    /// # async fn _ex(mem: kiromi_ai_memory::Memory) -> kiromi_ai_memory::Result<()> {
    /// use kiromi_ai_memory::summary::Scope;
    /// let r = mem.reindex(Scope::All).await?;
    /// # let _ = r; Ok(()) }
    /// ```
    pub async fn reindex(&self, scope: Scope) -> Result<ReindexReport> {
        let _tenant_guard = self.inner.locks.lock(&self.inner.tenant).await;
        let clock = Instant::now();
        let mut report = ReindexReport::default();

        // Persisted embedding dimension, needed to read `embedding_blob`
        // back. Absent (or unrepresentable) dims skip the vector half below.
        let schema = self.inner.metadata.read_schema_meta().await?;
        let stored_dims: Option<usize> = schema
            .as_ref()
            .and_then(|m| m.embedder_dims)
            .and_then(|d| usize::try_from(d).ok());

        // ----- memories -----
        let memory_rows = self.inner.metadata.memories_in_subtree(&scope).await?;
        let mut partitions_seen = BTreeSet::new();
        for row in &memory_rows {
            partitions_seen.insert(row.partition_path.clone());

            // Vector half — memory_vec is refreshed from the stored embedding.
            if let Some(d) = stored_dims {
                let stored = self.inner.metadata.get_memory_embedding(&row.id, d).await?;
                if let Some(vector) = stored {
                    self.inner
                        .vector_index
                        .upsert_memory(&row.id, &row.partition_path, row.kind.as_ref(), &vector)
                        .await?;
                }
            }

            // Lexical half — memory_fts is refreshed from the body blob.
            let blob_key = StorageKey::new(row.data_path.clone());
            let text = body_to_string(self.inner.storage.get(&blob_key).await?)?;
            self.inner
                .lexical_index
                .upsert_memory(&row.id, &row.partition_path, &text)
                .await?;
            report.memories_indexed += 1;
        }

        // ----- summaries -----
        // Every non-tombstoned row returned by `summaries_in_subtree` is
        // re-upserted; tombstoned rows are skipped.
        let summary_rows = self.inner.metadata.summaries_in_subtree(&scope).await?;
        let root_path = || crate::partition::tenant_root_path().as_str().to_string();
        for s in summary_rows.iter().filter(|s| !s.tombstoned) {
            // Partition and memory subjects index under their own path when
            // one is recorded; everything else falls back to the tenant root.
            let parent_path = match s.subject_kind.as_str() {
                "partition" | "memory" => match s.subject_path.as_ref() {
                    Some(p) => p.as_str().to_string(),
                    None => root_path(),
                },
                _ => root_path(),
            };

            let blob_key = StorageKey::new(s.data_path.clone());
            let text = body_to_string(self.inner.storage.get(&blob_key).await?)?;

            // Vector half — needs a configured embedder; caller-provided
            // vectors are not something `reindex` can obtain yet.
            if let Some(embedder) = self.inner.embedder.as_deref() {
                let mut embedded = embedder
                    .embed(EmbedRole::Document, &[text.as_str()])
                    .await?;
                if let Some(v) = embedded.pop() {
                    let style = crate::summarizer::SummaryStyle::from_persisted(s.style.as_str());
                    self.inner
                        .vector_index
                        .upsert_summary(&s.id, &parent_path, &style, &v)
                        .await?;
                }
            }

            // Lexical half.
            self.inner
                .lexical_index
                .upsert_summary(&s.id, &parent_path, &text)
                .await?;
            report.child_summaries_indexed += 1;
        }

        // `indices_rebuilt` used to count partition_index nodes. For
        // compatibility it is reported as one per touched partition plus one
        // per non-tombstoned summary, and never as zero when any row existed.
        let partition_count = u64::try_from(partitions_seen.len()).unwrap_or(0);
        let live_summary_count =
            u64::try_from(summary_rows.iter().filter(|s| !s.tombstoned).count()).unwrap_or(0);
        let rebuilt = partition_count + live_summary_count;
        let had_rows = !memory_rows.is_empty() || !summary_rows.is_empty();
        report.indices_rebuilt = if rebuilt == 0 && had_rows { 1 } else { rebuilt };

        report.duration_ms = u64::try_from(clock.elapsed().as_millis()).unwrap_or(0);
        Ok(report)
    }

    /// Re-embed every live memory under `scope` with the supplied embedder
    /// and rewrite each affected `memory_vec` row.
    ///
    /// The run as a whole is **not** atomic; each row is. A row's update is
    /// a single SQL transaction (memory row + memory_vec + audit), so a
    /// partial run leaves every successfully-processed row in a coherent
    /// state and failed rows untouched.
    ///
    /// **Cost:** O(memories under scope × embed cost); bodies are embedded
    /// one at a time.
    ///
    /// **Schema meta:** for `Scope::All` / `Scope::Tenant`, and only when a
    /// `schema_meta` row already exists, `schema_meta.embedder_id` is
    /// rewritten to the new embedder after the walk — this happens even if
    /// some rows failed. Narrower scopes leave `schema_meta` alone.
    ///
    /// **Errors:** [`Error::EmbedderDimMismatch`] if the new embedder's
    /// dimensionality differs from `schema_meta.embedder_dims` (changing
    /// dim is not yet supported and would invalidate every persisted vector).
    /// Failures reading `schema_meta`, listing the memories, or writing
    /// `schema_meta` back are returned directly ([`Error::Storage`] /
    /// [`Error::Metadata`]). Per-row failures — body fetch, UTF-8 decode,
    /// embedding ([`Error::Embedder`]), or the row transaction — do not
    /// abort the run: they bump `RegenReport::failed` and the first 32 are
    /// kept in `RegenReport::errors`.
    ///
    /// An `EmbeddingsRegenerated` event is sent at the end; a failed send is
    /// ignored.
    ///
    /// [`Error::EmbedderDimMismatch`]: crate::error::Error::EmbedderDimMismatch
    /// [`Error::Embedder`]: crate::error::Error::Embedder
    /// [`Error::Storage`]: crate::error::Error::Storage
    /// [`Error::Metadata`]: crate::error::Error::Metadata
    ///
    /// ```no_run
    /// # async fn _ex(mem: kiromi_ai_memory::Memory, embedder: &dyn kiromi_ai_memory::Embedder) -> kiromi_ai_memory::Result<()> {
    /// use kiromi_ai_memory::summary::Scope;
    /// let r = mem.regenerate_embeddings(Scope::All, embedder).await?;
    /// # let _ = r; Ok(()) }
    /// ```
    pub async fn regenerate_embeddings(
        &self,
        scope: Scope,
        embedder: &dyn Embedder,
    ) -> Result<RegenReport> {
        // Upper bound on the per-row details kept in `RegenReport::errors`;
        // failures beyond it are still counted in `failed`.
        const MAX_REPORTED_ERRORS: usize = 32;

        // Count one failed row and keep its message while under the cap.
        fn record_failure(report: &mut RegenReport, id: String, cause: String) {
            report.failed += 1;
            if report.errors.len() < MAX_REPORTED_ERRORS {
                report.errors.push((id, cause));
            }
        }

        let _g = self.inner.locks.lock(&self.inner.tenant).await;
        let started = Instant::now();

        // 1. Validate dim match against schema_meta.
        let meta = self.inner.metadata.read_schema_meta().await?;
        let new_dims = embedder.dimensions();
        let new_dims_i64 = i64::try_from(new_dims).unwrap_or(0);
        if let Some(meta) = meta.as_ref()
            && let Some(stored_dims) = meta.embedder_dims
            && stored_dims != new_dims_i64
        {
            return Err(Error::EmbedderDimMismatch {
                old: usize::try_from(stored_dims).unwrap_or(0),
                new: new_dims,
            });
        }

        let new_embedder_id = embedder.id().to_string();
        let mut report = RegenReport::default();

        // 2. Walk live memories under scope and re-embed each. Any failure
        //    is recorded against the row and the walk moves on.
        let mems = self.inner.metadata.memories_in_subtree(&scope).await?;
        for row in &mems {
            // Fetch the body bytes.
            let body = match self
                .inner
                .storage
                .get(&StorageKey::new(row.data_path.clone()))
                .await
            {
                Ok(b) => b,
                Err(e) => {
                    record_failure(&mut report, row.id.to_string(), e.to_string());
                    continue;
                }
            };
            let body_str = match body_to_string(body) {
                Ok(s) => s,
                Err(e) => {
                    record_failure(&mut report, row.id.to_string(), e.to_string());
                    continue;
                }
            };
            // Embed once per row (caller can override at the embedder
            // boundary if they want batching).
            let mut vs = match embedder
                .embed(EmbedRole::Document, &[body_str.as_str()])
                .await
            {
                Ok(v) => v,
                Err(e) => {
                    record_failure(&mut report, row.id.to_string(), e.to_string());
                    continue;
                }
            };
            let Some(vector) = vs.pop() else {
                record_failure(
                    &mut report,
                    row.id.to_string(),
                    "embedder returned empty vector".to_string(),
                );
                continue;
            };
            // Reinterpret the vector's elements as raw bytes. `cast_slice`
            // does no byte swapping, so the blob is in native byte order
            // (little-endian on little-endian targets).
            let blob_bytes: &[u8] = bytemuck::cast_slice(&vector);
            let now = self.inner.clock.now_ms();
            let audit = AuditEntry {
                ts_ms: now,
                actor: self.inner.actor.clone(),
                op: AuditOp::RegenerateEmbeddings,
                partition_path: Some(row.partition_path.clone()),
                memory_id: Some(row.id),
                detail: serde_json::json!({
                    "embedder_id": new_embedder_id,
                    "dims": new_dims,
                }),
            };
            // Row + memory_vec + audit are committed together by the
            // metadata layer.
            match self
                .inner
                .metadata
                .regenerate_memory_embedding(
                    &row.id,
                    &row.partition_path,
                    row.kind,
                    blob_bytes,
                    &new_embedder_id,
                    audit,
                )
                .await
            {
                Ok(()) => report.processed += 1,
                Err(e) => record_failure(&mut report, row.id.to_string(), e.to_string()),
            }
        }

        // 3. Bump schema_meta.embedder_id when scope is tenant-wide. For
        //    Partition / Memory scopes only the affected rows' embedder_id
        //    flipped — schema_meta stays so downstream open() calls continue
        //    to validate against the dominant embedder.
        let now = self.inner.clock.now_ms();
        if matches!(scope, Scope::All | Scope::Tenant)
            && let Some(meta) = meta.as_ref()
        {
            let new_meta = crate::metadata::SchemaMeta {
                partition_scheme: meta.partition_scheme.clone(),
                scheme_version: meta.scheme_version,
                embedder_id: Some(new_embedder_id.clone()),
                embedder_dims: Some(new_dims_i64),
                created_at_ms: meta.created_at_ms,
            };
            self.inner.metadata.write_schema_meta(&new_meta).await?;
        }

        report.duration_ms = u64::try_from(started.elapsed().as_millis()).unwrap_or(0);

        // Best-effort notification: a failed send must not fail the run.
        let _ = self
            .inner
            .event_tx
            .send(crate::event::MemoryEvent::EmbeddingsRegenerated {
                count: report.processed,
                ts_ms: now,
            });

        Ok(report)
    }

    /// Plan 12 — orphan-blob retention sweep. Lists every blob under the
    /// tenant's `data/` and `metadata/` prefixes and treats as an orphan any
    /// key that is not a live `memory.data_path`, a `summary.data_path`, or
    /// a snapshot manifest path (`{tenant}/metadata/{manifest_path}`).
    ///
    /// Plan 18 dispatch 3 — vec0 + FTS5 live in the SQLite catalog, so
    /// gc no longer reaps parquet/usearch shards: it only checks the
    /// storage blobs against the catalog paths listed above.
    ///
    /// **Retention (slice 1 limitation):** the storage trait does not expose
    /// a modification time, so blob age cannot be evaluated. Orphans are
    /// reaped only when `opts.retain_for` is zero; with any non-zero
    /// `retain_for` every orphan is deferred and the report stays empty.
    ///
    /// **Counters:** an orphan whose key contains `/data/` counts as a data
    /// blob, otherwise `/summaries/` counts as a summary blob and
    /// `/snapshots/` as a manifest blob; anything else counts as a data blob.
    ///
    /// `dry_run` returns the same counts without deleting anything and
    /// without writing an audit entry.
    ///
    /// **Errors:** catalog or listing failures are returned before anything
    /// is deleted. A failed delete is returned immediately; blobs removed
    /// before it stay removed and no audit entry is written for that run.
    ///
    /// ```no_run
    /// # async fn _ex(mem: kiromi_ai_memory::Memory) -> kiromi_ai_memory::Result<()> {
    /// use kiromi_ai_memory::regen::GcOpts;
    /// let r = mem.gc(GcOpts::default()).await?;
    /// # let _ = r; Ok(()) }
    /// ```
    pub async fn gc(&self, opts: GcOpts) -> Result<GcReport> {
        let _g = self.inner.locks.lock(&self.inner.tenant).await;
        let now = self.inner.clock.now_ms();
        let tenant_prefix = self.inner.tenant.as_str().to_string();

        let mut report = GcReport::default();

        // Build the live key set from SQL.
        let mut live_keys: BTreeSet<String> = BTreeSet::new();
        let mems = self.inner.metadata.memories_in_subtree(&Scope::All).await?;
        live_keys.extend(mems.iter().map(|m| m.data_path.clone()));
        let summaries = self
            .inner
            .metadata
            .summaries_in_subtree(&Scope::All)
            .await?;
        live_keys.extend(summaries.iter().map(|s| s.data_path.clone()));
        let snaps = self.inner.metadata.list_snapshots().await?;
        // Manifest paths are stored relative; the final key is
        // {tenant}/metadata/{manifest_path}.
        live_keys.extend(
            snaps
                .iter()
                .map(|s| format!("{tenant_prefix}/metadata/{}", s.manifest_path)),
        );

        // Walk data/ and metadata/.
        let data_prefix = StorageKey::new(format!("{tenant_prefix}/data"));
        let meta_prefix = StorageKey::new(format!("{tenant_prefix}/metadata"));
        let mut all_entries: Vec<StorageEntry> =
            self.inner.storage.list_prefix(&data_prefix).await?;
        all_entries.extend(self.inner.storage.list_prefix(&meta_prefix).await?);

        // Without mtime information age cannot be checked, so a non-zero
        // retention window defers every orphan to a later call. The policy
        // is the same for every entry, so it is decided once up front.
        let defer_all = opts.retain_for > Duration::ZERO;

        for ent in all_entries {
            let key_str = ent.key.as_str();
            if defer_all || live_keys.contains(key_str) {
                continue;
            }

            // Plan 18 dispatch 4: the legacy `embeddings/` / `indices/` /
            // `*.usearch` / `*.parquet` shard counter is gone — vec0 + FTS5
            // live inside the SQLite catalog, not in object storage.
            //
            // A `/data/` segment anywhere in the key wins, which also covers
            // every key under `{tenant}/data/`.
            // NOTE(review): a metadata key with a `data` path segment is
            // therefore counted as a data blob — confirm that is intended.
            let under_data = key_str.contains("/data/");
            if !under_data && key_str.contains("/summaries/") {
                report.summary_blobs_removed += 1;
            } else if !under_data && key_str.contains("/snapshots/") {
                report.manifest_blobs_removed += 1;
            } else {
                report.data_blobs_removed += 1;
            }
            report.bytes_freed += ent.size;
            if !opts.dry_run {
                self.inner.storage.delete(&ent.key).await?;
            }
        }

        // Only real sweeps are audited.
        if !opts.dry_run {
            let audit = AuditEntry {
                ts_ms: now,
                actor: self.inner.actor.clone(),
                op: AuditOp::Gc,
                partition_path: None,
                memory_id: None,
                detail: serde_json::json!({
                    "data_blobs_removed": report.data_blobs_removed,
                    "summary_blobs_removed": report.summary_blobs_removed,
                    "manifest_blobs_removed": report.manifest_blobs_removed,
                    "bytes_freed": report.bytes_freed,
                    "dry_run": false,
                }),
            };
            let _ = self.inner.metadata.insert_restore_audit(audit).await?;
        }

        Ok(report)
    }
}

/// Placeholder that names `MemoryId` so the `use crate::memory::MemoryId`
/// import at the top of this module stays referenced (no "unused import"
/// warning) while a downstream use of the type is in flux. The constant is
/// never read, hence the `dead_code` allowance.
///
/// NOTE(review): presumably removable together with the import once the
/// module names `MemoryId` directly — confirm before deleting.
#[allow(dead_code)]
const _UNUSED: Option<MemoryId> = None;