// seshat-storage 0.7.0

//! Repository traits and SQLite implementations for Seshat's knowledge graph.
//!
//! Each trait defines the persistence API for a single entity type. The SQLite
//! implementations operate on the shared `Database` handle.

mod branch_metadata_repository;
mod branch_repository;
pub mod decision_repository;
mod edge_repository;
pub mod embedding_repository;
mod file_ir_repository;
mod node_repository;
mod package_metadata_repository;
mod repo_metadata_repository;
mod submodule_repository;
mod symbol_index_repository;

pub use branch_metadata_repository::SqliteBranchMetadataRepository;
pub use branch_repository::SqliteBranchRepository;
pub use decision_repository::{
    Decision, DecisionNature, DecisionState, DecisionWeight, ExampleEvidence,
    SqliteDecisionRepository,
};
pub use edge_repository::SqliteEdgeRepository;
pub use embedding_repository::{
    EmbeddingInput, EmbeddingRow, SqliteEmbeddingRepository, bytes_to_f32s, f32s_to_bytes,
};
pub use file_ir_repository::SqliteFileIRRepository;
pub use node_repository::SqliteNodeRepository;
pub use package_metadata_repository::{PackageMetadataRow, SqlitePackageMetadataRepository};
pub use repo_metadata_repository::SqliteRepoMetadataRepository;
pub use submodule_repository::{SqliteSubmoduleRepository, SubmoduleInput, SubmoduleRow};
pub use symbol_index_repository::{
    SqliteSymbolIndexRepository, SymbolDefinitionRow, SymbolImportRow, SymbolKind,
    extract_definitions, extract_imports,
};

use std::collections::HashMap;
use std::sync::{Arc, Mutex, MutexGuard};

use rusqlite::Connection;

use crate::StorageError;
use seshat_core::{
    BranchId, Edge, EdgeId, EdgeType, KnowledgeNature, KnowledgeNode, NodeId, ProjectFile,
};

/// Lock the shared SQLite [`Connection`] and hand back its guard.
///
/// The SQLite repositories hold their connection as `Arc<Mutex<Connection>>`;
/// routing the lock through this one function keeps them from each carrying
/// their own copy of the same `conn()` accessor.
///
/// # Errors
///
/// If the mutex is poisoned, the poison error is rendered into the message
/// of a [`StorageError::QueryError`] rather than being propagated as-is.
pub(crate) fn lock_conn(
    conn: &Arc<Mutex<Connection>>,
) -> Result<MutexGuard<'_, Connection>, StorageError> {
    match conn.lock() {
        Ok(guard) => Ok(guard),
        Err(poisoned) => Err(StorageError::QueryError(format!(
            "Failed to acquire connection lock: {poisoned}"
        ))),
    }
}

/// Persistence operations for [`KnowledgeNode`]s.
///
/// Every method borrows the repository immutably (`&self`) and reports
/// failures as [`StorageError`]. Branch-scoped operations take the branch as
/// a borrowed [`BranchId`].
pub trait NodeRepository {
    /// Insert a new node. Returns the node with its assigned ID.
    ///
    /// The input is only borrowed; the returned value is a separate owned
    /// [`KnowledgeNode`].
    fn insert(&self, node: &KnowledgeNode) -> Result<KnowledgeNode, StorageError>;

    /// Get a node by its ID.
    ///
    /// The return type carries no `Option`, so a missing node cannot be
    /// reported as `Ok(None)`.
    /// NOTE(review): presumably surfaces as an error such as
    /// `StorageError::NotFound` — confirm against the implementation.
    fn get_by_id(&self, id: NodeId) -> Result<KnowledgeNode, StorageError>;

    /// Find all nodes with the given nature.
    ///
    /// The signature takes no [`BranchId`].
    /// NOTE(review): whether results span every branch or only the current
    /// one is decided by the implementation — confirm before relying on it.
    fn find_by_nature(&self, nature: KnowledgeNature) -> Result<Vec<KnowledgeNode>, StorageError>;

    /// Find all nodes belonging to the given branch.
    fn find_by_branch(&self, branch_id: &BranchId) -> Result<Vec<KnowledgeNode>, StorageError>;

    /// Update an existing node. The node's `id` field identifies which row to update.
    fn update(&self, node: &KnowledgeNode) -> Result<(), StorageError>;

    /// Delete a node by its ID.
    fn delete(&self, id: NodeId) -> Result<(), StorageError>;

    /// Delete all nodes for the given branch. Returns the number of rows deleted.
    fn delete_by_branch(&self, branch_id: &BranchId) -> Result<usize, StorageError>;

    /// Delete only `fact` nodes for a branch (module structure, documentation).
    ///
    /// Preserves `convention`, `observation`, and user-recorded decision nodes.
    /// Use this instead of `delete_by_branch` when rebuilding module graphs
    /// to avoid wiping user-confirmed conventions.
    ///
    /// NOTE(review): the `usize` result is presumably the number of rows
    /// deleted, matching [`delete_by_branch`](Self::delete_by_branch) — confirm.
    fn delete_facts_by_branch(&self, branch_id: &BranchId) -> Result<usize, StorageError>;

    /// Delete auto-detected convention nodes for a branch.
    ///
    /// Only removes nodes where `ext_data` contains `"source": "auto_detected"`.
    /// User-recorded decisions (`"source": "user"`) are preserved.
    /// Returns the number of rows deleted.
    fn delete_auto_detected_by_branch(&self, branch_id: &BranchId) -> Result<usize, StorageError>;

    /// Find all convention nodes for the given branch.
    ///
    /// Returns nodes where `ext_data` contains `"source": "auto_detected"` or
    /// `"source": "user"` (i.e., convention-related nodes, not module/doc facts).
    fn find_conventions_by_branch(
        &self,
        branch_id: &BranchId,
    ) -> Result<Vec<KnowledgeNode>, StorageError>;
}

/// Persistence operations for [`Edge`]s.
///
/// This trait exposes no lookup by [`EdgeId`] and no update operation: edges
/// are located through their endpoints or their type, and removed either
/// individually or per branch.
pub trait EdgeRepository {
    /// Insert a new edge. Returns the edge with its assigned ID.
    ///
    /// The input is only borrowed; the returned value is a separate owned
    /// [`Edge`].
    fn insert(&self, edge: &Edge) -> Result<Edge, StorageError>;

    /// Find all edges originating from the given source node.
    fn find_by_source(&self, source_id: NodeId) -> Result<Vec<Edge>, StorageError>;

    /// Find all edges targeting the given node.
    fn find_by_target(&self, target_id: NodeId) -> Result<Vec<Edge>, StorageError>;

    /// Find all edges of the given type.
    ///
    /// The signature takes no [`BranchId`].
    /// NOTE(review): any branch scoping is up to the implementation — confirm.
    fn find_by_type(&self, edge_type: EdgeType) -> Result<Vec<Edge>, StorageError>;

    /// Delete an edge by its ID.
    fn delete(&self, id: EdgeId) -> Result<(), StorageError>;

    /// Delete all edges for the given branch. Returns the number of rows deleted.
    fn delete_by_branch(&self, branch_id: &BranchId) -> Result<usize, StorageError>;
}

/// Persistence operations for file IR records (parsed source file cache).
///
/// Records are addressed by `(branch_id, file_path)`. Two write paths exist
/// side by side: the plain [`upsert`](Self::upsert) /
/// [`delete_by_path`](Self::delete_by_path) pair, and the
/// `*_with_symbol_index` pair that additionally maintains the
/// `symbol_definitions` / `symbol_imports` rows in the same transaction.
pub trait FileIRRepository {
    /// Insert or update a file IR record. Uses `(branch_id, file_path)` as the
    /// natural key — if a row already exists, it is replaced.
    ///
    /// `last_commit_date` is the Unix timestamp of the most recent git commit
    /// that touched this file (from `collect_git_file_dates`). `None` means
    /// the project is not a git repo or the file has no commit history.
    fn upsert(
        &self,
        branch_id: &BranchId,
        file: &ProjectFile,
        last_commit_date: Option<i64>,
    ) -> Result<(), StorageError>;

    /// Insert or update a file IR record **and** replace the matching
    /// `symbol_definitions` / `symbol_imports` rows in a single transaction.
    ///
    /// Either every write commits, or none of them do.  Used by the scanner
    /// and the watcher hot tier so the symbol-index stays consistent with
    /// `files_ir` even if a write fails partway through.
    ///
    /// Definitions and imports are extracted from `file` via
    /// [`extract_definitions`] / [`extract_imports`].
    ///
    /// `last_commit_date` has the same type as in [`upsert`](Self::upsert).
    fn upsert_with_symbol_index(
        &self,
        branch_id: &BranchId,
        file: &ProjectFile,
        last_commit_date: Option<i64>,
    ) -> Result<(), StorageError>;

    /// Get the IR for a file by its path within a branch.
    ///
    /// The return type carries no `Option`, so a missing record cannot be
    /// reported as `Ok(None)`.
    /// NOTE(review): presumably surfaces as `StorageError::NotFound` —
    /// confirm against the implementation.
    fn get_by_path(
        &self,
        branch_id: &BranchId,
        file_path: &str,
    ) -> Result<ProjectFile, StorageError>;

    /// Get all file IR records for the given branch.
    fn get_by_branch(&self, branch_id: &BranchId) -> Result<Vec<ProjectFile>, StorageError>;

    /// Get all `(file_path, content_hash)` pairs for a branch.
    ///
    /// This is more efficient than [`get_by_branch`](Self::get_by_branch) when you only need
    /// path + hash for incremental comparison (avoids deserializing the full IR).
    fn get_file_hashes_by_branch(
        &self,
        branch_id: &BranchId,
    ) -> Result<HashMap<String, String>, StorageError>;

    /// Delete the IR record for a file path within a branch.
    ///
    /// NOTE(review): the outcome when no row matches is not stated here
    /// (contrast with [`delete_with_symbol_index`](Self::delete_with_symbol_index),
    /// which documents `StorageError::NotFound`) — confirm against the
    /// implementation.
    fn delete_by_path(&self, branch_id: &BranchId, file_path: &str) -> Result<(), StorageError>;

    /// Delete the `files_ir` row **and** every matching `symbol_definitions` /
    /// `symbol_imports` row for `(branch_id, file_path)` in a single
    /// transaction.  Pairs with [`Self::upsert_with_symbol_index`] so the
    /// watcher / scanner have one atomic write path for both add/modify and
    /// delete — readers cannot observe `files_ir` gone while symbol-index
    /// rows linger (or vice versa).
    ///
    /// Returns [`StorageError::NotFound`] if no `files_ir` row matched; the
    /// symbol-index DELETEs are still attempted inside the same transaction
    /// (orphan symbol rows from an earlier non-atomic write are cleaned up).
    fn delete_with_symbol_index(
        &self,
        branch_id: &BranchId,
        file_path: &str,
    ) -> Result<(), StorageError>;

    /// Check whether the stored content hash matches the given hash.
    /// Returns `true` if a record exists and the hash matches, `false` otherwise.
    ///
    /// A missing record is therefore `Ok(false)`, not an error; `Err` is
    /// left for storage failures.
    fn check_content_hash(
        &self,
        branch_id: &BranchId,
        file_path: &str,
        content_hash: &str,
    ) -> Result<bool, StorageError>;

    /// Get all `(file_path, last_commit_date)` pairs for a branch.
    ///
    /// Returns a map of file paths to their most recent git commit timestamps.
    /// Files without a recorded date are included with `None`.
    fn get_file_dates_by_branch(
        &self,
        branch_id: &BranchId,
    ) -> Result<HashMap<String, Option<i64>>, StorageError>;

    /// Update `convention_compliance_count` for multiple files in a single
    /// transaction.
    ///
    /// `counts` maps `file_path` → compliance count (number of
    /// `follows_convention == true` findings for that file).
    ///
    /// NOTE(review): how entries whose `file_path` has no stored row are
    /// treated (skipped vs. error) is not visible here — confirm.
    fn update_convention_compliance_counts(
        &self,
        branch_id: &BranchId,
        counts: &HashMap<String, u32>,
    ) -> Result<(), StorageError>;
}

/// Persistence operations for branch management.
///
/// Branch snapshots work by copying all nodes, edges, and files_ir rows with a
/// new `branch_id`. The current branch is tracked in the `metadata` table.
///
/// The trait also carries the per-branch "last scanned commit" sentinel
/// ([`get_last_scanned_commit`](Self::get_last_scanned_commit),
/// [`set_last_scanned_commit`](Self::set_last_scanned_commit),
/// [`ensure_branch_exists`](Self::ensure_branch_exists)).
pub trait BranchRepository {
    /// Create a snapshot of the source branch under a new branch name.
    /// Copies all nodes, edges, and files_ir rows in a single transaction.
    ///
    /// NOTE(review): the behavior when `new_branch` already holds data, or
    /// when `source_branch` is empty, is not visible here — confirm against
    /// the implementation.
    fn create_snapshot(
        &self,
        source_branch: &BranchId,
        new_branch: &BranchId,
    ) -> Result<(), StorageError>;

    /// Switch the current branch to the given branch.
    fn switch_branch(&self, branch_id: &BranchId) -> Result<(), StorageError>;

    /// Delete all data associated with the given branch.
    fn delete_branch(&self, branch_id: &BranchId) -> Result<(), StorageError>;

    /// List all distinct branch IDs present in the database.
    fn list_branches(&self) -> Result<Vec<BranchId>, StorageError>;

    /// Get the current branch. Returns the branch stored in the metadata table,
    /// or a default of `"main"` if no current branch has been set.
    fn get_current_branch(&self) -> Result<BranchId, StorageError>;

    /// Read the last commit SHA recorded for a branch (sentinel for the
    /// `seshat serve` / `seshat review` startup freshness check).
    /// Returns `None` if the branch has no recorded commit yet.
    ///
    /// The SHA is returned as an owned `String`; this signature does not
    /// validate its format.
    fn get_last_scanned_commit(&self, branch_id: &BranchId)
    -> Result<Option<String>, StorageError>;

    /// Record the latest commit SHA for a branch and bump `last_scanned_at`
    /// to the current Unix time. UPSERTs the `branches` row.
    fn set_last_scanned_commit(
        &self,
        branch_id: &BranchId,
        commit: &str,
    ) -> Result<(), StorageError>;

    /// Idempotent `INSERT OR IGNORE` of a branch row, used so freshness
    /// checks can rely on the sentinel always existing.
    fn ensure_branch_exists(&self, branch_id: &BranchId) -> Result<(), StorageError>;
}

/// Persistence operations for [`Decision`]s — user-recorded knowledge
/// keyed project-wide by `description_hash`.
///
/// No method here takes a branch identifier, which matches the
/// "project-wide" keying described above.
pub trait DecisionRepository {
    /// UPSERT a decision row keyed by `description_hash`.
    fn upsert(&self, decision: &Decision) -> Result<(), StorageError>;

    /// Look up a single decision by hash.
    ///
    /// A missing row is `Ok(None)`, not an error, as the `Option` in the
    /// return type indicates.
    fn get_by_hash(&self, hash: &str) -> Result<Option<Decision>, StorageError>;

    /// Bulk lookup of decisions by a slice of hashes (chunked internally
    /// at 500 hashes per `IN (...)` SELECT — comfortably under SQLite's
    /// `SQLITE_MAX_VARIABLE_NUMBER` on either old (999) or new (32766) builds).
    ///
    /// NOTE(review): the map is presumably keyed by `description_hash`, with
    /// hashes that match no row simply absent from it — confirm against the
    /// implementation.
    fn get_by_hashes(&self, hashes: &[&str]) -> Result<HashMap<String, Decision>, StorageError>;

    /// Delete the decision row with the given hash.
    ///
    /// NOTE(review): whether deleting a non-existent hash is an error or a
    /// no-op is not visible here — confirm.
    fn delete(&self, hash: &str) -> Result<(), StorageError>;

    /// Find decisions whose `description_hash` starts with `prefix`.
    ///
    /// Used by `seshat decisions forget <prefix>` for the prefix-lookup
    /// path. Implementations should push the filter down to the index
    /// (`WHERE description_hash GLOB 'prefix*'`) instead of materialising
    /// the full table and filtering in Rust — the PK index makes the
    /// SQL form `O(matching_rows)` rather than `O(total_rows)`.
    fn find_by_hash_prefix(&self, prefix: &str) -> Result<Vec<Decision>, StorageError>;

    /// Atomically migrate a decision from `old_hash` to the PK carried by
    /// `new_decision.description_hash`. The two writes happen inside a
    /// single transaction so a crash between the DELETE and the INSERT
    /// cannot lose the row.
    ///
    /// Use this when a content-derived PK has to follow a content change —
    /// e.g. `update_decision` rewrites the `description`, so the
    /// `description_hash` recomputes to a different value and the row's
    /// identity has to migrate accordingly.
    ///
    /// # Errors
    /// - `StorageError::Sqlite` with a UNIQUE constraint failure if a row
    ///   already lives at `new_decision.description_hash` — the caller
    ///   should pre-check and surface a domain-specific error.
    /// - Other storage errors propagate as usual.
    fn rekey(&self, old_hash: &str, new_decision: &Decision) -> Result<(), StorageError>;

    /// Count rows with the given `state`.
    fn count_by_state(&self, state: DecisionState) -> Result<usize, StorageError>;

    /// List all decisions, ordered by `decided_at DESC`.
    fn list(&self) -> Result<Vec<Decision>, StorageError>;

    /// List decisions filtered by `state`, ordered by `decided_at DESC`.
    fn list_by_state(&self, state: DecisionState) -> Result<Vec<Decision>, StorageError>;
}

/// Persistence operations for package registry metadata cache.
///
/// Stores categories, keywords, and descriptions fetched from package registries
/// (crates.io, npm, PyPI) keyed by `(name, registry)`.
///
/// The registry is passed as a plain `&str` throughout; this trait does not
/// constrain which registry names are valid.
pub trait PackageMetadataRepository {
    /// Insert or update a package metadata row. Uses `(name, registry)` as the
    /// natural key — if a row already exists, it is replaced.
    fn upsert(&self, row: &PackageMetadataRow) -> Result<(), StorageError>;

    /// Get metadata for a package from a specific registry.
    /// Returns `None` if no cached entry exists.
    fn get(&self, name: &str, registry: &str) -> Result<Option<PackageMetadataRow>, StorageError>;

    /// Get all cached metadata entries for a specific registry.
    ///
    /// NOTE(review): result ordering is not specified here — do not rely on
    /// it without checking the implementation.
    fn get_by_registry(&self, registry: &str) -> Result<Vec<PackageMetadataRow>, StorageError>;

    /// Delete entries with `fetched_at` older than the given Unix timestamp.
    /// Returns the number of rows deleted.
    ///
    /// The signature takes no registry argument, so the cutoff is not
    /// limited to a single registry by the caller.
    fn delete_stale(&self, before_timestamp: i64) -> Result<usize, StorageError>;
}

/// Persistence operations for submodule records.
///
/// Tracks git submodules linked to a parent project, each with a dedicated DB.
///
/// Records are addressed by `relative_path`. Writes take a
/// [`SubmoduleInput`]; reads return the stored [`SubmoduleRow`].
pub trait SubmoduleRepository {
    /// Insert a new submodule record. Returns the full row (with generated `id`
    /// and timestamps).
    fn insert(&self, input: &SubmoduleInput) -> Result<SubmoduleRow, StorageError>;

    /// Update an existing submodule by its `relative_path`.
    ///
    /// NOTE(review): the outcome when no record exists for that path
    /// (error vs. no-op) is not visible here — confirm.
    fn update(&self, input: &SubmoduleInput) -> Result<(), StorageError>;

    /// Insert or update a submodule record atomically.
    ///
    /// Uses `INSERT ... ON CONFLICT(relative_path) DO UPDATE` so the caller
    /// doesn't need a separate try-update-then-insert pattern.
    ///
    /// Unlike [`insert`](Self::insert), this returns `()` rather than the
    /// stored row; use [`find_by_path`](Self::find_by_path) to read it back.
    fn upsert(&self, input: &SubmoduleInput) -> Result<(), StorageError>;

    /// Delete a submodule record by its `relative_path`.
    fn delete(&self, relative_path: &str) -> Result<(), StorageError>;

    /// List all submodules, sorted by `relative_path`.
    fn list(&self) -> Result<Vec<SubmoduleRow>, StorageError>;

    /// Find a submodule by its mount path relative to the repo root.
    /// Returns `None` if no record exists for this path.
    fn find_by_path(&self, relative_path: &str) -> Result<Option<SubmoduleRow>, StorageError>;
}

/// Persistence operations for code embedding vectors.
///
/// Stores per-item (function, type, export) embeddings generated during
/// `seshat scan` when an embedding provider is configured. When the
/// `[embedding]` config section is absent, this table remains empty.
///
/// Note that every method here takes the branch as a plain `&str`, whereas
/// the node, edge, file-IR and symbol-index traits in this module take
/// `&BranchId`.
pub trait EmbeddingRepository {
    /// Insert or update a single embedding.
    fn upsert(&self, branch_id: &str, input: &EmbeddingInput) -> Result<(), StorageError>;

    /// Insert or update a batch of embeddings in a single transaction.
    fn upsert_batch(&self, branch_id: &str, inputs: &[EmbeddingInput]) -> Result<(), StorageError>;

    /// Get all embeddings for a branch.
    fn get_by_branch(&self, branch_id: &str) -> Result<Vec<EmbeddingRow>, StorageError>;

    /// Get embeddings for a specific file within a branch.
    fn get_by_file(
        &self,
        branch_id: &str,
        file_path: &str,
    ) -> Result<Vec<EmbeddingRow>, StorageError>;

    /// Delete all embeddings for a specific file within a branch.
    /// Returns the number of rows deleted.
    fn delete_by_file(&self, branch_id: &str, file_path: &str) -> Result<usize, StorageError>;

    /// Delete all embeddings for a branch. Returns the number of rows deleted.
    fn delete_by_branch(&self, branch_id: &str) -> Result<usize, StorageError>;

    /// Count embeddings for a branch.
    fn count_by_branch(&self, branch_id: &str) -> Result<usize, StorageError>;

    /// Get all (file_path, item_name, item_kind) keys stored for a branch.
    fn get_stored_keys(
        &self,
        branch_id: &str,
    ) -> Result<Vec<(String, String, String)>, StorageError>;

    /// Delete embedding rows identified by the given composite keys.
    ///
    /// Deletes in batches of 100 per transaction. Returns total rows deleted.
    ///
    /// NOTE(review): each tuple is presumably `(file_path, item_name,
    /// item_kind)`, the same order [`get_stored_keys`](Self::get_stored_keys)
    /// returns — confirm against the implementation.
    fn delete_stale(
        &self,
        branch_id: &str,
        stale_keys: &[(String, String, String)],
    ) -> Result<usize, StorageError>;
}

/// Persistence operations for the per-symbol index (V13).
///
/// `symbol_definitions` and `symbol_imports` are the back-end for
/// `query_code_pattern`'s O(log N) name lookup — they replace the previous
/// scan-every-IR-blob path.  The two tables are updated together so they
/// stay consistent with `files_ir`: the writer always replaces both halves
/// for a given `(branch_id, file_path)` in a single transaction.
///
/// The row types are written here with their full
/// `symbol_index_repository::` path; they are the same [`SymbolDefinitionRow`]
/// and [`SymbolImportRow`] that this module re-exports.
pub trait SymbolIndexRepository {
    /// Replace every symbol-definition and symbol-import row for the given
    /// `(branch_id, file_path)` with the supplied lists, atomically.
    ///
    /// Used by both the full-scan path and the hot-tier watcher.  Idempotent:
    /// calling with the same inputs twice leaves the same row set behind.
    fn replace_file(
        &self,
        branch_id: &BranchId,
        file_path: &str,
        definitions: &[symbol_index_repository::SymbolDefinitionRow],
        imports: &[symbol_index_repository::SymbolImportRow],
    ) -> Result<(), StorageError>;

    /// Drop all symbol-definition and symbol-import rows for a deleted file.
    fn delete_file(&self, branch_id: &BranchId, file_path: &str) -> Result<(), StorageError>;

    /// Drop every symbol-definition and symbol-import row for a branch.
    /// Used when a branch is wiped (`delete_branch`) or rebuilt from scratch.
    fn delete_branch(&self, branch_id: &BranchId) -> Result<(), StorageError>;

    /// Count `symbol_definitions` rows for a branch — primarily for tests
    /// and for the post-migration backfill gate.
    fn count_definitions(&self, branch_id: &BranchId) -> Result<usize, StorageError>;

    /// Return every `symbol_definitions` row recorded for a single file on a
    /// branch. Used to anchor a recorded decision to a concrete code snippet
    /// when the caller supplies a file but no snippet.
    ///
    /// NOTE(review): result ordering is not specified here — confirm before
    /// depending on it.
    fn definitions_for_file(
        &self,
        branch_id: &BranchId,
        file_path: &str,
    ) -> Result<Vec<symbol_index_repository::SymbolDefinitionRow>, StorageError>;

    /// Count `symbol_imports` rows for a branch.
    fn count_imports(&self, branch_id: &BranchId) -> Result<usize, StorageError>;
}

/// Persistence operations for per-branch key-value metadata.
///
/// Stores per-branch state that must not bleed across branches (e.g.
/// `workspace_crates`). Rows are keyed by `(branch_id, key)` and FK-cascade
/// with the parent branch — see migration V14.
///
/// This is the per-branch counterpart of [`RepoMetadataRepository`]: prefer
/// this trait for anything whose value depends on the currently-scanned
/// branch.
///
/// Branch, key and value are all passed as plain `&str`; any richer encoding
/// of a value is the caller's concern.
pub trait BranchMetadataRepository {
    /// Get the value for `(branch_id, key)`. Returns `None` if the row does
    /// not exist.
    fn get(&self, branch_id: &str, key: &str) -> Result<Option<String>, StorageError>;

    /// UPSERT a `(branch_id, key, value)` triple. Overwrites the existing
    /// value (and refreshes `updated_at`) on conflict.
    fn set(&self, branch_id: &str, key: &str, value: &str) -> Result<(), StorageError>;

    /// List every `(key, value)` pair stored under `branch_id`, ordered by
    /// `key`. Returns an empty vec if the branch has no metadata.
    fn list(&self, branch_id: &str) -> Result<Vec<(String, String)>, StorageError>;

    /// Delete the row identified by `(branch_id, key)`. No-op when the row
    /// does not exist.
    fn delete(&self, branch_id: &str, key: &str) -> Result<(), StorageError>;
}

/// Persistence operations for repo-level key-value metadata.
///
/// Stores lightweight metadata like `project_name`, `last_scan_time`,
/// `file_count`, `convention_count`, etc.
///
/// Keys and values are plain strings, so numeric entries such as
/// `file_count` have to be parsed by the caller. This trait declares no
/// delete operation.
pub trait RepoMetadataRepository {
    /// Get the value for a key. Returns `None` if the key does not exist.
    fn get(&self, key: &str) -> Result<Option<String>, StorageError>;

    /// Set a key-value pair. Overwrites if the key already exists.
    fn set(&self, key: &str, value: &str) -> Result<(), StorageError>;

    /// Get all key-value pairs, sorted by key.
    ///
    /// Each tuple is `(key, value)`.
    fn get_all(&self) -> Result<Vec<(String, String)>, StorageError>;
}