Skip to main content

seshat_storage/repository/
mod.rs

1//! Repository traits and SQLite implementations for Seshat's knowledge graph.
2//!
3//! Each trait defines the persistence API for a single entity type. The SQLite
4//! implementations operate on the shared `Database` handle.
5
6mod branch_metadata_repository;
7mod branch_repository;
8pub mod decision_repository;
9mod edge_repository;
10pub mod embedding_repository;
11mod file_ir_repository;
12mod node_repository;
13mod package_metadata_repository;
14mod repo_metadata_repository;
15mod submodule_repository;
16mod symbol_index_repository;
17
18pub use branch_metadata_repository::SqliteBranchMetadataRepository;
19pub use branch_repository::SqliteBranchRepository;
20pub use decision_repository::{
21    Decision, DecisionNature, DecisionState, DecisionWeight, ExampleEvidence,
22    SqliteDecisionRepository,
23};
24pub use edge_repository::SqliteEdgeRepository;
25pub use embedding_repository::{
26    EmbeddingInput, EmbeddingRow, SqliteEmbeddingRepository, bytes_to_f32s, f32s_to_bytes,
27};
28pub use file_ir_repository::SqliteFileIRRepository;
29pub use node_repository::SqliteNodeRepository;
30pub use package_metadata_repository::{PackageMetadataRow, SqlitePackageMetadataRepository};
31pub use repo_metadata_repository::SqliteRepoMetadataRepository;
32pub use submodule_repository::{SqliteSubmoduleRepository, SubmoduleInput, SubmoduleRow};
33pub use symbol_index_repository::{
34    SqliteSymbolIndexRepository, SymbolDefinitionRow, SymbolImportRow, SymbolKind,
35    extract_definitions, extract_imports,
36};
37
38use std::collections::HashMap;
39use std::sync::{Arc, Mutex, MutexGuard};
40
41use rusqlite::Connection;
42
43use crate::StorageError;
44use seshat_core::{
45    BranchId, Edge, EdgeId, EdgeType, KnowledgeNature, KnowledgeNode, NodeId, ProjectFile,
46};
47
48/// Acquire a lock on a shared `Connection`, mapping poisoned-mutex errors
49/// to [`StorageError`].
50///
51/// All SQLite repository implementations use `Arc<Mutex<Connection>>`.
52/// This helper eliminates the identical `conn()` method from each one.
53pub(crate) fn lock_conn(
54    conn: &Arc<Mutex<Connection>>,
55) -> Result<MutexGuard<'_, Connection>, StorageError> {
56    conn.lock()
57        .map_err(|e| StorageError::QueryError(format!("Failed to acquire connection lock: {e}")))
58}
59
60/// Persistence operations for [`KnowledgeNode`]s.
61pub trait NodeRepository {
62    /// Insert a new node. Returns the node with its assigned ID.
63    fn insert(&self, node: &KnowledgeNode) -> Result<KnowledgeNode, StorageError>;
64
65    /// Get a node by its ID.
66    fn get_by_id(&self, id: NodeId) -> Result<KnowledgeNode, StorageError>;
67
68    /// Find all nodes with the given nature.
69    fn find_by_nature(&self, nature: KnowledgeNature) -> Result<Vec<KnowledgeNode>, StorageError>;
70
71    /// Find all nodes belonging to the given branch.
72    fn find_by_branch(&self, branch_id: &BranchId) -> Result<Vec<KnowledgeNode>, StorageError>;
73
74    /// Update an existing node. The node's `id` field identifies which row to update.
75    fn update(&self, node: &KnowledgeNode) -> Result<(), StorageError>;
76
77    /// Delete a node by its ID.
78    fn delete(&self, id: NodeId) -> Result<(), StorageError>;
79
80    /// Delete all nodes for the given branch. Returns the number of rows deleted.
81    fn delete_by_branch(&self, branch_id: &BranchId) -> Result<usize, StorageError>;
82
83    /// Delete only `fact` nodes for a branch (module structure, documentation).
84    ///
85    /// Preserves `convention`, `observation`, and user-recorded decision nodes.
86    /// Use this instead of `delete_by_branch` when rebuilding module graphs
87    /// to avoid wiping user-confirmed conventions.
88    fn delete_facts_by_branch(&self, branch_id: &BranchId) -> Result<usize, StorageError>;
89
90    /// Delete auto-detected convention nodes for a branch.
91    ///
92    /// Only removes nodes where `ext_data` contains `"source": "auto_detected"`.
93    /// User-recorded decisions (`"source": "user"`) are preserved.
94    /// Returns the number of rows deleted.
95    fn delete_auto_detected_by_branch(&self, branch_id: &BranchId) -> Result<usize, StorageError>;
96
97    /// Find all convention nodes for the given branch.
98    ///
99    /// Returns nodes where `ext_data` contains `"source": "auto_detected"` or
100    /// `"source": "user"` (i.e., convention-related nodes, not module/doc facts).
101    fn find_conventions_by_branch(
102        &self,
103        branch_id: &BranchId,
104    ) -> Result<Vec<KnowledgeNode>, StorageError>;
105}
106
107/// Persistence operations for [`Edge`]s.
108pub trait EdgeRepository {
109    /// Insert a new edge. Returns the edge with its assigned ID.
110    fn insert(&self, edge: &Edge) -> Result<Edge, StorageError>;
111
112    /// Find all edges originating from the given source node.
113    fn find_by_source(&self, source_id: NodeId) -> Result<Vec<Edge>, StorageError>;
114
115    /// Find all edges targeting the given node.
116    fn find_by_target(&self, target_id: NodeId) -> Result<Vec<Edge>, StorageError>;
117
118    /// Find all edges of the given type.
119    fn find_by_type(&self, edge_type: EdgeType) -> Result<Vec<Edge>, StorageError>;
120
121    /// Delete an edge by its ID.
122    fn delete(&self, id: EdgeId) -> Result<(), StorageError>;
123
124    /// Delete all edges for the given branch. Returns the number of rows deleted.
125    fn delete_by_branch(&self, branch_id: &BranchId) -> Result<usize, StorageError>;
126}
127
128/// Persistence operations for file IR records (parsed source file cache).
129pub trait FileIRRepository {
130    /// Insert or update a file IR record. Uses `(branch_id, file_path)` as the
131    /// natural key — if a row already exists, it is replaced.
132    ///
133    /// `last_commit_date` is the Unix timestamp of the most recent git commit
134    /// that touched this file (from `collect_git_file_dates`). `None` means
135    /// the project is not a git repo or the file has no commit history.
136    fn upsert(
137        &self,
138        branch_id: &BranchId,
139        file: &ProjectFile,
140        last_commit_date: Option<i64>,
141    ) -> Result<(), StorageError>;
142
143    /// Insert or update a file IR record **and** replace the matching
144    /// `symbol_definitions` / `symbol_imports` rows in a single transaction.
145    ///
146    /// Either every write commits, or none of them do.  Used by the scanner
147    /// and the watcher hot tier so the symbol-index stays consistent with
148    /// `files_ir` even if a write fails partway through.
149    ///
150    /// Definitions and imports are extracted from `file` via
151    /// [`extract_definitions`] / [`extract_imports`].
152    fn upsert_with_symbol_index(
153        &self,
154        branch_id: &BranchId,
155        file: &ProjectFile,
156        last_commit_date: Option<i64>,
157    ) -> Result<(), StorageError>;
158
159    /// Get the IR for a file by its path within a branch.
160    fn get_by_path(
161        &self,
162        branch_id: &BranchId,
163        file_path: &str,
164    ) -> Result<ProjectFile, StorageError>;
165
166    /// Get all file IR records for the given branch.
167    fn get_by_branch(&self, branch_id: &BranchId) -> Result<Vec<ProjectFile>, StorageError>;
168
169    /// Get all `(file_path, content_hash)` pairs for a branch.
170    ///
171    /// This is more efficient than [`get_by_branch`](Self::get_by_branch) when you only need
172    /// path + hash for incremental comparison (avoids deserializing the full IR).
173    fn get_file_hashes_by_branch(
174        &self,
175        branch_id: &BranchId,
176    ) -> Result<HashMap<String, String>, StorageError>;
177
178    /// Delete the IR record for a file path within a branch.
179    fn delete_by_path(&self, branch_id: &BranchId, file_path: &str) -> Result<(), StorageError>;
180
181    /// Delete the `files_ir` row **and** every matching `symbol_definitions` /
182    /// `symbol_imports` row for `(branch_id, file_path)` in a single
183    /// transaction.  Pairs with [`Self::upsert_with_symbol_index`] so the
184    /// watcher / scanner have one atomic write path for both add/modify and
185    /// delete — readers cannot observe `files_ir` gone while symbol-index
186    /// rows linger (or vice versa).
187    ///
188    /// Returns [`StorageError::NotFound`] if no `files_ir` row matched; the
189    /// symbol-index DELETEs are still attempted inside the same transaction
190    /// (orphan symbol rows from an earlier non-atomic write are cleaned up).
191    fn delete_with_symbol_index(
192        &self,
193        branch_id: &BranchId,
194        file_path: &str,
195    ) -> Result<(), StorageError>;
196
197    /// Check whether the stored content hash matches the given hash.
198    /// Returns `true` if a record exists and the hash matches, `false` otherwise.
199    fn check_content_hash(
200        &self,
201        branch_id: &BranchId,
202        file_path: &str,
203        content_hash: &str,
204    ) -> Result<bool, StorageError>;
205
206    /// Get all `(file_path, last_commit_date)` pairs for a branch.
207    ///
208    /// Returns a map of file paths to their most recent git commit timestamps.
209    /// Files without a recorded date are included with `None`.
210    fn get_file_dates_by_branch(
211        &self,
212        branch_id: &BranchId,
213    ) -> Result<HashMap<String, Option<i64>>, StorageError>;
214
215    /// Update `convention_compliance_count` for multiple files in a single
216    /// transaction.
217    ///
218    /// `counts` maps `file_path` → compliance count (number of
219    /// `follows_convention == true` findings for that file).
220    fn update_convention_compliance_counts(
221        &self,
222        branch_id: &BranchId,
223        counts: &HashMap<String, u32>,
224    ) -> Result<(), StorageError>;
225}
226
227/// Persistence operations for branch management.
228///
229/// Branch snapshots work by copying all nodes, edges, and files_ir rows with a
230/// new `branch_id`. The current branch is tracked in the `metadata` table.
231pub trait BranchRepository {
232    /// Create a snapshot of the source branch under a new branch name.
233    /// Copies all nodes, edges, and files_ir rows in a single transaction.
234    fn create_snapshot(
235        &self,
236        source_branch: &BranchId,
237        new_branch: &BranchId,
238    ) -> Result<(), StorageError>;
239
240    /// Switch the current branch to the given branch.
241    fn switch_branch(&self, branch_id: &BranchId) -> Result<(), StorageError>;
242
243    /// Delete all data associated with the given branch.
244    fn delete_branch(&self, branch_id: &BranchId) -> Result<(), StorageError>;
245
246    /// List all distinct branch IDs present in the database.
247    fn list_branches(&self) -> Result<Vec<BranchId>, StorageError>;
248
249    /// Get the current branch. Returns the branch stored in the metadata table,
250    /// or a default of `"main"` if no current branch has been set.
251    fn get_current_branch(&self) -> Result<BranchId, StorageError>;
252
253    /// Read the last commit SHA recorded for a branch (sentinel for the
254    /// `seshat serve` / `seshat review` startup freshness check).
255    /// Returns `None` if the branch has no recorded commit yet.
256    fn get_last_scanned_commit(&self, branch_id: &BranchId)
257    -> Result<Option<String>, StorageError>;
258
259    /// Record the latest commit SHA for a branch and bump `last_scanned_at`
260    /// to the current Unix time. UPSERTs the `branches` row.
261    fn set_last_scanned_commit(
262        &self,
263        branch_id: &BranchId,
264        commit: &str,
265    ) -> Result<(), StorageError>;
266
267    /// Idempotent `INSERT OR IGNORE` of a branch row, used so freshness
268    /// checks can rely on the sentinel always existing.
269    fn ensure_branch_exists(&self, branch_id: &BranchId) -> Result<(), StorageError>;
270}
271
272/// Persistence operations for [`Decision`]s — user-recorded knowledge
273/// keyed project-wide by `description_hash`.
274pub trait DecisionRepository {
275    /// UPSERT a decision row keyed by `description_hash`.
276    fn upsert(&self, decision: &Decision) -> Result<(), StorageError>;
277
278    /// Look up a single decision by hash.
279    fn get_by_hash(&self, hash: &str) -> Result<Option<Decision>, StorageError>;
280
281    /// Bulk lookup of decisions by a slice of hashes (chunked internally
282    /// at 500 hashes per `IN (...)` SELECT — comfortably under SQLite's
283    /// `SQLITE_MAX_VARIABLE_NUMBER` on either old (999) or new (32766) builds).
284    fn get_by_hashes(&self, hashes: &[&str]) -> Result<HashMap<String, Decision>, StorageError>;
285
286    /// Delete the decision row with the given hash.
287    fn delete(&self, hash: &str) -> Result<(), StorageError>;
288
289    /// Find decisions whose `description_hash` starts with `prefix`.
290    ///
291    /// Used by `seshat decisions forget <prefix>` for the prefix-lookup
292    /// path. Implementations should push the filter down to the index
293    /// (`WHERE description_hash GLOB 'prefix*'`) instead of materialising
294    /// the full table and filtering in Rust — the PK index makes the
295    /// SQL form `O(matching_rows)` rather than `O(total_rows)`.
296    fn find_by_hash_prefix(&self, prefix: &str) -> Result<Vec<Decision>, StorageError>;
297
298    /// Atomically migrate a decision from `old_hash` to the PK carried by
299    /// `new_decision.description_hash`. The two writes happen inside a
300    /// single transaction so a crash between the DELETE and the INSERT
301    /// cannot lose the row.
302    ///
303    /// Use this when a content-derived PK has to follow a content change —
304    /// e.g. `update_decision` rewrites the `description`, so the
305    /// `description_hash` recomputes to a different value and the row's
306    /// identity has to migrate accordingly.
307    ///
308    /// # Errors
309    /// - `StorageError::Sqlite` with a UNIQUE constraint failure if a row
310    ///   already lives at `new_decision.description_hash` — the caller
311    ///   should pre-check and surface a domain-specific error.
312    /// - Other storage errors propagate as usual.
313    fn rekey(&self, old_hash: &str, new_decision: &Decision) -> Result<(), StorageError>;
314
315    /// Count rows with the given `state`.
316    fn count_by_state(&self, state: DecisionState) -> Result<usize, StorageError>;
317
318    /// List all decisions, ordered by `decided_at DESC`.
319    fn list(&self) -> Result<Vec<Decision>, StorageError>;
320
321    /// List decisions filtered by `state`, ordered by `decided_at DESC`.
322    fn list_by_state(&self, state: DecisionState) -> Result<Vec<Decision>, StorageError>;
323}
324
325/// Persistence operations for package registry metadata cache.
326///
327/// Stores categories, keywords, and descriptions fetched from package registries
328/// (crates.io, npm, PyPI) keyed by `(name, registry)`.
329pub trait PackageMetadataRepository {
330    /// Insert or update a package metadata row. Uses `(name, registry)` as the
331    /// natural key — if a row already exists, it is replaced.
332    fn upsert(&self, row: &PackageMetadataRow) -> Result<(), StorageError>;
333
334    /// Get metadata for a package from a specific registry.
335    /// Returns `None` if no cached entry exists.
336    fn get(&self, name: &str, registry: &str) -> Result<Option<PackageMetadataRow>, StorageError>;
337
338    /// Get all cached metadata entries for a specific registry.
339    fn get_by_registry(&self, registry: &str) -> Result<Vec<PackageMetadataRow>, StorageError>;
340
341    /// Delete entries with `fetched_at` older than the given Unix timestamp.
342    /// Returns the number of rows deleted.
343    fn delete_stale(&self, before_timestamp: i64) -> Result<usize, StorageError>;
344}
345
346/// Persistence operations for submodule records.
347///
348/// Tracks git submodules linked to a parent project, each with a dedicated DB.
349pub trait SubmoduleRepository {
350    /// Insert a new submodule record. Returns the full row (with generated `id`
351    /// and timestamps).
352    fn insert(&self, input: &SubmoduleInput) -> Result<SubmoduleRow, StorageError>;
353
354    /// Update an existing submodule by its `relative_path`.
355    fn update(&self, input: &SubmoduleInput) -> Result<(), StorageError>;
356
357    /// Insert or update a submodule record atomically.
358    ///
359    /// Uses `INSERT ... ON CONFLICT(relative_path) DO UPDATE` so the caller
360    /// doesn't need a separate try-update-then-insert pattern.
361    fn upsert(&self, input: &SubmoduleInput) -> Result<(), StorageError>;
362
363    /// Delete a submodule record by its `relative_path`.
364    fn delete(&self, relative_path: &str) -> Result<(), StorageError>;
365
366    /// List all submodules, sorted by `relative_path`.
367    fn list(&self) -> Result<Vec<SubmoduleRow>, StorageError>;
368
369    /// Find a submodule by its mount path relative to the repo root.
370    /// Returns `None` if no record exists for this path.
371    fn find_by_path(&self, relative_path: &str) -> Result<Option<SubmoduleRow>, StorageError>;
372}
373
374/// Persistence operations for code embedding vectors.
375///
376/// Stores per-item (function, type, export) embeddings generated during
377/// `seshat scan` when an embedding provider is configured. When the
378/// `[embedding]` config section is absent, this table remains empty.
379pub trait EmbeddingRepository {
380    /// Insert or update a single embedding.
381    fn upsert(&self, branch_id: &str, input: &EmbeddingInput) -> Result<(), StorageError>;
382
383    /// Insert or update a batch of embeddings in a single transaction.
384    fn upsert_batch(&self, branch_id: &str, inputs: &[EmbeddingInput]) -> Result<(), StorageError>;
385
386    /// Get all embeddings for a branch.
387    fn get_by_branch(&self, branch_id: &str) -> Result<Vec<EmbeddingRow>, StorageError>;
388
389    /// Get embeddings for a specific file within a branch.
390    fn get_by_file(
391        &self,
392        branch_id: &str,
393        file_path: &str,
394    ) -> Result<Vec<EmbeddingRow>, StorageError>;
395
396    /// Delete all embeddings for a specific file within a branch.
397    /// Returns the number of rows deleted.
398    fn delete_by_file(&self, branch_id: &str, file_path: &str) -> Result<usize, StorageError>;
399
400    /// Delete all embeddings for a branch. Returns the number of rows deleted.
401    fn delete_by_branch(&self, branch_id: &str) -> Result<usize, StorageError>;
402
403    /// Count embeddings for a branch.
404    fn count_by_branch(&self, branch_id: &str) -> Result<usize, StorageError>;
405
406    /// Get all (file_path, item_name, item_kind) keys stored for a branch.
407    fn get_stored_keys(
408        &self,
409        branch_id: &str,
410    ) -> Result<Vec<(String, String, String)>, StorageError>;
411
412    /// Delete embedding rows identified by the given composite keys.
413    ///
414    /// Deletes in batches of 100 per transaction. Returns total rows deleted.
415    fn delete_stale(
416        &self,
417        branch_id: &str,
418        stale_keys: &[(String, String, String)],
419    ) -> Result<usize, StorageError>;
420}
421
422/// Persistence operations for the per-symbol index (V13).
423///
424/// `symbol_definitions` and `symbol_imports` are the back-end for
425/// `query_code_pattern`'s O(log N) name lookup — they replace the previous
426/// scan-every-IR-blob path.  The two tables are updated together so they
427/// stay consistent with `files_ir`: the writer always replaces both halves
428/// for a given `(branch_id, file_path)` in a single transaction.
429pub trait SymbolIndexRepository {
430    /// Replace every symbol-definition and symbol-import row for the given
431    /// `(branch_id, file_path)` with the supplied lists, atomically.
432    ///
433    /// Used by both the full-scan path and the hot-tier watcher.  Idempotent:
434    /// calling with the same inputs twice leaves the same row set behind.
435    fn replace_file(
436        &self,
437        branch_id: &BranchId,
438        file_path: &str,
439        definitions: &[symbol_index_repository::SymbolDefinitionRow],
440        imports: &[symbol_index_repository::SymbolImportRow],
441    ) -> Result<(), StorageError>;
442
443    /// Drop all symbol-definition and symbol-import rows for a deleted file.
444    fn delete_file(&self, branch_id: &BranchId, file_path: &str) -> Result<(), StorageError>;
445
446    /// Drop every symbol-definition and symbol-import row for a branch.
447    /// Used when a branch is wiped (`delete_branch`) or rebuilt from scratch.
448    fn delete_branch(&self, branch_id: &BranchId) -> Result<(), StorageError>;
449
450    /// Count `symbol_definitions` rows for a branch — primarily for tests
451    /// and for the post-migration backfill gate.
452    fn count_definitions(&self, branch_id: &BranchId) -> Result<usize, StorageError>;
453
454    /// Return every `symbol_definitions` row recorded for a single file on a
455    /// branch. Used to anchor a recorded decision to a concrete code snippet
456    /// when the caller supplies a file but no snippet.
457    fn definitions_for_file(
458        &self,
459        branch_id: &BranchId,
460        file_path: &str,
461    ) -> Result<Vec<symbol_index_repository::SymbolDefinitionRow>, StorageError>;
462
463    /// Count `symbol_imports` rows for a branch.
464    fn count_imports(&self, branch_id: &BranchId) -> Result<usize, StorageError>;
465}
466
467/// Persistence operations for per-branch key-value metadata.
468///
469/// Stores per-branch state that must not bleed across branches (e.g.
470/// `workspace_crates`). Rows are keyed by `(branch_id, key)` and FK-cascade
471/// with the parent branch — see migration V14.
472///
473/// This is the per-branch counterpart of [`RepoMetadataRepository`]: prefer
474/// this trait for anything whose value depends on the currently-scanned
475/// branch.
476pub trait BranchMetadataRepository {
477    /// Get the value for `(branch_id, key)`. Returns `None` if the row does
478    /// not exist.
479    fn get(&self, branch_id: &str, key: &str) -> Result<Option<String>, StorageError>;
480
481    /// UPSERT a `(branch_id, key, value)` triple. Overwrites the existing
482    /// value (and refreshes `updated_at`) on conflict.
483    fn set(&self, branch_id: &str, key: &str, value: &str) -> Result<(), StorageError>;
484
485    /// List every `(key, value)` pair stored under `branch_id`, ordered by
486    /// `key`. Returns an empty vec if the branch has no metadata.
487    fn list(&self, branch_id: &str) -> Result<Vec<(String, String)>, StorageError>;
488
489    /// Delete the row identified by `(branch_id, key)`. No-op when the row
490    /// does not exist.
491    fn delete(&self, branch_id: &str, key: &str) -> Result<(), StorageError>;
492}
493
494/// Persistence operations for repo-level key-value metadata.
495///
496/// Stores lightweight metadata like `project_name`, `last_scan_time`,
497/// `file_count`, `convention_count`, etc.
498pub trait RepoMetadataRepository {
499    /// Get the value for a key. Returns `None` if the key does not exist.
500    fn get(&self, key: &str) -> Result<Option<String>, StorageError>;
501
502    /// Set a key-value pair. Overwrites if the key already exists.
503    fn set(&self, key: &str, value: &str) -> Result<(), StorageError>;
504
505    /// Get all key-value pairs, sorted by key.
506    fn get_all(&self) -> Result<Vec<(String, String)>, StorageError>;
507}