seshat_storage/repository/mod.rs
1//! Repository traits and SQLite implementations for Seshat's knowledge graph.
2//!
3//! Each trait defines the persistence API for a single entity type. The SQLite
4//! implementations operate on the shared `Database` handle.
5
6mod branch_metadata_repository;
7mod branch_repository;
8pub mod decision_repository;
9mod edge_repository;
10pub mod embedding_repository;
11mod file_ir_repository;
12mod node_repository;
13mod package_metadata_repository;
14mod repo_metadata_repository;
15mod submodule_repository;
16mod symbol_index_repository;
17
18pub use branch_metadata_repository::SqliteBranchMetadataRepository;
19pub use branch_repository::SqliteBranchRepository;
20pub use decision_repository::{
21 Decision, DecisionNature, DecisionState, DecisionWeight, ExampleEvidence,
22 SqliteDecisionRepository,
23};
24pub use edge_repository::SqliteEdgeRepository;
25pub use embedding_repository::{
26 EmbeddingInput, EmbeddingRow, SqliteEmbeddingRepository, bytes_to_f32s, f32s_to_bytes,
27};
28pub use file_ir_repository::SqliteFileIRRepository;
29pub use node_repository::SqliteNodeRepository;
30pub use package_metadata_repository::{PackageMetadataRow, SqlitePackageMetadataRepository};
31pub use repo_metadata_repository::SqliteRepoMetadataRepository;
32pub use submodule_repository::{SqliteSubmoduleRepository, SubmoduleInput, SubmoduleRow};
33pub use symbol_index_repository::{
34 SqliteSymbolIndexRepository, SymbolDefinitionRow, SymbolImportRow, SymbolKind,
35 extract_definitions, extract_imports,
36};
37
38use std::collections::HashMap;
39use std::sync::{Arc, Mutex, MutexGuard};
40
41use rusqlite::Connection;
42
43use crate::StorageError;
44use seshat_core::{
45 BranchId, Edge, EdgeId, EdgeType, KnowledgeNature, KnowledgeNode, NodeId, ProjectFile,
46};
47
48/// Acquire a lock on a shared `Connection`, mapping poisoned-mutex errors
49/// to [`StorageError`].
50///
51/// All SQLite repository implementations use `Arc<Mutex<Connection>>`.
52/// This helper eliminates the identical `conn()` method from each one.
53pub(crate) fn lock_conn(
54 conn: &Arc<Mutex<Connection>>,
55) -> Result<MutexGuard<'_, Connection>, StorageError> {
56 conn.lock()
57 .map_err(|e| StorageError::QueryError(format!("Failed to acquire connection lock: {e}")))
58}
59
60/// Persistence operations for [`KnowledgeNode`]s.
61pub trait NodeRepository {
62 /// Insert a new node. Returns the node with its assigned ID.
63 fn insert(&self, node: &KnowledgeNode) -> Result<KnowledgeNode, StorageError>;
64
65 /// Get a node by its ID.
66 fn get_by_id(&self, id: NodeId) -> Result<KnowledgeNode, StorageError>;
67
68 /// Find all nodes with the given nature.
69 fn find_by_nature(&self, nature: KnowledgeNature) -> Result<Vec<KnowledgeNode>, StorageError>;
70
71 /// Find all nodes belonging to the given branch.
72 fn find_by_branch(&self, branch_id: &BranchId) -> Result<Vec<KnowledgeNode>, StorageError>;
73
74 /// Update an existing node. The node's `id` field identifies which row to update.
75 fn update(&self, node: &KnowledgeNode) -> Result<(), StorageError>;
76
77 /// Delete a node by its ID.
78 fn delete(&self, id: NodeId) -> Result<(), StorageError>;
79
80 /// Delete all nodes for the given branch. Returns the number of rows deleted.
81 fn delete_by_branch(&self, branch_id: &BranchId) -> Result<usize, StorageError>;
82
83 /// Delete only `fact` nodes for a branch (module structure, documentation).
84 ///
85 /// Preserves `convention`, `observation`, and user-recorded decision nodes.
86 /// Use this instead of `delete_by_branch` when rebuilding module graphs
87 /// to avoid wiping user-confirmed conventions.
88 fn delete_facts_by_branch(&self, branch_id: &BranchId) -> Result<usize, StorageError>;
89
90 /// Delete auto-detected convention nodes for a branch.
91 ///
92 /// Only removes nodes where `ext_data` contains `"source": "auto_detected"`.
93 /// User-recorded decisions (`"source": "user"`) are preserved.
94 /// Returns the number of rows deleted.
95 fn delete_auto_detected_by_branch(&self, branch_id: &BranchId) -> Result<usize, StorageError>;
96
97 /// Find all convention nodes for the given branch.
98 ///
99 /// Returns nodes where `ext_data` contains `"source": "auto_detected"` or
100 /// `"source": "user"` (i.e., convention-related nodes, not module/doc facts).
101 fn find_conventions_by_branch(
102 &self,
103 branch_id: &BranchId,
104 ) -> Result<Vec<KnowledgeNode>, StorageError>;
105}
106
107/// Persistence operations for [`Edge`]s.
108pub trait EdgeRepository {
109 /// Insert a new edge. Returns the edge with its assigned ID.
110 fn insert(&self, edge: &Edge) -> Result<Edge, StorageError>;
111
112 /// Find all edges originating from the given source node.
113 fn find_by_source(&self, source_id: NodeId) -> Result<Vec<Edge>, StorageError>;
114
115 /// Find all edges targeting the given node.
116 fn find_by_target(&self, target_id: NodeId) -> Result<Vec<Edge>, StorageError>;
117
118 /// Find all edges of the given type.
119 fn find_by_type(&self, edge_type: EdgeType) -> Result<Vec<Edge>, StorageError>;
120
121 /// Delete an edge by its ID.
122 fn delete(&self, id: EdgeId) -> Result<(), StorageError>;
123
124 /// Delete all edges for the given branch. Returns the number of rows deleted.
125 fn delete_by_branch(&self, branch_id: &BranchId) -> Result<usize, StorageError>;
126}
127
128/// Persistence operations for file IR records (parsed source file cache).
129pub trait FileIRRepository {
130 /// Insert or update a file IR record. Uses `(branch_id, file_path)` as the
131 /// natural key — if a row already exists, it is replaced.
132 ///
133 /// `last_commit_date` is the Unix timestamp of the most recent git commit
134 /// that touched this file (from `collect_git_file_dates`). `None` means
135 /// the project is not a git repo or the file has no commit history.
136 fn upsert(
137 &self,
138 branch_id: &BranchId,
139 file: &ProjectFile,
140 last_commit_date: Option<i64>,
141 ) -> Result<(), StorageError>;
142
143 /// Insert or update a file IR record **and** replace the matching
144 /// `symbol_definitions` / `symbol_imports` rows in a single transaction.
145 ///
146 /// Either every write commits, or none of them do. Used by the scanner
147 /// and the watcher hot tier so the symbol-index stays consistent with
148 /// `files_ir` even if a write fails partway through.
149 ///
150 /// Definitions and imports are extracted from `file` via
151 /// [`extract_definitions`] / [`extract_imports`].
152 fn upsert_with_symbol_index(
153 &self,
154 branch_id: &BranchId,
155 file: &ProjectFile,
156 last_commit_date: Option<i64>,
157 ) -> Result<(), StorageError>;
158
159 /// Get the IR for a file by its path within a branch.
160 fn get_by_path(
161 &self,
162 branch_id: &BranchId,
163 file_path: &str,
164 ) -> Result<ProjectFile, StorageError>;
165
166 /// Get all file IR records for the given branch.
167 fn get_by_branch(&self, branch_id: &BranchId) -> Result<Vec<ProjectFile>, StorageError>;
168
169 /// Get all `(file_path, content_hash)` pairs for a branch.
170 ///
171 /// This is more efficient than [`get_by_branch`](Self::get_by_branch) when you only need
172 /// path + hash for incremental comparison (avoids deserializing the full IR).
173 fn get_file_hashes_by_branch(
174 &self,
175 branch_id: &BranchId,
176 ) -> Result<HashMap<String, String>, StorageError>;
177
178 /// Delete the IR record for a file path within a branch.
179 fn delete_by_path(&self, branch_id: &BranchId, file_path: &str) -> Result<(), StorageError>;
180
181 /// Delete the `files_ir` row **and** every matching `symbol_definitions` /
182 /// `symbol_imports` row for `(branch_id, file_path)` in a single
183 /// transaction. Pairs with [`Self::upsert_with_symbol_index`] so the
184 /// watcher / scanner have one atomic write path for both add/modify and
185 /// delete — readers cannot observe `files_ir` gone while symbol-index
186 /// rows linger (or vice versa).
187 ///
188 /// Returns [`StorageError::NotFound`] if no `files_ir` row matched; the
189 /// symbol-index DELETEs are still attempted inside the same transaction
190 /// (orphan symbol rows from an earlier non-atomic write are cleaned up).
191 fn delete_with_symbol_index(
192 &self,
193 branch_id: &BranchId,
194 file_path: &str,
195 ) -> Result<(), StorageError>;
196
197 /// Check whether the stored content hash matches the given hash.
198 /// Returns `true` if a record exists and the hash matches, `false` otherwise.
199 fn check_content_hash(
200 &self,
201 branch_id: &BranchId,
202 file_path: &str,
203 content_hash: &str,
204 ) -> Result<bool, StorageError>;
205
206 /// Get all `(file_path, last_commit_date)` pairs for a branch.
207 ///
208 /// Returns a map of file paths to their most recent git commit timestamps.
209 /// Files without a recorded date are included with `None`.
210 fn get_file_dates_by_branch(
211 &self,
212 branch_id: &BranchId,
213 ) -> Result<HashMap<String, Option<i64>>, StorageError>;
214
215 /// Update `convention_compliance_count` for multiple files in a single
216 /// transaction.
217 ///
218 /// `counts` maps `file_path` → compliance count (number of
219 /// `follows_convention == true` findings for that file).
220 fn update_convention_compliance_counts(
221 &self,
222 branch_id: &BranchId,
223 counts: &HashMap<String, u32>,
224 ) -> Result<(), StorageError>;
225}
226
227/// Persistence operations for branch management.
228///
229/// Branch snapshots work by copying all nodes, edges, and files_ir rows with a
230/// new `branch_id`. The current branch is tracked in the `metadata` table.
231pub trait BranchRepository {
232 /// Create a snapshot of the source branch under a new branch name.
233 /// Copies all nodes, edges, and files_ir rows in a single transaction.
234 fn create_snapshot(
235 &self,
236 source_branch: &BranchId,
237 new_branch: &BranchId,
238 ) -> Result<(), StorageError>;
239
240 /// Switch the current branch to the given branch.
241 fn switch_branch(&self, branch_id: &BranchId) -> Result<(), StorageError>;
242
243 /// Delete all data associated with the given branch.
244 fn delete_branch(&self, branch_id: &BranchId) -> Result<(), StorageError>;
245
246 /// List all distinct branch IDs present in the database.
247 fn list_branches(&self) -> Result<Vec<BranchId>, StorageError>;
248
249 /// Get the current branch. Returns the branch stored in the metadata table,
250 /// or a default of `"main"` if no current branch has been set.
251 fn get_current_branch(&self) -> Result<BranchId, StorageError>;
252
253 /// Read the last commit SHA recorded for a branch (sentinel for the
254 /// `seshat serve` / `seshat review` startup freshness check).
255 /// Returns `None` if the branch has no recorded commit yet.
256 fn get_last_scanned_commit(&self, branch_id: &BranchId)
257 -> Result<Option<String>, StorageError>;
258
259 /// Record the latest commit SHA for a branch and bump `last_scanned_at`
260 /// to the current Unix time. UPSERTs the `branches` row.
261 fn set_last_scanned_commit(
262 &self,
263 branch_id: &BranchId,
264 commit: &str,
265 ) -> Result<(), StorageError>;
266
267 /// Idempotent `INSERT OR IGNORE` of a branch row, used so freshness
268 /// checks can rely on the sentinel always existing.
269 fn ensure_branch_exists(&self, branch_id: &BranchId) -> Result<(), StorageError>;
270}
271
272/// Persistence operations for [`Decision`]s — user-recorded knowledge
273/// keyed project-wide by `description_hash`.
274pub trait DecisionRepository {
275 /// UPSERT a decision row keyed by `description_hash`.
276 fn upsert(&self, decision: &Decision) -> Result<(), StorageError>;
277
278 /// Look up a single decision by hash.
279 fn get_by_hash(&self, hash: &str) -> Result<Option<Decision>, StorageError>;
280
281 /// Bulk lookup of decisions by a slice of hashes (chunked internally
282 /// at 500 hashes per `IN (...)` SELECT — comfortably under SQLite's
283 /// `SQLITE_MAX_VARIABLE_NUMBER` on either old (999) or new (32766) builds).
284 fn get_by_hashes(&self, hashes: &[&str]) -> Result<HashMap<String, Decision>, StorageError>;
285
286 /// Delete the decision row with the given hash.
287 fn delete(&self, hash: &str) -> Result<(), StorageError>;
288
289 /// Find decisions whose `description_hash` starts with `prefix`.
290 ///
291 /// Used by `seshat decisions forget <prefix>` for the prefix-lookup
292 /// path. Implementations should push the filter down to the index
293 /// (`WHERE description_hash GLOB 'prefix*'`) instead of materialising
294 /// the full table and filtering in Rust — the PK index makes the
295 /// SQL form `O(matching_rows)` rather than `O(total_rows)`.
296 fn find_by_hash_prefix(&self, prefix: &str) -> Result<Vec<Decision>, StorageError>;
297
298 /// Atomically migrate a decision from `old_hash` to the PK carried by
299 /// `new_decision.description_hash`. The two writes happen inside a
300 /// single transaction so a crash between the DELETE and the INSERT
301 /// cannot lose the row.
302 ///
303 /// Use this when a content-derived PK has to follow a content change —
304 /// e.g. `update_decision` rewrites the `description`, so the
305 /// `description_hash` recomputes to a different value and the row's
306 /// identity has to migrate accordingly.
307 ///
308 /// # Errors
309 /// - `StorageError::Sqlite` with a UNIQUE constraint failure if a row
310 /// already lives at `new_decision.description_hash` — the caller
311 /// should pre-check and surface a domain-specific error.
312 /// - Other storage errors propagate as usual.
313 fn rekey(&self, old_hash: &str, new_decision: &Decision) -> Result<(), StorageError>;
314
315 /// Count rows with the given `state`.
316 fn count_by_state(&self, state: DecisionState) -> Result<usize, StorageError>;
317
318 /// List all decisions, ordered by `decided_at DESC`.
319 fn list(&self) -> Result<Vec<Decision>, StorageError>;
320
321 /// List decisions filtered by `state`, ordered by `decided_at DESC`.
322 fn list_by_state(&self, state: DecisionState) -> Result<Vec<Decision>, StorageError>;
323}
324
325/// Persistence operations for package registry metadata cache.
326///
327/// Stores categories, keywords, and descriptions fetched from package registries
328/// (crates.io, npm, PyPI) keyed by `(name, registry)`.
329pub trait PackageMetadataRepository {
330 /// Insert or update a package metadata row. Uses `(name, registry)` as the
331 /// natural key — if a row already exists, it is replaced.
332 fn upsert(&self, row: &PackageMetadataRow) -> Result<(), StorageError>;
333
334 /// Get metadata for a package from a specific registry.
335 /// Returns `None` if no cached entry exists.
336 fn get(&self, name: &str, registry: &str) -> Result<Option<PackageMetadataRow>, StorageError>;
337
338 /// Get all cached metadata entries for a specific registry.
339 fn get_by_registry(&self, registry: &str) -> Result<Vec<PackageMetadataRow>, StorageError>;
340
341 /// Delete entries with `fetched_at` older than the given Unix timestamp.
342 /// Returns the number of rows deleted.
343 fn delete_stale(&self, before_timestamp: i64) -> Result<usize, StorageError>;
344}
345
346/// Persistence operations for submodule records.
347///
348/// Tracks git submodules linked to a parent project, each with a dedicated DB.
349pub trait SubmoduleRepository {
350 /// Insert a new submodule record. Returns the full row (with generated `id`
351 /// and timestamps).
352 fn insert(&self, input: &SubmoduleInput) -> Result<SubmoduleRow, StorageError>;
353
354 /// Update an existing submodule by its `relative_path`.
355 fn update(&self, input: &SubmoduleInput) -> Result<(), StorageError>;
356
357 /// Insert or update a submodule record atomically.
358 ///
359 /// Uses `INSERT ... ON CONFLICT(relative_path) DO UPDATE` so the caller
360 /// doesn't need a separate try-update-then-insert pattern.
361 fn upsert(&self, input: &SubmoduleInput) -> Result<(), StorageError>;
362
363 /// Delete a submodule record by its `relative_path`.
364 fn delete(&self, relative_path: &str) -> Result<(), StorageError>;
365
366 /// List all submodules, sorted by `relative_path`.
367 fn list(&self) -> Result<Vec<SubmoduleRow>, StorageError>;
368
369 /// Find a submodule by its mount path relative to the repo root.
370 /// Returns `None` if no record exists for this path.
371 fn find_by_path(&self, relative_path: &str) -> Result<Option<SubmoduleRow>, StorageError>;
372}
373
374/// Persistence operations for code embedding vectors.
375///
376/// Stores per-item (function, type, export) embeddings generated during
377/// `seshat scan` when an embedding provider is configured. When the
378/// `[embedding]` config section is absent, this table remains empty.
379pub trait EmbeddingRepository {
380 /// Insert or update a single embedding.
381 fn upsert(&self, branch_id: &str, input: &EmbeddingInput) -> Result<(), StorageError>;
382
383 /// Insert or update a batch of embeddings in a single transaction.
384 fn upsert_batch(&self, branch_id: &str, inputs: &[EmbeddingInput]) -> Result<(), StorageError>;
385
386 /// Get all embeddings for a branch.
387 fn get_by_branch(&self, branch_id: &str) -> Result<Vec<EmbeddingRow>, StorageError>;
388
389 /// Get embeddings for a specific file within a branch.
390 fn get_by_file(
391 &self,
392 branch_id: &str,
393 file_path: &str,
394 ) -> Result<Vec<EmbeddingRow>, StorageError>;
395
396 /// Delete all embeddings for a specific file within a branch.
397 /// Returns the number of rows deleted.
398 fn delete_by_file(&self, branch_id: &str, file_path: &str) -> Result<usize, StorageError>;
399
400 /// Delete all embeddings for a branch. Returns the number of rows deleted.
401 fn delete_by_branch(&self, branch_id: &str) -> Result<usize, StorageError>;
402
403 /// Count embeddings for a branch.
404 fn count_by_branch(&self, branch_id: &str) -> Result<usize, StorageError>;
405
406 /// Get all (file_path, item_name, item_kind) keys stored for a branch.
407 fn get_stored_keys(
408 &self,
409 branch_id: &str,
410 ) -> Result<Vec<(String, String, String)>, StorageError>;
411
412 /// Delete embedding rows identified by the given composite keys.
413 ///
414 /// Deletes in batches of 100 per transaction. Returns total rows deleted.
415 fn delete_stale(
416 &self,
417 branch_id: &str,
418 stale_keys: &[(String, String, String)],
419 ) -> Result<usize, StorageError>;
420}
421
422/// Persistence operations for the per-symbol index (V13).
423///
424/// `symbol_definitions` and `symbol_imports` are the back-end for
425/// `query_code_pattern`'s O(log N) name lookup — they replace the previous
426/// scan-every-IR-blob path. The two tables are updated together so they
427/// stay consistent with `files_ir`: the writer always replaces both halves
428/// for a given `(branch_id, file_path)` in a single transaction.
429pub trait SymbolIndexRepository {
430 /// Replace every symbol-definition and symbol-import row for the given
431 /// `(branch_id, file_path)` with the supplied lists, atomically.
432 ///
433 /// Used by both the full-scan path and the hot-tier watcher. Idempotent:
434 /// calling with the same inputs twice leaves the same row set behind.
435 fn replace_file(
436 &self,
437 branch_id: &BranchId,
438 file_path: &str,
439 definitions: &[symbol_index_repository::SymbolDefinitionRow],
440 imports: &[symbol_index_repository::SymbolImportRow],
441 ) -> Result<(), StorageError>;
442
443 /// Drop all symbol-definition and symbol-import rows for a deleted file.
444 fn delete_file(&self, branch_id: &BranchId, file_path: &str) -> Result<(), StorageError>;
445
446 /// Drop every symbol-definition and symbol-import row for a branch.
447 /// Used when a branch is wiped (`delete_branch`) or rebuilt from scratch.
448 fn delete_branch(&self, branch_id: &BranchId) -> Result<(), StorageError>;
449
450 /// Count `symbol_definitions` rows for a branch — primarily for tests
451 /// and for the post-migration backfill gate.
452 fn count_definitions(&self, branch_id: &BranchId) -> Result<usize, StorageError>;
453
454 /// Return every `symbol_definitions` row recorded for a single file on a
455 /// branch. Used to anchor a recorded decision to a concrete code snippet
456 /// when the caller supplies a file but no snippet.
457 fn definitions_for_file(
458 &self,
459 branch_id: &BranchId,
460 file_path: &str,
461 ) -> Result<Vec<symbol_index_repository::SymbolDefinitionRow>, StorageError>;
462
463 /// Count `symbol_imports` rows for a branch.
464 fn count_imports(&self, branch_id: &BranchId) -> Result<usize, StorageError>;
465}
466
467/// Persistence operations for per-branch key-value metadata.
468///
469/// Stores per-branch state that must not bleed across branches (e.g.
470/// `workspace_crates`). Rows are keyed by `(branch_id, key)` and FK-cascade
471/// with the parent branch — see migration V14.
472///
473/// This is the per-branch counterpart of [`RepoMetadataRepository`]: prefer
474/// this trait for anything whose value depends on the currently-scanned
475/// branch.
476pub trait BranchMetadataRepository {
477 /// Get the value for `(branch_id, key)`. Returns `None` if the row does
478 /// not exist.
479 fn get(&self, branch_id: &str, key: &str) -> Result<Option<String>, StorageError>;
480
481 /// UPSERT a `(branch_id, key, value)` triple. Overwrites the existing
482 /// value (and refreshes `updated_at`) on conflict.
483 fn set(&self, branch_id: &str, key: &str, value: &str) -> Result<(), StorageError>;
484
485 /// List every `(key, value)` pair stored under `branch_id`, ordered by
486 /// `key`. Returns an empty vec if the branch has no metadata.
487 fn list(&self, branch_id: &str) -> Result<Vec<(String, String)>, StorageError>;
488
489 /// Delete the row identified by `(branch_id, key)`. No-op when the row
490 /// does not exist.
491 fn delete(&self, branch_id: &str, key: &str) -> Result<(), StorageError>;
492}
493
494/// Persistence operations for repo-level key-value metadata.
495///
496/// Stores lightweight metadata like `project_name`, `last_scan_time`,
497/// `file_count`, `convention_count`, etc.
498pub trait RepoMetadataRepository {
499 /// Get the value for a key. Returns `None` if the key does not exist.
500 fn get(&self, key: &str) -> Result<Option<String>, StorageError>;
501
502 /// Set a key-value pair. Overwrites if the key already exists.
503 fn set(&self, key: &str, value: &str) -> Result<(), StorageError>;
504
505 /// Get all key-value pairs, sorted by key.
506 fn get_all(&self) -> Result<Vec<(String, String)>, StorageError>;
507}