seshat-storage 0.7.0

SQLite storage, migrations, and repository implementations for Seshat
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
//! Repository traits and SQLite implementations for Seshat's knowledge graph.
//!
//! Each trait defines the persistence API for a single entity type. The SQLite
//! implementations operate on the shared `Database` handle.

mod branch_metadata_repository;
mod branch_repository;
pub mod decision_repository;
mod edge_repository;
pub mod embedding_repository;
mod file_ir_repository;
mod node_repository;
mod package_metadata_repository;
mod repo_metadata_repository;
mod submodule_repository;
mod symbol_index_repository;

pub use branch_metadata_repository::SqliteBranchMetadataRepository;
pub use branch_repository::SqliteBranchRepository;
pub use decision_repository::{
    Decision, DecisionNature, DecisionState, DecisionWeight, ExampleEvidence,
    SqliteDecisionRepository,
};
pub use edge_repository::SqliteEdgeRepository;
pub use embedding_repository::{
    EmbeddingInput, EmbeddingRow, SqliteEmbeddingRepository, bytes_to_f32s, f32s_to_bytes,
};
pub use file_ir_repository::SqliteFileIRRepository;
pub use node_repository::SqliteNodeRepository;
pub use package_metadata_repository::{PackageMetadataRow, SqlitePackageMetadataRepository};
pub use repo_metadata_repository::SqliteRepoMetadataRepository;
pub use submodule_repository::{SqliteSubmoduleRepository, SubmoduleInput, SubmoduleRow};
pub use symbol_index_repository::{
    SqliteSymbolIndexRepository, SymbolDefinitionRow, SymbolImportRow, SymbolKind,
    extract_definitions, extract_imports,
};

use std::collections::HashMap;
use std::sync::{Arc, Mutex, MutexGuard};

use rusqlite::Connection;

use crate::StorageError;
use seshat_core::{
    BranchId, Edge, EdgeId, EdgeType, KnowledgeNature, KnowledgeNode, NodeId, ProjectFile,
};

/// Lock the shared SQLite [`Connection`] and return its guard.
///
/// The SQLite repositories in this module share one connection as
/// `Arc<Mutex<Connection>>`; routing every access through this helper keeps
/// them from each carrying an identical private `conn()` method.
///
/// # Errors
///
/// Returns [`StorageError::QueryError`] when the mutex is poisoned. The
/// poison error's `Display` text is embedded in the message.
pub(crate) fn lock_conn(
    conn: &Arc<Mutex<Connection>>,
) -> Result<MutexGuard<'_, Connection>, StorageError> {
    match conn.lock() {
        Ok(guard) => Ok(guard),
        Err(poisoned) => Err(StorageError::QueryError(format!(
            "Failed to acquire connection lock: {poisoned}"
        ))),
    }
}

/// Persistence operations for [`KnowledgeNode`]s.
///
/// Every method takes `&self` and reports failures as [`StorageError`].
/// Beyond single-row CRUD, the trait offers three branch-scoped bulk deletes
/// with different scopes: [`delete_by_branch`](Self::delete_by_branch) (all
/// nodes), [`delete_facts_by_branch`](Self::delete_facts_by_branch) (`fact`
/// nodes only) and
/// [`delete_auto_detected_by_branch`](Self::delete_auto_detected_by_branch)
/// (auto-detected conventions only).
pub trait NodeRepository {
    /// Insert a new node. Returns the node with its assigned ID.
    ///
    /// `node` is only borrowed; the returned value is a separate
    /// [`KnowledgeNode`] carrying the assigned ID.
    fn insert(&self, node: &KnowledgeNode) -> Result<KnowledgeNode, StorageError>;

    /// Get a node by its ID.
    ///
    /// The return type has no `Option`, so an unknown `id` has to surface as
    /// an `Err` — presumably `StorageError::NotFound`; confirm against the
    /// implementation.
    fn get_by_id(&self, id: NodeId) -> Result<KnowledgeNode, StorageError>;

    /// Find all nodes with the given nature.
    fn find_by_nature(&self, nature: KnowledgeNature) -> Result<Vec<KnowledgeNode>, StorageError>;

    /// Find all nodes belonging to the given branch.
    fn find_by_branch(&self, branch_id: &BranchId) -> Result<Vec<KnowledgeNode>, StorageError>;

    /// Update an existing node. The node's `id` field identifies which row to update.
    ///
    /// NOTE(review): what happens when no row carries that `id` is not
    /// specified at the trait level — check the implementation.
    fn update(&self, node: &KnowledgeNode) -> Result<(), StorageError>;

    /// Delete a node by its ID.
    fn delete(&self, id: NodeId) -> Result<(), StorageError>;

    /// Delete all nodes for the given branch. Returns the number of rows deleted.
    ///
    /// This is the broadest delete: it makes no distinction between node
    /// kinds. Prefer
    /// [`delete_facts_by_branch`](Self::delete_facts_by_branch) when
    /// user-confirmed conventions must survive.
    fn delete_by_branch(&self, branch_id: &BranchId) -> Result<usize, StorageError>;

    /// Delete only `fact` nodes for a branch (module structure, documentation).
    ///
    /// Preserves `convention`, `observation`, and user-recorded decision nodes.
    /// Use this instead of `delete_by_branch` when rebuilding module graphs
    /// to avoid wiping user-confirmed conventions.
    /// Returns the number of rows deleted.
    fn delete_facts_by_branch(&self, branch_id: &BranchId) -> Result<usize, StorageError>;

    /// Delete auto-detected convention nodes for a branch.
    ///
    /// Only removes nodes where `ext_data` contains `"source": "auto_detected"`.
    /// User-recorded decisions (`"source": "user"`) are preserved.
    /// Returns the number of rows deleted.
    fn delete_auto_detected_by_branch(&self, branch_id: &BranchId) -> Result<usize, StorageError>;

    /// Find all convention nodes for the given branch.
    ///
    /// Returns nodes where `ext_data` contains `"source": "auto_detected"` or
    /// `"source": "user"` (i.e., convention-related nodes, not module/doc facts).
    /// Both auto-detected and user-recorded entries are therefore included.
    fn find_conventions_by_branch(
        &self,
        branch_id: &BranchId,
    ) -> Result<Vec<KnowledgeNode>, StorageError>;
}

/// Persistence operations for [`Edge`]s.
///
/// Edges can be looked up by either endpoint ([`NodeId`]) or by
/// [`EdgeType`]. Every method takes `&self` and reports failures as
/// [`StorageError`].
pub trait EdgeRepository {
    /// Insert a new edge. Returns the edge with its assigned ID.
    ///
    /// `edge` is only borrowed; the returned value is a separate [`Edge`]
    /// carrying the assigned ID.
    fn insert(&self, edge: &Edge) -> Result<Edge, StorageError>;

    /// Find all edges originating from the given source node.
    fn find_by_source(&self, source_id: NodeId) -> Result<Vec<Edge>, StorageError>;

    /// Find all edges targeting the given node.
    fn find_by_target(&self, target_id: NodeId) -> Result<Vec<Edge>, StorageError>;

    /// Find all edges of the given type.
    ///
    /// NOTE(review): the signature takes no branch filter, so this
    /// presumably spans every branch — confirm against the implementation.
    fn find_by_type(&self, edge_type: EdgeType) -> Result<Vec<Edge>, StorageError>;

    /// Delete an edge by its ID.
    fn delete(&self, id: EdgeId) -> Result<(), StorageError>;

    /// Delete all edges for the given branch. Returns the number of rows deleted.
    fn delete_by_branch(&self, branch_id: &BranchId) -> Result<usize, StorageError>;
}

/// Persistence operations for file IR records (parsed source file cache).
///
/// Rows are addressed by the natural key `(branch_id, file_path)`. Two
/// write paths exist for both add/modify and delete: a plain one that
/// touches only `files_ir` ([`upsert`](Self::upsert),
/// [`delete_by_path`](Self::delete_by_path)) and an atomic one that also
/// keeps the symbol-index tables in step
/// ([`upsert_with_symbol_index`](Self::upsert_with_symbol_index),
/// [`delete_with_symbol_index`](Self::delete_with_symbol_index)).
pub trait FileIRRepository {
    /// Insert or update a file IR record. Uses `(branch_id, file_path)` as the
    /// natural key — if a row already exists, it is replaced.
    ///
    /// `last_commit_date` is the Unix timestamp of the most recent git commit
    /// that touched this file (from `collect_git_file_dates`). `None` means
    /// the project is not a git repo or the file has no commit history.
    fn upsert(
        &self,
        branch_id: &BranchId,
        file: &ProjectFile,
        last_commit_date: Option<i64>,
    ) -> Result<(), StorageError>;

    /// Insert or update a file IR record **and** replace the matching
    /// `symbol_definitions` / `symbol_imports` rows in a single transaction.
    ///
    /// Either every write commits, or none of them do.  Used by the scanner
    /// and the watcher hot tier so the symbol-index stays consistent with
    /// `files_ir` even if a write fails partway through.
    ///
    /// Definitions and imports are extracted from `file` via
    /// [`extract_definitions`] / [`extract_imports`].
    ///
    /// `last_commit_date` has the same meaning as in
    /// [`upsert`](Self::upsert).
    fn upsert_with_symbol_index(
        &self,
        branch_id: &BranchId,
        file: &ProjectFile,
        last_commit_date: Option<i64>,
    ) -> Result<(), StorageError>;

    /// Get the IR for a file by its path within a branch.
    ///
    /// The return type has no `Option`, so a missing record has to surface
    /// as an `Err` — presumably [`StorageError::NotFound`]; confirm against
    /// the implementation.
    fn get_by_path(
        &self,
        branch_id: &BranchId,
        file_path: &str,
    ) -> Result<ProjectFile, StorageError>;

    /// Get all file IR records for the given branch.
    fn get_by_branch(&self, branch_id: &BranchId) -> Result<Vec<ProjectFile>, StorageError>;

    /// Get all `(file_path, content_hash)` pairs for a branch.
    ///
    /// This is more efficient than [`get_by_branch`](Self::get_by_branch) when you only need
    /// path + hash for incremental comparison (avoids deserializing the full IR).
    /// The returned map is keyed by file path, with the content hash as value.
    fn get_file_hashes_by_branch(
        &self,
        branch_id: &BranchId,
    ) -> Result<HashMap<String, String>, StorageError>;

    /// Delete the IR record for a file path within a branch.
    ///
    /// Only the `files_ir` row is addressed here; use
    /// [`delete_with_symbol_index`](Self::delete_with_symbol_index) when the
    /// symbol-index rows must go in the same transaction.
    fn delete_by_path(&self, branch_id: &BranchId, file_path: &str) -> Result<(), StorageError>;

    /// Delete the `files_ir` row **and** every matching `symbol_definitions` /
    /// `symbol_imports` row for `(branch_id, file_path)` in a single
    /// transaction.  Pairs with [`Self::upsert_with_symbol_index`] so the
    /// watcher / scanner have one atomic write path for both add/modify and
    /// delete — readers cannot observe `files_ir` gone while symbol-index
    /// rows linger (or vice versa).
    ///
    /// # Errors
    ///
    /// Returns [`StorageError::NotFound`] if no `files_ir` row matched; the
    /// symbol-index DELETEs are still attempted inside the same transaction
    /// (orphan symbol rows from an earlier non-atomic write are cleaned up).
    fn delete_with_symbol_index(
        &self,
        branch_id: &BranchId,
        file_path: &str,
    ) -> Result<(), StorageError>;

    /// Check whether the stored content hash matches the given hash.
    /// Returns `true` if a record exists and the hash matches, `false` otherwise.
    ///
    /// A missing record therefore yields `Ok(false)` rather than an error.
    fn check_content_hash(
        &self,
        branch_id: &BranchId,
        file_path: &str,
        content_hash: &str,
    ) -> Result<bool, StorageError>;

    /// Get all `(file_path, last_commit_date)` pairs for a branch.
    ///
    /// Returns a map of file paths to their most recent git commit timestamps.
    /// Files without a recorded date are included with `None`.
    fn get_file_dates_by_branch(
        &self,
        branch_id: &BranchId,
    ) -> Result<HashMap<String, Option<i64>>, StorageError>;

    /// Update `convention_compliance_count` for multiple files in a single
    /// transaction.
    ///
    /// `counts` maps `file_path` → compliance count (number of
    /// `follows_convention == true` findings for that file).
    ///
    /// NOTE(review): how entries in `counts` with no matching `files_ir`
    /// row are handled is not specified at the trait level.
    fn update_convention_compliance_counts(
        &self,
        branch_id: &BranchId,
        counts: &HashMap<String, u32>,
    ) -> Result<(), StorageError>;
}

/// Persistence operations for branch management.
///
/// Branch snapshots work by copying all nodes, edges, and files_ir rows with a
/// new `branch_id`. The current branch is tracked in the `metadata` table.
///
/// The trait also maintains a per-branch "last scanned commit" sentinel
/// (see [`get_last_scanned_commit`](Self::get_last_scanned_commit) and
/// [`set_last_scanned_commit`](Self::set_last_scanned_commit)) that the
/// startup freshness check reads.
pub trait BranchRepository {
    /// Create a snapshot of the source branch under a new branch name.
    /// Copies all nodes, edges, and files_ir rows in a single transaction.
    ///
    /// NOTE(review): behaviour when `new_branch` already holds data is not
    /// specified at the trait level — check the implementation.
    fn create_snapshot(
        &self,
        source_branch: &BranchId,
        new_branch: &BranchId,
    ) -> Result<(), StorageError>;

    /// Switch the current branch to the given branch.
    fn switch_branch(&self, branch_id: &BranchId) -> Result<(), StorageError>;

    /// Delete all data associated with the given branch.
    fn delete_branch(&self, branch_id: &BranchId) -> Result<(), StorageError>;

    /// List all distinct branch IDs present in the database.
    fn list_branches(&self) -> Result<Vec<BranchId>, StorageError>;

    /// Get the current branch. Returns the branch stored in the metadata table,
    /// or a default of `"main"` if no current branch has been set.
    fn get_current_branch(&self) -> Result<BranchId, StorageError>;

    /// Read the last commit SHA recorded for a branch (sentinel for the
    /// `seshat serve` / `seshat review` startup freshness check).
    /// Returns `None` if the branch has no recorded commit yet.
    fn get_last_scanned_commit(&self, branch_id: &BranchId)
    -> Result<Option<String>, StorageError>;

    /// Record the latest commit SHA for a branch and bump `last_scanned_at`
    /// to the current Unix time. UPSERTs the `branches` row.
    ///
    /// Because the row is UPSERTed, the caller does not need to call
    /// [`ensure_branch_exists`](Self::ensure_branch_exists) first.
    fn set_last_scanned_commit(
        &self,
        branch_id: &BranchId,
        commit: &str,
    ) -> Result<(), StorageError>;

    /// Idempotent `INSERT OR IGNORE` of a branch row, used so freshness
    /// checks can rely on the sentinel always existing.
    fn ensure_branch_exists(&self, branch_id: &BranchId) -> Result<(), StorageError>;
}

/// Persistence operations for [`Decision`]s — user-recorded knowledge
/// keyed project-wide by `description_hash`.
///
/// No method takes a branch ID: the hash is the only key. Every method
/// takes `&self` and reports failures as [`StorageError`].
pub trait DecisionRepository {
    /// UPSERT a decision row keyed by `description_hash`.
    fn upsert(&self, decision: &Decision) -> Result<(), StorageError>;

    /// Look up a single decision by hash.
    ///
    /// NOTE(review): the `Option` in the return type presumably means
    /// `Ok(None)` for an unknown hash — confirm against the implementation.
    fn get_by_hash(&self, hash: &str) -> Result<Option<Decision>, StorageError>;

    /// Bulk lookup of decisions by a slice of hashes (chunked internally
    /// at 500 hashes per `IN (...)` SELECT — comfortably under SQLite's
    /// `SQLITE_MAX_VARIABLE_NUMBER` on either old (999) or new (32766) builds).
    ///
    /// NOTE(review): the map is presumably keyed by `description_hash`, with
    /// hashes that match no row simply absent — confirm against the
    /// implementation.
    fn get_by_hashes(&self, hashes: &[&str]) -> Result<HashMap<String, Decision>, StorageError>;

    /// Delete the decision row with the given hash.
    fn delete(&self, hash: &str) -> Result<(), StorageError>;

    /// Find decisions whose `description_hash` starts with `prefix`.
    ///
    /// Used by `seshat decisions forget <prefix>` for the prefix-lookup
    /// path. Implementations should push the filter down to the index
    /// (`WHERE description_hash GLOB 'prefix*'`) instead of materialising
    /// the full table and filtering in Rust — the PK index makes the
    /// SQL form `O(matching_rows)` rather than `O(total_rows)`.
    fn find_by_hash_prefix(&self, prefix: &str) -> Result<Vec<Decision>, StorageError>;

    /// Atomically migrate a decision from `old_hash` to the PK carried by
    /// `new_decision.description_hash`. The two writes happen inside a
    /// single transaction so a crash between the DELETE and the INSERT
    /// cannot lose the row.
    ///
    /// Use this when a content-derived PK has to follow a content change —
    /// e.g. `update_decision` rewrites the `description`, so the
    /// `description_hash` recomputes to a different value and the row's
    /// identity has to migrate accordingly.
    ///
    /// # Errors
    /// - `StorageError::Sqlite` with a UNIQUE constraint failure if a row
    ///   already lives at `new_decision.description_hash` — the caller
    ///   should pre-check and surface a domain-specific error.
    /// - Other storage errors propagate as usual.
    fn rekey(&self, old_hash: &str, new_decision: &Decision) -> Result<(), StorageError>;

    /// Count rows with the given `state`.
    fn count_by_state(&self, state: DecisionState) -> Result<usize, StorageError>;

    /// List all decisions, ordered by `decided_at DESC`.
    fn list(&self) -> Result<Vec<Decision>, StorageError>;

    /// List decisions filtered by `state`, ordered by `decided_at DESC`.
    fn list_by_state(&self, state: DecisionState) -> Result<Vec<Decision>, StorageError>;
}

/// Persistence operations for package registry metadata cache.
///
/// Stores categories, keywords, and descriptions fetched from package registries
/// (crates.io, npm, PyPI) keyed by `(name, registry)`.
///
/// Entries are cache rows: [`delete_stale`](Self::delete_stale) evicts them
/// by their `fetched_at` timestamp.
pub trait PackageMetadataRepository {
    /// Insert or update a package metadata row. Uses `(name, registry)` as the
    /// natural key — if a row already exists, it is replaced.
    fn upsert(&self, row: &PackageMetadataRow) -> Result<(), StorageError>;

    /// Get metadata for a package from a specific registry.
    /// Returns `None` if no cached entry exists.
    fn get(&self, name: &str, registry: &str) -> Result<Option<PackageMetadataRow>, StorageError>;

    /// Get all cached metadata entries for a specific registry.
    fn get_by_registry(&self, registry: &str) -> Result<Vec<PackageMetadataRow>, StorageError>;

    /// Delete entries with `fetched_at` older than the given Unix timestamp.
    /// Returns the number of rows deleted.
    ///
    /// The cut-off applies across all registries; the signature takes no
    /// registry filter.
    fn delete_stale(&self, before_timestamp: i64) -> Result<usize, StorageError>;
}

/// Persistence operations for submodule records.
///
/// Tracks git submodules linked to a parent project, each with a dedicated DB.
///
/// Records are addressed by `relative_path`: it is the conflict target of
/// [`upsert`](Self::upsert) and the lookup key for
/// [`update`](Self::update), [`delete`](Self::delete) and
/// [`find_by_path`](Self::find_by_path).
pub trait SubmoduleRepository {
    /// Insert a new submodule record. Returns the full row (with generated `id`
    /// and timestamps).
    fn insert(&self, input: &SubmoduleInput) -> Result<SubmoduleRow, StorageError>;

    /// Update an existing submodule by its `relative_path`.
    ///
    /// NOTE(review): behaviour when no record has that `relative_path` is
    /// not specified at the trait level — check the implementation.
    fn update(&self, input: &SubmoduleInput) -> Result<(), StorageError>;

    /// Insert or update a submodule record atomically.
    ///
    /// Uses `INSERT ... ON CONFLICT(relative_path) DO UPDATE` so the caller
    /// doesn't need a separate try-update-then-insert pattern.
    ///
    /// Unlike [`insert`](Self::insert), the stored row is not returned.
    fn upsert(&self, input: &SubmoduleInput) -> Result<(), StorageError>;

    /// Delete a submodule record by its `relative_path`.
    fn delete(&self, relative_path: &str) -> Result<(), StorageError>;

    /// List all submodules, sorted by `relative_path`.
    fn list(&self) -> Result<Vec<SubmoduleRow>, StorageError>;

    /// Find a submodule by its mount path relative to the repo root.
    /// Returns `None` if no record exists for this path.
    fn find_by_path(&self, relative_path: &str) -> Result<Option<SubmoduleRow>, StorageError>;
}

/// Persistence operations for code embedding vectors.
///
/// Stores per-item (function, type, export) embeddings generated during
/// `seshat scan` when an embedding provider is configured. When the
/// `[embedding]` config section is absent, this table remains empty.
///
/// Note that branch IDs are passed as plain `&str` here, whereas the node,
/// edge and file-IR traits in this module take `&BranchId`.
pub trait EmbeddingRepository {
    /// Insert or update a single embedding.
    fn upsert(&self, branch_id: &str, input: &EmbeddingInput) -> Result<(), StorageError>;

    /// Insert or update a batch of embeddings in a single transaction.
    fn upsert_batch(&self, branch_id: &str, inputs: &[EmbeddingInput]) -> Result<(), StorageError>;

    /// Get all embeddings for a branch.
    fn get_by_branch(&self, branch_id: &str) -> Result<Vec<EmbeddingRow>, StorageError>;

    /// Get embeddings for a specific file within a branch.
    fn get_by_file(
        &self,
        branch_id: &str,
        file_path: &str,
    ) -> Result<Vec<EmbeddingRow>, StorageError>;

    /// Delete all embeddings for a specific file within a branch.
    /// Returns the number of rows deleted.
    fn delete_by_file(&self, branch_id: &str, file_path: &str) -> Result<usize, StorageError>;

    /// Delete all embeddings for a branch. Returns the number of rows deleted.
    fn delete_by_branch(&self, branch_id: &str) -> Result<usize, StorageError>;

    /// Count embeddings for a branch.
    fn count_by_branch(&self, branch_id: &str) -> Result<usize, StorageError>;

    /// Get all (file_path, item_name, item_kind) keys stored for a branch.
    ///
    /// Each tuple is ordered `(file_path, item_name, item_kind)`.
    fn get_stored_keys(
        &self,
        branch_id: &str,
    ) -> Result<Vec<(String, String, String)>, StorageError>;

    /// Delete embedding rows identified by the given composite keys.
    ///
    /// Deletes in batches of 100 per transaction. Returns total rows deleted.
    ///
    /// Because each batch runs in its own transaction, the delete as a whole
    /// is not atomic across batches.
    ///
    /// NOTE(review): `stale_keys` tuples presumably use the same
    /// `(file_path, item_name, item_kind)` order that
    /// [`get_stored_keys`](Self::get_stored_keys) returns — confirm.
    fn delete_stale(
        &self,
        branch_id: &str,
        stale_keys: &[(String, String, String)],
    ) -> Result<usize, StorageError>;
}

/// Persistence operations for the per-symbol index (V13).
///
/// `symbol_definitions` and `symbol_imports` are the back-end for
/// `query_code_pattern`'s O(log N) name lookup — they replace the previous
/// scan-every-IR-blob path.  The two tables are updated together so they
/// stay consistent with `files_ir`: the writer always replaces both halves
/// for a given `(branch_id, file_path)` in a single transaction.
///
/// Writes are scoped per file ([`replace_file`](Self::replace_file),
/// [`delete_file`](Self::delete_file)) or per branch
/// ([`delete_branch`](Self::delete_branch)); the remaining methods are
/// read-only counts and lookups.
pub trait SymbolIndexRepository {
    /// Replace every symbol-definition and symbol-import row for the given
    /// `(branch_id, file_path)` with the supplied lists, atomically.
    ///
    /// Used by both the full-scan path and the hot-tier watcher.  Idempotent:
    /// calling with the same inputs twice leaves the same row set behind.
    fn replace_file(
        &self,
        branch_id: &BranchId,
        file_path: &str,
        definitions: &[symbol_index_repository::SymbolDefinitionRow],
        imports: &[symbol_index_repository::SymbolImportRow],
    ) -> Result<(), StorageError>;

    /// Drop all symbol-definition and symbol-import rows for a deleted file.
    fn delete_file(&self, branch_id: &BranchId, file_path: &str) -> Result<(), StorageError>;

    /// Drop every symbol-definition and symbol-import row for a branch.
    /// Used when a branch is wiped (`delete_branch`) or rebuilt from scratch.
    fn delete_branch(&self, branch_id: &BranchId) -> Result<(), StorageError>;

    /// Count `symbol_definitions` rows for a branch — primarily for tests
    /// and for the post-migration backfill gate.
    fn count_definitions(&self, branch_id: &BranchId) -> Result<usize, StorageError>;

    /// Return every `symbol_definitions` row recorded for a single file on a
    /// branch. Used to anchor a recorded decision to a concrete code snippet
    /// when the caller supplies a file but no snippet.
    ///
    /// NOTE(review): a file with no recorded definitions presumably yields an
    /// empty vec rather than an error — confirm against the implementation.
    fn definitions_for_file(
        &self,
        branch_id: &BranchId,
        file_path: &str,
    ) -> Result<Vec<symbol_index_repository::SymbolDefinitionRow>, StorageError>;

    /// Count `symbol_imports` rows for a branch.
    fn count_imports(&self, branch_id: &BranchId) -> Result<usize, StorageError>;
}

/// Persistence operations for per-branch key-value metadata.
///
/// Stores per-branch state that must not bleed across branches (e.g.
/// `workspace_crates`). Rows are keyed by `(branch_id, key)` and FK-cascade
/// with the parent branch — see migration V14.
///
/// This is the per-branch counterpart of [`RepoMetadataRepository`]: prefer
/// this trait for anything whose value depends on the currently-scanned
/// branch.
///
/// Branch IDs, keys and values are all passed as plain `&str`.
pub trait BranchMetadataRepository {
    /// Get the value for `(branch_id, key)`. Returns `None` if the row does
    /// not exist.
    fn get(&self, branch_id: &str, key: &str) -> Result<Option<String>, StorageError>;

    /// UPSERT a `(branch_id, key, value)` triple. Overwrites the existing
    /// value (and refreshes `updated_at`) on conflict.
    ///
    /// NOTE(review): since rows FK-cascade with the parent branch, the
    /// branch row presumably has to exist before this call — confirm.
    fn set(&self, branch_id: &str, key: &str, value: &str) -> Result<(), StorageError>;

    /// List every `(key, value)` pair stored under `branch_id`, ordered by
    /// `key`. Returns an empty vec if the branch has no metadata.
    fn list(&self, branch_id: &str) -> Result<Vec<(String, String)>, StorageError>;

    /// Delete the row identified by `(branch_id, key)`. No-op when the row
    /// does not exist.
    fn delete(&self, branch_id: &str, key: &str) -> Result<(), StorageError>;
}

/// Persistence operations for repo-level key-value metadata.
///
/// Stores lightweight metadata like `project_name`, `last_scan_time`,
/// `file_count`, `convention_count`, etc.
///
/// Keys and values are plain strings and are not scoped to a branch. The
/// trait declares no delete operation: entries can only be read or
/// overwritten through it.
pub trait RepoMetadataRepository {
    /// Get the value for a key. Returns `None` if the key does not exist.
    fn get(&self, key: &str) -> Result<Option<String>, StorageError>;

    /// Set a key-value pair. Overwrites if the key already exists.
    fn set(&self, key: &str, value: &str) -> Result<(), StorageError>;

    /// Get all key-value pairs, sorted by key.
    ///
    /// Each tuple is `(key, value)`.
    fn get_all(&self) -> Result<Vec<(String, String)>, StorageError>;
}