Skip to main content

harn_hostlib/code_index/
snapshot.rs

//! Persistent on-disk snapshot of the workspace index.
//!
//! Mirrors the Swift `CodeIndexSnapshot` struct. v1 uses a single JSON
//! file at `.burin/index/snapshot.json`. The shape is intentionally
//! tolerant of missing sections so we can extend it in place without a
//! version bump (e.g. add a new sub-index without invalidating earlier
//! snapshots).
//!
//! The snapshot is the recovery primitive. On daemon startup, the
//! embedder restores from the snapshot if one exists, then calls
//! [`super::IndexState::reap_after_recovery`] to drop stale agent
//! records and locks before serving any traffic.
13
14use std::collections::HashMap;
15use std::path::{Path, PathBuf};
16
17use serde::{Deserialize, Serialize};
18
19use super::agents::{AgentRegistry, RegistryConfig, SerializedRegistry};
20use super::file_table::{FileId, IndexedFile, IndexedSymbol};
21use super::graph::DepGraph;
22use super::trigram::TrigramIndex;
23use super::versions::VersionLog;
24use super::words::WordIndex;
25use super::IndexState;
26
/// Version number stamped into every snapshot this module writes.
///
/// Only incompatible (non-additive) layout changes bump it; purely
/// additive changes keep the current value.
pub const SNAPSHOT_FORMAT_VERSION: u32 = 1;
30
31/// On-disk metadata header. Small and cheap to read so embedders can
32/// peek at a snapshot without parsing the whole thing.
33#[derive(Debug, Clone, Serialize, Deserialize)]
34pub struct SnapshotMeta {
35    /// Format version. Must equal [`SNAPSHOT_FORMAT_VERSION`] for now;
36    /// older snapshots are dropped.
37    pub format_version: u32,
38    /// Workspace root the snapshot was captured against.
39    pub workspace_root: String,
40    /// `HEAD` SHA of the workspace at snapshot time, when known.
41    pub git_head: Option<String>,
42    /// Wall-clock ms since the Unix epoch when the snapshot was written.
43    pub indexed_at_ms: i64,
44    /// Total number of files captured.
45    pub file_count: usize,
46}
47
48/// Serialised form of one outline symbol. Mirrors [`IndexedSymbol`].
49#[derive(Debug, Clone, Serialize, Deserialize)]
50pub struct SnapshotSymbol {
51    /// Symbol name.
52    pub name: String,
53    /// Language-specific kind tag.
54    pub kind: String,
55    /// 1-based start line.
56    pub start_line: u32,
57    /// 1-based inclusive end line.
58    pub end_line: u32,
59    /// Single-line preview of the declaration.
60    pub signature: String,
61}
62
63/// Serialised form of one file row. Mirrors [`IndexedFile`].
64#[derive(Debug, Clone, Serialize, Deserialize)]
65pub struct SnapshotFile {
66    /// Stable file identifier.
67    pub id: FileId,
68    /// Workspace-relative path with `/` separators.
69    pub relative_path: String,
70    /// Best-effort language tag.
71    pub language: String,
72    /// Size in bytes.
73    pub size_bytes: u64,
74    /// Newline-delimited line count.
75    pub line_count: u32,
76    /// FNV-1a 64-bit content hash.
77    pub content_hash: u64,
78    /// Last-modified time (ms since epoch).
79    pub mtime_ms: i64,
80    /// Outline symbols.
81    pub symbols: Vec<SnapshotSymbol>,
82    /// Raw import statement strings.
83    pub imports: Vec<String>,
84}
85
86/// One trigram posting entry: `trigram → list of file ids`.
87#[derive(Debug, Clone, Serialize, Deserialize)]
88pub struct TrigramPosting {
89    /// Packed trigram key.
90    pub trigram: u32,
91    /// Files containing this trigram.
92    pub files: Vec<FileId>,
93}
94
95/// One word posting entry: `word → list of (file, line) pairs`.
96#[derive(Debug, Clone, Serialize, Deserialize)]
97pub struct WordPosting {
98    /// Identifier-shaped token.
99    pub word: String,
100    /// All occurrences as `(file_id, line)` pairs.
101    pub hits: Vec<(FileId, u32)>,
102}
103
104/// One dep-graph row: `file → resolved imports + unresolved raw strings`.
105#[derive(Debug, Clone, Serialize, Deserialize)]
106pub struct DepRow {
107    /// Source file id.
108    pub from: FileId,
109    /// Resolved target file ids.
110    pub to: Vec<FileId>,
111    /// Raw import strings the resolver couldn't map back to a file.
112    #[serde(default)]
113    pub unresolved: Vec<String>,
114}
115
116/// Persistent on-disk form of the entire workspace index.
117#[derive(Debug, Clone, Serialize, Deserialize)]
118pub struct CodeIndexSnapshot {
119    /// Snapshot header.
120    pub meta: SnapshotMeta,
121    /// Next file id to hand out — preserved so reused ids don't collide
122    /// with historical version-log entries.
123    pub next_file_id: FileId,
124    /// File table.
125    pub files: Vec<SnapshotFile>,
126    /// Trigram postings.
127    pub trigrams: Vec<TrigramPosting>,
128    /// Word postings.
129    pub words: Vec<WordPosting>,
130    /// Dep graph rows.
131    pub deps: Vec<DepRow>,
132    /// Append-only version log.
133    pub versions: VersionLog,
134    /// Live agents at snapshot time.
135    pub agents: SerializedRegistry,
136}
137
138impl CodeIndexSnapshot {
139    /// Path the snapshot lives at, relative to the workspace root.
140    pub fn path_for(workspace_root: &Path) -> PathBuf {
141        workspace_root
142            .join(".burin")
143            .join("index")
144            .join("snapshot.json")
145    }
146
147    /// Save the snapshot atomically (`tmp` file + rename) so partial
148    /// writes never leave a half-encoded JSON blob on disk.
149    pub fn save(&self, workspace_root: &Path) -> std::io::Result<()> {
150        let path = Self::path_for(workspace_root);
151        if let Some(parent) = path.parent() {
152            std::fs::create_dir_all(parent)?;
153        }
154        let tmp = path.with_extension("json.tmp");
155        let bytes = serde_json::to_vec(self).map_err(std::io::Error::other)?;
156        std::fs::write(&tmp, bytes)?;
157        std::fs::rename(&tmp, &path)?;
158        Ok(())
159    }
160
161    /// Try to load the snapshot from `workspace_root/.burin/index/snapshot.json`.
162    /// Returns `Ok(None)` when no snapshot exists yet; returns `Err` when
163    /// one exists but couldn't be parsed (caller is expected to fall back
164    /// to `build_from_root`).
165    pub fn load(workspace_root: &Path) -> std::io::Result<Option<Self>> {
166        let path = Self::path_for(workspace_root);
167        if !path.exists() {
168            return Ok(None);
169        }
170        let bytes = std::fs::read(&path)?;
171        let snap: CodeIndexSnapshot =
172            serde_json::from_slice(&bytes).map_err(std::io::Error::other)?;
173        if snap.meta.format_version != SNAPSHOT_FORMAT_VERSION {
174            return Ok(None);
175        }
176        Ok(Some(snap))
177    }
178}
179
180impl IndexState {
181    /// Capture the current state as a [`CodeIndexSnapshot`].
182    pub fn snapshot(&self) -> CodeIndexSnapshot {
183        let files: Vec<SnapshotFile> = self
184            .files
185            .values()
186            .map(|f| SnapshotFile {
187                id: f.id,
188                relative_path: f.relative_path.clone(),
189                language: f.language.clone(),
190                size_bytes: f.size_bytes,
191                line_count: f.line_count,
192                content_hash: f.content_hash,
193                mtime_ms: f.mtime_ms,
194                symbols: f
195                    .symbols
196                    .iter()
197                    .map(|s| SnapshotSymbol {
198                        name: s.name.clone(),
199                        kind: s.kind.clone(),
200                        start_line: s.start_line,
201                        end_line: s.end_line,
202                        signature: s.signature.clone(),
203                    })
204                    .collect(),
205                imports: f.imports.clone(),
206            })
207            .collect();
208
209        let trigrams = self.trigrams.snapshot_postings();
210        let words = self.words.snapshot_postings();
211        let deps = self.deps.snapshot_rows();
212
213        CodeIndexSnapshot {
214            meta: SnapshotMeta {
215                format_version: SNAPSHOT_FORMAT_VERSION,
216                workspace_root: self.root.to_string_lossy().into_owned(),
217                git_head: self.git_head.clone(),
218                indexed_at_ms: self.last_built_unix_ms,
219                file_count: self.files.len(),
220            },
221            next_file_id: self.next_file_id_internal(),
222            files,
223            trigrams,
224            words,
225            deps,
226            versions: self.versions.clone(),
227            agents: self.agents.snapshot(),
228        }
229    }
230
231    /// Restore an [`IndexState`] from a snapshot. The workspace root is
232    /// taken from the snapshot meta; callers can then call
233    /// [`Self::reap_after_recovery`] to drop stale agent records.
234    pub fn from_snapshot(snap: CodeIndexSnapshot) -> Self {
235        let root = PathBuf::from(snap.meta.workspace_root);
236        let mut files: HashMap<FileId, IndexedFile> = HashMap::with_capacity(snap.files.len());
237        let mut path_to_id: HashMap<String, FileId> = HashMap::with_capacity(snap.files.len());
238        for f in snap.files {
239            let indexed = IndexedFile {
240                id: f.id,
241                relative_path: f.relative_path.clone(),
242                language: f.language,
243                size_bytes: f.size_bytes,
244                line_count: f.line_count,
245                content_hash: f.content_hash,
246                mtime_ms: f.mtime_ms,
247                symbols: f
248                    .symbols
249                    .into_iter()
250                    .map(|s| IndexedSymbol {
251                        name: s.name,
252                        kind: s.kind,
253                        start_line: s.start_line,
254                        end_line: s.end_line,
255                        signature: s.signature,
256                    })
257                    .collect(),
258                imports: f.imports,
259            };
260            path_to_id.insert(f.relative_path, f.id);
261            files.insert(f.id, indexed);
262        }
263        let trigrams = TrigramIndex::from_postings(snap.trigrams);
264        let words = WordIndex::from_postings(snap.words);
265        let deps = DepGraph::from_rows(snap.deps);
266        let agents = AgentRegistry::from_snapshot(RegistryConfig::default(), snap.agents);
267
268        let mut state = Self::empty(root);
269        state.files = files;
270        state.path_to_id = path_to_id;
271        state.trigrams = trigrams;
272        state.words = words;
273        state.deps = deps;
274        state.versions = snap.versions;
275        state.agents = agents;
276        state.last_built_unix_ms = snap.meta.indexed_at_ms;
277        state.git_head = snap.meta.git_head;
278        state.set_next_file_id(snap.next_file_id);
279        state
280    }
281
282    /// Drop stale agent records and release any locks held by agents
283    /// whose `last_seen_ms` is older than the configured timeout. Called
284    /// at startup after restoring from a snapshot.
285    pub fn reap_after_recovery(&mut self, now_ms: i64) {
286        self.agents.reap(now_ms);
287    }
288}