Skip to main content

harn_hostlib/code_index/
snapshot.rs

1//! Persistent on-disk snapshot of the workspace index.
2//!
3//! v1 uses a single JSON file at `.burin/index/snapshot.json` for
4//! on-disk compatibility. The shape is intentionally tolerant of missing
5//! sections so we can extend it in place without a version bump (e.g. add
6//! a new sub-index without invalidating earlier snapshots).
7//!
8//! The snapshot is the recovery primitive. On daemon startup, the
9//! embedder restores from the snapshot if one exists, then calls
10//! [`super::IndexState::reap_after_recovery`] to drop stale agent
11//! records and locks before serving any traffic.
12
13use std::collections::HashMap;
14use std::path::{Path, PathBuf};
15
16use serde::{Deserialize, Serialize};
17
18use super::agents::{AgentRegistry, RegistryConfig, SerializedRegistry};
19use super::file_table::{FileId, IndexedFile, IndexedSymbol};
20use super::graph::DepGraph;
21use super::trigram::TrigramIndex;
22use super::versions::VersionLog;
23use super::words::WordIndex;
24use super::IndexState;
25
/// Version tag written into every snapshot header and compared against on
/// load. Increment it only for layout changes that are not purely additive.
pub const SNAPSHOT_FORMAT_VERSION: u32 = 1;
29
30/// On-disk metadata header. Small and cheap to read so embedders can
31/// peek at a snapshot without parsing the whole thing.
32#[derive(Debug, Clone, Serialize, Deserialize)]
33pub struct SnapshotMeta {
34    /// Format version. Must equal [`SNAPSHOT_FORMAT_VERSION`] for now;
35    /// older snapshots are dropped.
36    pub format_version: u32,
37    /// Workspace root the snapshot was captured against.
38    pub workspace_root: String,
39    /// `HEAD` SHA of the workspace at snapshot time, when known.
40    pub git_head: Option<String>,
41    /// Wall-clock ms since the Unix epoch when the snapshot was written.
42    pub indexed_at_ms: i64,
43    /// Total number of files captured.
44    pub file_count: usize,
45}
46
47/// Serialised form of one outline symbol. Mirrors [`IndexedSymbol`].
48#[derive(Debug, Clone, Serialize, Deserialize)]
49pub struct SnapshotSymbol {
50    /// Symbol name.
51    pub name: String,
52    /// Language-specific kind tag.
53    pub kind: String,
54    /// Normalized declaration access level when known.
55    #[serde(default)]
56    pub access_level: Option<String>,
57    /// 1-based start line.
58    pub start_line: u32,
59    /// 1-based inclusive end line.
60    pub end_line: u32,
61    /// Single-line preview of the declaration.
62    pub signature: String,
63}
64
65/// Serialised form of one file row. Mirrors [`IndexedFile`].
66#[derive(Debug, Clone, Serialize, Deserialize)]
67pub struct SnapshotFile {
68    /// Stable file identifier.
69    pub id: FileId,
70    /// Workspace-relative path with `/` separators.
71    pub relative_path: String,
72    /// Best-effort language tag.
73    pub language: String,
74    /// Size in bytes.
75    pub size_bytes: u64,
76    /// Newline-delimited line count.
77    pub line_count: u32,
78    /// FNV-1a 64-bit content hash.
79    pub content_hash: u64,
80    /// Last-modified time (ms since epoch).
81    pub mtime_ms: i64,
82    /// Outline symbols.
83    pub symbols: Vec<SnapshotSymbol>,
84    /// Raw import statement strings.
85    pub imports: Vec<String>,
86}
87
88/// One trigram posting entry: `trigram → list of file ids`.
89#[derive(Debug, Clone, Serialize, Deserialize)]
90pub struct TrigramPosting {
91    /// Packed trigram key.
92    pub trigram: u32,
93    /// Files containing this trigram.
94    pub files: Vec<FileId>,
95}
96
97/// One word posting entry: `word → list of (file, line) pairs`.
98#[derive(Debug, Clone, Serialize, Deserialize)]
99pub struct WordPosting {
100    /// Identifier-shaped token.
101    pub word: String,
102    /// All occurrences as `(file_id, line)` pairs.
103    pub hits: Vec<(FileId, u32)>,
104}
105
106/// One dep-graph row: `file → resolved imports + unresolved raw strings`.
107#[derive(Debug, Clone, Serialize, Deserialize)]
108pub struct DepRow {
109    /// Source file id.
110    pub from: FileId,
111    /// Resolved target file ids.
112    pub to: Vec<FileId>,
113    /// Raw import strings the resolver couldn't map back to a file.
114    #[serde(default)]
115    pub unresolved: Vec<String>,
116}
117
118/// Persistent on-disk form of the entire workspace index.
119#[derive(Debug, Clone, Serialize, Deserialize)]
120pub struct CodeIndexSnapshot {
121    /// Snapshot header.
122    pub meta: SnapshotMeta,
123    /// Next file id to hand out — preserved so reused ids don't collide
124    /// with historical version-log entries.
125    pub next_file_id: FileId,
126    /// File table.
127    pub files: Vec<SnapshotFile>,
128    /// Trigram postings.
129    pub trigrams: Vec<TrigramPosting>,
130    /// Word postings.
131    pub words: Vec<WordPosting>,
132    /// Dep graph rows.
133    pub deps: Vec<DepRow>,
134    /// Append-only version log.
135    pub versions: VersionLog,
136    /// Live agents at snapshot time.
137    pub agents: SerializedRegistry,
138}
139
140impl CodeIndexSnapshot {
141    /// Path the snapshot lives at, relative to the workspace root.
142    pub fn path_for(workspace_root: &Path) -> PathBuf {
143        workspace_root
144            .join(".burin")
145            .join("index")
146            .join("snapshot.json")
147    }
148
149    /// Save the snapshot atomically (`tmp` file + rename) so partial
150    /// writes never leave a half-encoded JSON blob on disk.
151    pub fn save(&self, workspace_root: &Path) -> std::io::Result<()> {
152        let path = Self::path_for(workspace_root);
153        if let Some(parent) = path.parent() {
154            std::fs::create_dir_all(parent)?;
155        }
156        let tmp = path.with_extension("json.tmp");
157        let bytes = serde_json::to_vec(self).map_err(std::io::Error::other)?;
158        std::fs::write(&tmp, bytes)?;
159        std::fs::rename(&tmp, &path)?;
160        Ok(())
161    }
162
163    /// Try to load the snapshot from `workspace_root/.burin/index/snapshot.json`.
164    /// Returns `Ok(None)` when no snapshot exists yet; returns `Err` when
165    /// one exists but couldn't be parsed (caller is expected to fall back
166    /// to `build_from_root`).
167    pub fn load(workspace_root: &Path) -> std::io::Result<Option<Self>> {
168        let path = Self::path_for(workspace_root);
169        if !path.exists() {
170            return Ok(None);
171        }
172        let bytes = std::fs::read(&path)?;
173        let snap: CodeIndexSnapshot =
174            serde_json::from_slice(&bytes).map_err(std::io::Error::other)?;
175        if snap.meta.format_version != SNAPSHOT_FORMAT_VERSION {
176            return Ok(None);
177        }
178        Ok(Some(snap))
179    }
180}
181
182impl IndexState {
183    /// Capture the current state as a [`CodeIndexSnapshot`].
184    pub fn snapshot(&self) -> CodeIndexSnapshot {
185        let files: Vec<SnapshotFile> = self
186            .files
187            .values()
188            .map(|f| SnapshotFile {
189                id: f.id,
190                relative_path: f.relative_path.clone(),
191                language: f.language.clone(),
192                size_bytes: f.size_bytes,
193                line_count: f.line_count,
194                content_hash: f.content_hash,
195                mtime_ms: f.mtime_ms,
196                symbols: f
197                    .symbols
198                    .iter()
199                    .map(|s| SnapshotSymbol {
200                        name: s.name.clone(),
201                        kind: s.kind.clone(),
202                        access_level: s.access_level.clone(),
203                        start_line: s.start_line,
204                        end_line: s.end_line,
205                        signature: s.signature.clone(),
206                    })
207                    .collect(),
208                imports: f.imports.clone(),
209            })
210            .collect();
211
212        let trigrams = self.trigrams.snapshot_postings();
213        let words = self.words.snapshot_postings();
214        let deps = self.deps.snapshot_rows();
215
216        CodeIndexSnapshot {
217            meta: SnapshotMeta {
218                format_version: SNAPSHOT_FORMAT_VERSION,
219                workspace_root: self.root.to_string_lossy().into_owned(),
220                git_head: self.git_head.clone(),
221                indexed_at_ms: self.last_built_unix_ms,
222                file_count: self.files.len(),
223            },
224            next_file_id: self.next_file_id_internal(),
225            files,
226            trigrams,
227            words,
228            deps,
229            versions: self.versions.clone(),
230            agents: self.agents.snapshot(),
231        }
232    }
233
234    /// Restore an [`IndexState`] from a snapshot. The workspace root is
235    /// taken from the snapshot meta; callers can then call
236    /// [`Self::reap_after_recovery`] to drop stale agent records.
237    pub fn from_snapshot(snap: CodeIndexSnapshot) -> Self {
238        let root = PathBuf::from(snap.meta.workspace_root);
239        let mut files: HashMap<FileId, IndexedFile> = HashMap::with_capacity(snap.files.len());
240        let mut path_to_id: HashMap<String, FileId> = HashMap::with_capacity(snap.files.len());
241        for f in snap.files {
242            let indexed = IndexedFile {
243                id: f.id,
244                relative_path: f.relative_path.clone(),
245                language: f.language,
246                size_bytes: f.size_bytes,
247                line_count: f.line_count,
248                content_hash: f.content_hash,
249                mtime_ms: f.mtime_ms,
250                symbols: f
251                    .symbols
252                    .into_iter()
253                    .map(|s| IndexedSymbol {
254                        name: s.name,
255                        kind: s.kind,
256                        access_level: s.access_level,
257                        start_line: s.start_line,
258                        end_line: s.end_line,
259                        signature: s.signature,
260                    })
261                    .collect(),
262                imports: f.imports,
263            };
264            path_to_id.insert(f.relative_path, f.id);
265            files.insert(f.id, indexed);
266        }
267        let trigrams = TrigramIndex::from_postings(snap.trigrams);
268        let words = WordIndex::from_postings(snap.words);
269        let deps = DepGraph::from_rows(snap.deps);
270        let agents = AgentRegistry::from_snapshot(RegistryConfig::default(), snap.agents);
271
272        let mut state = Self::empty(root);
273        state.files = files;
274        state.path_to_id = path_to_id;
275        state.trigrams = trigrams;
276        state.words = words;
277        state.deps = deps;
278        state.versions = snap.versions;
279        state.agents = agents;
280        state.last_built_unix_ms = snap.meta.indexed_at_ms;
281        state.git_head = snap.meta.git_head;
282        state.set_next_file_id(snap.next_file_id);
283        state
284    }
285
286    /// Drop stale agent records and release any locks held by agents
287    /// whose `last_seen_ms` is older than the configured timeout. Called
288    /// at startup after restoring from a snapshot.
289    pub fn reap_after_recovery(&mut self, now_ms: i64) {
290        self.agents.reap(now_ms);
291    }
292}