Skip to main content

harn_hostlib/code_index/
snapshot.rs

1//! Persistent on-disk snapshot of the workspace index.
2//!
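//! v1 uses a single JSON file at `.burin/index/snapshot.json` for
//! on-disk compatibility. The shape is meant to tolerate missing
//! sections so we can extend it in place without a version bump (e.g. add
//! a new sub-index without invalidating earlier snapshots). For that to
//! hold, any non-`Option` field added later needs `#[serde(default)]`
//! (as [`DepRow::unresolved`] has); without it serde rejects a snapshot
//! that lacks the field.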
7//!
8//! The snapshot is the recovery primitive. On daemon startup, the
9//! embedder restores from the snapshot if one exists, then calls
10//! [`super::IndexState::reap_after_recovery`] to drop stale agent
11//! records and locks before serving any traffic.
12
13use std::collections::HashMap;
14use std::path::{Path, PathBuf};
15
16use serde::{Deserialize, Serialize};
17
18use super::agents::{AgentRegistry, RegistryConfig, SerializedRegistry};
19use super::file_table::{FileId, IndexedFile, IndexedSymbol};
20use super::graph::DepGraph;
21use super::trigram::TrigramIndex;
22use super::versions::VersionLog;
23use super::words::WordIndex;
24use super::IndexState;
25
/// Current format version. Bumped whenever the snapshot layout changes
/// in a non-additive way.
///
/// `IndexState::snapshot` writes this value into
/// [`SnapshotMeta::format_version`]; [`CodeIndexSnapshot::load`] returns
/// `Ok(None)` for a file that parses but records a different version.
pub const SNAPSHOT_FORMAT_VERSION: u32 = 1;
29
/// On-disk metadata header. Small and cheap to read so embedders can
/// peek at a snapshot without parsing the whole thing.
///
/// Stored under the `meta` key of [`CodeIndexSnapshot`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SnapshotMeta {
    /// Format version. Must equal [`SNAPSHOT_FORMAT_VERSION`] for now;
    /// [`CodeIndexSnapshot::load`] treats a snapshot with any other value
    /// as absent (`Ok(None)`).
    pub format_version: u32,
    /// Workspace root the snapshot was captured against.
    ///
    /// Captured with `to_string_lossy`, so a root containing non-UTF-8
    /// bytes is not stored exactly. `IndexState::from_snapshot` rebuilds
    /// the state's root path from this string.
    pub workspace_root: String,
    /// `HEAD` SHA of the workspace at snapshot time, when known.
    pub git_head: Option<String>,
    /// Wall-clock ms since the Unix epoch. `IndexState::snapshot` copies
    /// this from the state's `last_built_unix_ms`, so it records the last
    /// build time rather than the moment the file was written.
    pub indexed_at_ms: i64,
    /// Total number of files captured (`files.len()` at capture time).
    /// Informational: `IndexState::from_snapshot` does not read it.
    pub file_count: usize,
}
46
/// Serialised form of one outline symbol. Mirrors [`IndexedSymbol`].
///
/// `IndexState::snapshot` and `IndexState::from_snapshot` copy every
/// field across unchanged, so the two types must keep the same fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SnapshotSymbol {
    /// Symbol name.
    pub name: String,
    /// Language-specific kind tag.
    pub kind: String,
    /// 1-based start line.
    pub start_line: u32,
    /// 1-based inclusive end line.
    pub end_line: u32,
    /// Single-line preview of the declaration.
    pub signature: String,
}
61
/// Serialised form of one file row. Mirrors [`IndexedFile`].
///
/// `IndexState::snapshot` emits one of these per entry in the file
/// table; `IndexState::from_snapshot` turns each back into an
/// [`IndexedFile`] and rebuilds the path → id map from `relative_path`
/// and `id`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SnapshotFile {
    /// Stable file identifier. Used as the file-table key on restore.
    pub id: FileId,
    /// Workspace-relative path with `/` separators. Used as the key of
    /// the path → id map on restore.
    pub relative_path: String,
    /// Best-effort language tag.
    pub language: String,
    /// Size in bytes.
    pub size_bytes: u64,
    /// Newline-delimited line count.
    pub line_count: u32,
    /// FNV-1a 64-bit content hash.
    pub content_hash: u64,
    /// Last-modified time (ms since epoch).
    pub mtime_ms: i64,
    /// Outline symbols.
    pub symbols: Vec<SnapshotSymbol>,
    /// Raw import statement strings.
    pub imports: Vec<String>,
}
84
/// One trigram posting entry: `trigram → list of file ids`.
///
/// Produced by the trigram index's `snapshot_postings` and handed back
/// to [`TrigramIndex::from_postings`] on restore.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TrigramPosting {
    /// Packed trigram key. The packing scheme is owned by the trigram
    /// index and is not defined in this module.
    pub trigram: u32,
    /// Files containing this trigram.
    pub files: Vec<FileId>,
}
93
/// One word posting entry: `word → list of (file, line) pairs`.
///
/// Produced by the word index's `snapshot_postings` and handed back to
/// [`WordIndex::from_postings`] on restore.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WordPosting {
    /// Identifier-shaped token.
    pub word: String,
    /// All occurrences as `(file_id, line)` pairs.
    pub hits: Vec<(FileId, u32)>,
}
102
/// One dep-graph row: `file → resolved imports + unresolved raw strings`.
///
/// Produced by the dep graph's `snapshot_rows` and handed back to
/// [`DepGraph::from_rows`] on restore.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DepRow {
    /// Source file id.
    pub from: FileId,
    /// Resolved target file ids.
    pub to: Vec<FileId>,
    /// Raw import strings the resolver couldn't map back to a file.
    // `default` lets a row without this key deserialize with an empty
    // list instead of failing the whole snapshot.
    #[serde(default)]
    pub unresolved: Vec<String>,
}
114
/// Persistent on-disk form of the entire workspace index.
///
/// Built by `IndexState::snapshot` and consumed by
/// `IndexState::from_snapshot`. None of the fields below carries
/// `#[serde(default)]`, so a JSON document missing any of these keys
/// fails to deserialize.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeIndexSnapshot {
    /// Snapshot header.
    pub meta: SnapshotMeta,
    /// Next file id to hand out — preserved so reused ids don't collide
    /// with historical version-log entries.
    pub next_file_id: FileId,
    /// File table.
    pub files: Vec<SnapshotFile>,
    /// Trigram postings.
    pub trigrams: Vec<TrigramPosting>,
    /// Word postings.
    pub words: Vec<WordPosting>,
    /// Dep graph rows.
    pub deps: Vec<DepRow>,
    /// Append-only version log. Stored as a clone of the live log.
    pub versions: VersionLog,
    /// Live agents at snapshot time. On restore the registry is rebuilt
    /// from this with `RegistryConfig::default()`.
    pub agents: SerializedRegistry,
}
136
137impl CodeIndexSnapshot {
138    /// Path the snapshot lives at, relative to the workspace root.
139    pub fn path_for(workspace_root: &Path) -> PathBuf {
140        workspace_root
141            .join(".burin")
142            .join("index")
143            .join("snapshot.json")
144    }
145
146    /// Save the snapshot atomically (`tmp` file + rename) so partial
147    /// writes never leave a half-encoded JSON blob on disk.
148    pub fn save(&self, workspace_root: &Path) -> std::io::Result<()> {
149        let path = Self::path_for(workspace_root);
150        if let Some(parent) = path.parent() {
151            std::fs::create_dir_all(parent)?;
152        }
153        let tmp = path.with_extension("json.tmp");
154        let bytes = serde_json::to_vec(self).map_err(std::io::Error::other)?;
155        std::fs::write(&tmp, bytes)?;
156        std::fs::rename(&tmp, &path)?;
157        Ok(())
158    }
159
160    /// Try to load the snapshot from `workspace_root/.burin/index/snapshot.json`.
161    /// Returns `Ok(None)` when no snapshot exists yet; returns `Err` when
162    /// one exists but couldn't be parsed (caller is expected to fall back
163    /// to `build_from_root`).
164    pub fn load(workspace_root: &Path) -> std::io::Result<Option<Self>> {
165        let path = Self::path_for(workspace_root);
166        if !path.exists() {
167            return Ok(None);
168        }
169        let bytes = std::fs::read(&path)?;
170        let snap: CodeIndexSnapshot =
171            serde_json::from_slice(&bytes).map_err(std::io::Error::other)?;
172        if snap.meta.format_version != SNAPSHOT_FORMAT_VERSION {
173            return Ok(None);
174        }
175        Ok(Some(snap))
176    }
177}
178
179impl IndexState {
180    /// Capture the current state as a [`CodeIndexSnapshot`].
181    pub fn snapshot(&self) -> CodeIndexSnapshot {
182        let files: Vec<SnapshotFile> = self
183            .files
184            .values()
185            .map(|f| SnapshotFile {
186                id: f.id,
187                relative_path: f.relative_path.clone(),
188                language: f.language.clone(),
189                size_bytes: f.size_bytes,
190                line_count: f.line_count,
191                content_hash: f.content_hash,
192                mtime_ms: f.mtime_ms,
193                symbols: f
194                    .symbols
195                    .iter()
196                    .map(|s| SnapshotSymbol {
197                        name: s.name.clone(),
198                        kind: s.kind.clone(),
199                        start_line: s.start_line,
200                        end_line: s.end_line,
201                        signature: s.signature.clone(),
202                    })
203                    .collect(),
204                imports: f.imports.clone(),
205            })
206            .collect();
207
208        let trigrams = self.trigrams.snapshot_postings();
209        let words = self.words.snapshot_postings();
210        let deps = self.deps.snapshot_rows();
211
212        CodeIndexSnapshot {
213            meta: SnapshotMeta {
214                format_version: SNAPSHOT_FORMAT_VERSION,
215                workspace_root: self.root.to_string_lossy().into_owned(),
216                git_head: self.git_head.clone(),
217                indexed_at_ms: self.last_built_unix_ms,
218                file_count: self.files.len(),
219            },
220            next_file_id: self.next_file_id_internal(),
221            files,
222            trigrams,
223            words,
224            deps,
225            versions: self.versions.clone(),
226            agents: self.agents.snapshot(),
227        }
228    }
229
230    /// Restore an [`IndexState`] from a snapshot. The workspace root is
231    /// taken from the snapshot meta; callers can then call
232    /// [`Self::reap_after_recovery`] to drop stale agent records.
233    pub fn from_snapshot(snap: CodeIndexSnapshot) -> Self {
234        let root = PathBuf::from(snap.meta.workspace_root);
235        let mut files: HashMap<FileId, IndexedFile> = HashMap::with_capacity(snap.files.len());
236        let mut path_to_id: HashMap<String, FileId> = HashMap::with_capacity(snap.files.len());
237        for f in snap.files {
238            let indexed = IndexedFile {
239                id: f.id,
240                relative_path: f.relative_path.clone(),
241                language: f.language,
242                size_bytes: f.size_bytes,
243                line_count: f.line_count,
244                content_hash: f.content_hash,
245                mtime_ms: f.mtime_ms,
246                symbols: f
247                    .symbols
248                    .into_iter()
249                    .map(|s| IndexedSymbol {
250                        name: s.name,
251                        kind: s.kind,
252                        start_line: s.start_line,
253                        end_line: s.end_line,
254                        signature: s.signature,
255                    })
256                    .collect(),
257                imports: f.imports,
258            };
259            path_to_id.insert(f.relative_path, f.id);
260            files.insert(f.id, indexed);
261        }
262        let trigrams = TrigramIndex::from_postings(snap.trigrams);
263        let words = WordIndex::from_postings(snap.words);
264        let deps = DepGraph::from_rows(snap.deps);
265        let agents = AgentRegistry::from_snapshot(RegistryConfig::default(), snap.agents);
266
267        let mut state = Self::empty(root);
268        state.files = files;
269        state.path_to_id = path_to_id;
270        state.trigrams = trigrams;
271        state.words = words;
272        state.deps = deps;
273        state.versions = snap.versions;
274        state.agents = agents;
275        state.last_built_unix_ms = snap.meta.indexed_at_ms;
276        state.git_head = snap.meta.git_head;
277        state.set_next_file_id(snap.next_file_id);
278        state
279    }
280
281    /// Drop stale agent records and release any locks held by agents
282    /// whose `last_seen_ms` is older than the configured timeout. Called
283    /// at startup after restoring from a snapshot.
284    pub fn reap_after_recovery(&mut self, now_ms: i64) {
285        self.agents.reap(now_ms);
286    }
287}