Skip to main content

harn_hostlib/code_index/
state.rs

1//! Per-workspace index state.
2//!
3//! Owns the file table, trigram index, word index, dep graph, version
4//! log, and agent registry for one workspace root. Construction is via
5//! [`IndexState::build_from_root`], which walks the workspace, reads
6//! every indexable file, and populates every sub-index in a single pass
7//! before resolving imports.
8//!
//! Single-file mutations (`reindex_file`, `remove_file_path`) flow
//! through the same code paths, so the sub-indexes stay consistent
//! across the incremental updates driven by host ops.
12
13use std::collections::HashMap;
14use std::path::{Path, PathBuf};
15use std::time::{SystemTime, UNIX_EPOCH};
16
17use super::agents::AgentRegistry;
18use super::file_table::{fnv1a64, FileId, IndexedFile};
19use super::graph::DepGraph;
20use super::imports;
21use super::trigram::TrigramIndex;
22use super::versions::VersionLog;
23use super::walker::{is_indexable_file, language_for_extension, walk_indexable, MAX_FILE_BYTES};
24use super::words::WordIndex;
25
/// In-memory index for one workspace. Composed from the per-file table,
/// the trigram + word sub-indexes, the dep graph, the append-only version
/// log, and the agent registry.
///
/// Every field except `next_id` is public. `next_id` is the private
/// counter from which fresh [`FileId`]s are handed out; it starts at 1.
pub struct IndexState {
    /// Canonicalised workspace root.
    pub root: PathBuf,
    /// File table keyed on stable id.
    pub files: HashMap<FileId, IndexedFile>,
    /// Workspace-relative path → stable id. Keys use `/` separators.
    pub path_to_id: HashMap<String, FileId>,
    /// Trigram posting list.
    pub trigrams: TrigramIndex,
    /// Identifier-token inverted index.
    pub words: WordIndex,
    /// Forward + reverse import graph.
    pub deps: DepGraph,
    /// Append-only log of file mutations.
    pub versions: VersionLog,
    /// Live agents + advisory locks.
    pub agents: AgentRegistry,
    /// Wall-clock timestamp (ms since epoch) of the most recent rebuild.
    /// `0` in a state created through `IndexState::empty`.
    pub last_built_unix_ms: i64,
    /// Best-effort `HEAD` SHA, or `None` if the workspace isn't a git repo.
    pub git_head: Option<String>,
    // Next id to assign to a previously unseen path.
    next_id: FileId,
}
52
/// Summary returned from `IndexState::build_from_root`.
///
/// A plain value type: it can be cloned and compared, so callers and
/// tests can assert on a whole outcome at once.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct BuildOutcome {
    /// Files that passed every filter and were ingested.
    pub files_indexed: u64,
    /// Files that matched the filename filter but couldn't be read or
    /// were too large.
    pub files_skipped: u64,
}
62
63impl IndexState {
64    /// Build a fresh index over `root`. Returns the populated state plus a
65    /// summary of how many files were indexed vs skipped.
66    pub fn build_from_root(root: &Path) -> (Self, BuildOutcome) {
67        let canonical_root = canonicalize(root);
68        let mut state = IndexState {
69            root: canonical_root.clone(),
70            files: HashMap::new(),
71            path_to_id: HashMap::new(),
72            trigrams: TrigramIndex::new(),
73            words: WordIndex::new(),
74            deps: DepGraph::new(),
75            versions: VersionLog::new(),
76            agents: AgentRegistry::new(),
77            last_built_unix_ms: now_unix_ms(),
78            git_head: read_git_head(&canonical_root),
79            next_id: 1,
80        };
81        let mut outcome = BuildOutcome::default();
82        let mut to_resolve: Vec<(FileId, String)> = Vec::new();
83        walk_indexable(&canonical_root, |abs| match state.ingest(abs) {
84            Some(file_id) => {
85                outcome.files_indexed += 1;
86                if let Some(file) = state.files.get(&file_id) {
87                    to_resolve.push((file_id, file.relative_path.clone()));
88                }
89            }
90            None => {
91                outcome.files_skipped += 1;
92            }
93        });
94        for (id, rel) in to_resolve {
95            state.rebuild_deps(id, &rel);
96        }
97        (state, outcome)
98    }
99
100    /// Re-index a single file by its absolute path. Returns the id of the
101    /// affected file (newly assigned or existing). If the file no longer
102    /// exists or fails the indexability/sensitivity filter, any existing
103    /// entry under that path is removed and `None` is returned.
104    pub fn reindex_file(&mut self, abs: &Path) -> Option<FileId> {
105        if !abs.exists() {
106            self.remove_file_path(abs);
107            return None;
108        }
109        if !is_indexable_file(abs) || super::walker::is_sensitive_path(abs) {
110            self.remove_file_path(abs);
111            return None;
112        }
113        let id = self.ingest(abs)?;
114        let rel = self
115            .files
116            .get(&id)
117            .map(|f| f.relative_path.clone())
118            .unwrap_or_default();
119        if !rel.is_empty() {
120            self.rebuild_deps(id, &rel);
121        }
122        Some(id)
123    }
124
125    /// Remove an existing file from every sub-index. No-op when the file
126    /// isn't tracked.
127    pub fn remove_file_path(&mut self, abs: &Path) {
128        let Some(rel) = relative_path(&self.root, abs) else {
129            return;
130        };
131        let Some(id) = self.path_to_id.remove(&rel) else {
132            return;
133        };
134        self.files.remove(&id);
135        self.trigrams.remove_file(id);
136        self.words.remove_file(id);
137        self.deps.remove_file(id);
138    }
139
140    fn ingest(&mut self, abs: &Path) -> Option<FileId> {
141        if !is_indexable_file(abs) {
142            return None;
143        }
144        let metadata = std::fs::metadata(abs).ok()?;
145        if metadata.len() > MAX_FILE_BYTES {
146            return None;
147        }
148        let content = std::fs::read_to_string(abs).ok()?;
149        if content.len() > MAX_FILE_BYTES as usize {
150            return None;
151        }
152        let rel = relative_path(&self.root, abs)?;
153        let hash = fnv1a64(content.as_bytes());
154        let id = match self.path_to_id.get(&rel) {
155            Some(existing_id) => {
156                if let Some(file) = self.files.get(existing_id) {
157                    if file.content_hash == hash {
158                        return Some(*existing_id);
159                    }
160                }
161                *existing_id
162            }
163            None => {
164                let id = self.next_id;
165                self.next_id = self.next_id.checked_add(1).expect("FileId overflow");
166                self.path_to_id.insert(rel.clone(), id);
167                id
168            }
169        };
170
171        let ext = abs
172            .extension()
173            .and_then(|s| s.to_str())
174            .unwrap_or("")
175            .to_ascii_lowercase();
176        let language = language_for_extension(&ext).to_string();
177        let imports = imports::extract_imports(&content, &language);
178        let mtime_ms = metadata
179            .modified()
180            .ok()
181            .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
182            .map(|d| d.as_millis() as i64)
183            .unwrap_or(0);
184        let line_count = if content.is_empty() {
185            0
186        } else {
187            content.split('\n').count() as u32
188        };
189
190        let file = IndexedFile {
191            id,
192            relative_path: rel,
193            language,
194            size_bytes: content.len() as u64,
195            line_count,
196            content_hash: hash,
197            mtime_ms,
198            symbols: Vec::new(),
199            imports,
200        };
201        self.trigrams.index_file(id, &content);
202        self.words.index_file(id, &content);
203        self.files.insert(id, file);
204        Some(id)
205    }
206
207    fn rebuild_deps(&mut self, id: FileId, relative_path: &str) {
208        let Some(file) = self.files.get(&id).cloned() else {
209            return;
210        };
211        let resolved = imports::resolve(
212            &file.imports,
213            relative_path,
214            &file.language,
215            &self.path_to_id,
216        );
217        self.deps
218            .set_edges(id, resolved.resolved, resolved.unresolved);
219    }
220
221    /// Look up a file by either its workspace-relative path or its
222    /// absolute path inside the workspace root.
223    pub fn lookup_path(&self, raw: &str) -> Option<FileId> {
224        if let Some(id) = self.path_to_id.get(raw) {
225            return Some(*id);
226        }
227        let path = Path::new(raw);
228        if path.is_absolute() {
229            if let Some(rel) = relative_path(&self.root, path) {
230                if let Some(id) = self.path_to_id.get(&rel) {
231                    return Some(*id);
232                }
233            }
234        }
235        None
236    }
237
238    /// Estimate the resident memory footprint of every sub-index. Cheap
239    /// order-of-magnitude figure surfaced by the `stats` builtin.
240    pub fn estimated_bytes(&self) -> usize {
241        let file_bytes: usize = self
242            .files
243            .values()
244            .map(|f| f.relative_path.len() + f.imports.iter().map(|s| s.len()).sum::<usize>() + 64)
245            .sum();
246        self.trigrams.estimated_bytes() + self.words.estimated_bytes() + file_bytes
247    }
248
249    /// Resolve a workspace-relative path against the canonical root.
250    /// Used by host builtins that take a `path` argument and need to
251    /// open the underlying file (e.g. `read_range`, `file_hash`).
252    pub fn absolute_path(&self, rel_or_abs: &str) -> PathBuf {
253        let p = Path::new(rel_or_abs);
254        if p.is_absolute() {
255            p.to_path_buf()
256        } else {
257            self.root.join(p)
258        }
259    }
260
261    /// Construct an empty [`IndexState`] anchored at `root`. Used by the
262    /// snapshot path which fills in the sub-indexes itself.
263    pub(crate) fn empty(root: PathBuf) -> Self {
264        Self {
265            root,
266            files: HashMap::new(),
267            path_to_id: HashMap::new(),
268            trigrams: TrigramIndex::new(),
269            words: WordIndex::new(),
270            deps: DepGraph::new(),
271            versions: VersionLog::new(),
272            agents: AgentRegistry::new(),
273            last_built_unix_ms: 0,
274            git_head: None,
275            next_id: 1,
276        }
277    }
278
    /// Return the current value of the `next_id` counter (by value, not a
    /// borrow) — exposed for snapshot serialisation.
    pub(crate) fn next_file_id_internal(&self) -> FileId {
        self.next_id
    }
283
284    /// Restore the `next_id` counter from a serialised snapshot.
285    pub(crate) fn set_next_file_id(&mut self, id: FileId) {
286        self.next_id = id.max(1);
287    }
288}
289
/// Current wall-clock time as milliseconds since the Unix epoch. Yields
/// `0` if the system clock reports a time before the epoch.
pub(crate) fn now_unix_ms() -> i64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_millis() as i64,
        Err(_) => 0,
    }
}
298
/// Canonicalise `root`, falling back to the path exactly as given when
/// canonicalisation fails (for example because it does not exist).
fn canonicalize(root: &Path) -> PathBuf {
    match std::fs::canonicalize(root) {
        Ok(resolved) => resolved,
        Err(_) => root.to_path_buf(),
    }
}
302
/// Compute `abs` relative to `root`, using `/` separators. Returns `None`
/// if `abs` is not inside `root`. `root` is expected to be canonical
/// already.
///
/// Handles the missing-file case (where `canonicalize` would fail) by
/// canonicalising the longest existing prefix and re-attaching the missing
/// tail — so `remove_file_path` keeps working when the underlying path has
/// just been deleted.
///
/// The result is built by joining the path's components with `/` rather
/// than by replacing every `\` in the rendered string: on platforms where
/// `\` is a legal file-name character, a blanket replace would rewrite the
/// name into a path that does not exist.
pub(crate) fn relative_path(root: &Path, abs: &Path) -> Option<String> {
    let canonical_abs = canonicalize_existing(abs);
    let stripped = canonical_abs.strip_prefix(root).ok()?;
    let pieces: Vec<_> = stripped
        .components()
        .map(|component| component.as_os_str().to_string_lossy())
        .collect();
    Some(pieces.join("/"))
}

/// Canonicalise `abs` even when its final components do not exist: the
/// nearest existing ancestor is canonicalised and the missing tail is
/// appended unchanged. Falls back to `abs` as given when no ancestor can
/// be canonicalised.
fn canonicalize_existing(abs: &Path) -> PathBuf {
    if let Ok(resolved) = std::fs::canonicalize(abs) {
        return resolved;
    }
    // Peel trailing components off until we reach something that exists,
    // remembering the peeled names (innermost first).
    let mut missing_tail: Vec<&std::ffi::OsStr> = Vec::new();
    let mut cursor = abs;
    while !cursor.exists() {
        match (cursor.parent(), cursor.file_name()) {
            (Some(parent), Some(name)) if !parent.as_os_str().is_empty() => {
                missing_tail.push(name);
                cursor = parent;
            }
            // Ran out of ancestors (or hit a `..`-style component).
            _ => return abs.to_path_buf(),
        }
    }
    match std::fs::canonicalize(cursor) {
        Ok(mut resolved) => {
            for piece in missing_tail.iter().rev() {
                resolved.push(piece);
            }
            resolved
        }
        Err(_) => abs.to_path_buf(),
    }
}
343
/// Best-effort read of the commit `HEAD` points at for the repository
/// rooted at `workspace_root`.
///
/// * Returns `None` when `.git/HEAD` cannot be read.
/// * A detached `HEAD` (no `ref: ` prefix) is returned as its trimmed
///   content.
/// * A symbolic `HEAD` is resolved through the loose ref file first, then
///   through `.git/packed-refs` (where git moves refs when it packs them).
/// * If neither source knows the ref (e.g. an unborn branch), the trimmed
///   `HEAD` line itself is returned, as before.
fn read_git_head(workspace_root: &Path) -> Option<String> {
    let git_dir = workspace_root.join(".git");
    let raw = std::fs::read_to_string(git_dir.join("HEAD")).ok()?;
    let head = raw.trim();

    let Some(ref_target) = head.strip_prefix("ref: ") else {
        return Some(head.to_string());
    };

    if let Ok(sha) = std::fs::read_to_string(git_dir.join(ref_target)) {
        return Some(sha.trim().to_string());
    }

    // packed-refs lines are `<sha> <refname>`; header comments (`# …`) and
    // peeled-tag lines (`^<sha>`) never match a ref name and are skipped.
    if let Ok(packed) = std::fs::read_to_string(git_dir.join("packed-refs")) {
        let packed_sha = packed.lines().find_map(|line| {
            let (sha, name) = line.trim().split_once(' ')?;
            (name == ref_target).then(|| sha.to_string())
        });
        if packed_sha.is_some() {
            return packed_sha;
        }
    }

    Some(head.to_string())
}
356
// Unit tests. Each test builds a throwaway workspace in a temp directory
// and exercises the index through its public entry points.
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::tempdir;

    // A full build over two Rust files indexes both and leaves the dep
    // graph empty for the importing file.
    #[test]
    fn build_indexes_files_and_resolves_imports() {
        let dir = tempdir().unwrap();
        let root = dir.path();
        fs::create_dir_all(root.join("src")).unwrap();
        fs::write(
            root.join("src/main.rs"),
            "use crate::util::helper;\nfn main() {}\n",
        )
        .unwrap();
        fs::write(root.join("src/util.rs"), "pub fn helper() {}").unwrap();

        let (state, outcome) = IndexState::build_from_root(root);
        assert_eq!(outcome.files_indexed, 2);
        assert_eq!(state.files.len(), 2);
        let main_id = state.path_to_id["src/main.rs"];
        let util_id = state.path_to_id["src/util.rs"];
        // Rust uses `noop` resolution, so dep graph is empty.
        assert_eq!(state.deps.imports_of(main_id), Vec::<FileId>::new());
        // Only checked for presence in the path map (the index above would
        // panic otherwise); the id itself is not asserted on.
        let _ = util_id;
    }

    // A relative TypeScript import produces a forward edge and the matching
    // reverse edge.
    #[test]
    fn typescript_imports_get_resolved() {
        let dir = tempdir().unwrap();
        let root = dir.path();
        fs::create_dir_all(root.join("src")).unwrap();
        fs::write(
            root.join("src/index.ts"),
            "import { helper } from \"./util\";\n",
        )
        .unwrap();
        fs::write(root.join("src/util.ts"), "export function helper() {}").unwrap();

        let (state, _) = IndexState::build_from_root(root);
        let index_id = state.path_to_id["src/index.ts"];
        let util_id = state.path_to_id["src/util.ts"];
        assert_eq!(state.deps.imports_of(index_id), vec![util_id]);
        assert_eq!(state.deps.importers_of(util_id), vec![index_id]);
    }

    // An absolute path inside the workspace resolves to the same id as its
    // workspace-relative key.
    #[test]
    fn lookup_path_handles_absolute_paths() {
        let dir = tempdir().unwrap();
        let root = dir.path();
        fs::create_dir_all(root.join("a/b")).unwrap();
        fs::write(root.join("a/b/c.py"), "x = 1\n").unwrap();
        let (state, _) = IndexState::build_from_root(root);
        let abs = root.join("a/b/c.py");
        let id = state.lookup_path(abs.to_str().unwrap()).unwrap();
        assert_eq!(state.path_to_id["a/b/c.py"], id);
    }

    // Rewriting a tracked file and re-indexing keeps its id but updates
    // the stored content hash.
    #[test]
    fn reindex_file_picks_up_changes_in_place() {
        let dir = tempdir().unwrap();
        let root = dir.path();
        fs::create_dir_all(root.join("src")).unwrap();
        fs::write(root.join("src/a.ts"), "export const x = 1;\n").unwrap();
        let (mut state, _) = IndexState::build_from_root(root);
        let id = state.path_to_id["src/a.ts"];
        let before_hash = state.files[&id].content_hash;

        fs::write(root.join("src/a.ts"), "export const x = 2;\n").unwrap();
        let new_id = state.reindex_file(&root.join("src/a.ts")).unwrap();
        assert_eq!(new_id, id, "file id should be stable across reindex");
        let after_hash = state.files[&id].content_hash;
        assert_ne!(before_hash, after_hash);
    }

    // Re-indexing a path that has been deleted from disk drops its entry
    // and reports `None`.
    #[test]
    fn reindex_file_removes_entry_when_path_disappears() {
        let dir = tempdir().unwrap();
        let root = dir.path();
        fs::create_dir_all(root.join("src")).unwrap();
        fs::write(root.join("src/a.ts"), "export const x = 1;\n").unwrap();
        let (mut state, _) = IndexState::build_from_root(root);
        assert!(state.path_to_id.contains_key("src/a.ts"));

        fs::remove_file(root.join("src/a.ts")).unwrap();
        let result = state.reindex_file(&root.join("src/a.ts"));
        assert!(result.is_none());
        assert!(!state.path_to_id.contains_key("src/a.ts"));
    }
}