Skip to main content

harn_hostlib/code_index/
state.rs

1//! Per-workspace index state.
2//!
3//! Owns the file table, trigram index, word index, dep graph, version
4//! log, and agent registry for one workspace root. Construction is via
5//! [`IndexState::build_from_root`], which walks the workspace, reads
6//! every indexable file, and populates every sub-index in a single pass
7//! before resolving imports.
8//!
//! Single-file mutations (`reindex_file`, `remove_file_path`) flow
//! through the same paths, so the sub-indexes stay consistent across
//! the incremental updates that host ops drive.
12
13use std::collections::HashMap;
14use std::path::{Path, PathBuf};
15use std::time::{SystemTime, UNIX_EPOCH};
16
17use super::agents::AgentRegistry;
18use super::file_table::{fnv1a64, FileId, IndexedFile, IndexedSymbol};
19use super::graph::DepGraph;
20use super::imports;
21use super::overlay::OverlayState;
22use super::symbol_graph::SymbolGraph;
23use super::trigram::TrigramIndex;
24use super::versions::VersionLog;
25use super::walker::{is_indexable_file, language_for_extension, walk_indexable, MAX_FILE_BYTES};
26use super::words::WordIndex;
27
28use crate::ast::{Language as AstLanguage, Symbol as AstSymbol};
29
/// In-memory index for one workspace. Composed from the per-file table,
/// the trigram + word sub-indexes, the dep graph, the append-only version
/// log, and the agent registry.
///
/// `files` and `path_to_id` are two views of the same set of tracked
/// files: ingestion inserts into both and `remove_file_path` removes
/// from both.
pub struct IndexState {
    /// Canonicalised workspace root.
    pub root: PathBuf,
    /// File table keyed on stable id.
    pub files: HashMap<FileId, IndexedFile>,
    /// Workspace-relative path → stable id.
    pub path_to_id: HashMap<String, FileId>,
    /// Trigram posting list.
    pub trigrams: TrigramIndex,
    /// Identifier-token inverted index.
    pub words: WordIndex,
    /// Forward + reverse import graph.
    pub deps: DepGraph,
    /// Append-only log of file mutations.
    pub versions: VersionLog,
    /// Live agents + advisory locks.
    pub agents: AgentRegistry,
    /// Typed symbol graph (issue #2434). Populated lazily on rebuild.
    pub symbols: SymbolGraph,
    /// Per-branch overlay registry (issue #2434).
    pub overlays: OverlayState,
    /// Wall-clock timestamp (ms since epoch) of the most recent rebuild.
    pub last_built_unix_ms: i64,
    /// Best-effort `HEAD` SHA, or `None` if the workspace isn't a git repo.
    pub git_head: Option<String>,
    /// Id handed to the next previously-unseen path. Starts at 1 and is
    /// bumped by one each time ingestion registers a new path.
    next_id: FileId,
}
60
/// Summary returned from `IndexState::build_from_root`.
///
/// Every path visited by the workspace walk is counted in exactly one
/// of the two fields.
#[derive(Debug, Default)]
pub struct BuildOutcome {
    /// Files that passed every filter and were ingested.
    pub files_indexed: u64,
    /// Files that matched the filename filter but couldn't be read or
    /// were too large.
    pub files_skipped: u64,
}
70
71impl IndexState {
72    /// Build a fresh index over `root`. Returns the populated state plus a
73    /// summary of how many files were indexed vs skipped.
74    pub fn build_from_root(root: &Path) -> (Self, BuildOutcome) {
75        let canonical_root = canonicalize(root);
76        let mut state = IndexState {
77            root: canonical_root.clone(),
78            files: HashMap::new(),
79            path_to_id: HashMap::new(),
80            trigrams: TrigramIndex::new(),
81            words: WordIndex::new(),
82            deps: DepGraph::new(),
83            versions: VersionLog::new(),
84            agents: AgentRegistry::new(),
85            symbols: SymbolGraph::new(),
86            overlays: OverlayState::new(),
87            last_built_unix_ms: now_unix_ms(),
88            git_head: read_git_head(&canonical_root),
89            next_id: 1,
90        };
91        let mut outcome = BuildOutcome::default();
92        let mut to_resolve: Vec<(FileId, String)> = Vec::new();
93        walk_indexable(&canonical_root, |abs| match state.ingest(abs) {
94            Some(file_id) => {
95                outcome.files_indexed += 1;
96                if let Some(file) = state.files.get(&file_id) {
97                    to_resolve.push((file_id, file.relative_path.clone()));
98                }
99            }
100            None => {
101                outcome.files_skipped += 1;
102            }
103        });
104        for (id, rel) in to_resolve {
105            state.rebuild_deps(id, &rel);
106            state.rebuild_symbol_graph_for(id);
107        }
108        // Second pass: every Module node exists now, so resolve IMPORTS.
109        state.link_symbol_imports();
110        (state, outcome)
111    }
112
113    /// Re-index a single file by its absolute path. Returns the id of the
114    /// affected file (newly assigned or existing). If the file no longer
115    /// exists or fails the indexability/sensitivity filter, any existing
116    /// entry under that path is removed and `None` is returned.
117    pub fn reindex_file(&mut self, abs: &Path) -> Option<FileId> {
118        if !abs.exists() {
119            self.remove_file_path(abs);
120            return None;
121        }
122        if !is_indexable_file(abs) || super::walker::is_sensitive_path(abs) {
123            self.remove_file_path(abs);
124            return None;
125        }
126        let id = self.ingest(abs)?;
127        let rel = self
128            .files
129            .get(&id)
130            .map(|f| f.relative_path.clone())
131            .unwrap_or_default();
132        if !rel.is_empty() {
133            self.rebuild_deps(id, &rel);
134            self.rebuild_symbol_graph_for(id);
135            self.link_symbol_imports();
136        }
137        Some(id)
138    }
139
140    /// Remove an existing file from every sub-index. No-op when the file
141    /// isn't tracked.
142    pub fn remove_file_path(&mut self, abs: &Path) {
143        let Some(rel) = relative_path(&self.root, abs) else {
144            return;
145        };
146        let Some(id) = self.path_to_id.remove(&rel) else {
147            return;
148        };
149        self.files.remove(&id);
150        self.trigrams.remove_file(id);
151        self.words.remove_file(id);
152        self.deps.remove_file(id);
153        self.symbols.remove_file(id);
154    }
155
156    fn ingest(&mut self, abs: &Path) -> Option<FileId> {
157        if !is_indexable_file(abs) {
158            return None;
159        }
160        let metadata = std::fs::metadata(abs).ok()?;
161        if metadata.len() > MAX_FILE_BYTES {
162            return None;
163        }
164        let content = std::fs::read_to_string(abs).ok()?;
165        if content.len() > MAX_FILE_BYTES as usize {
166            return None;
167        }
168        let rel = relative_path(&self.root, abs)?;
169        let hash = fnv1a64(content.as_bytes());
170        let id = match self.path_to_id.get(&rel) {
171            Some(existing_id) => {
172                if let Some(file) = self.files.get(existing_id) {
173                    if file.content_hash == hash {
174                        return Some(*existing_id);
175                    }
176                }
177                *existing_id
178            }
179            None => {
180                let id = self.next_id;
181                self.next_id = self.next_id.checked_add(1).expect("FileId overflow");
182                self.path_to_id.insert(rel.clone(), id);
183                id
184            }
185        };
186
187        let ext = abs
188            .extension()
189            .and_then(|s| s.to_str())
190            .unwrap_or("")
191            .to_ascii_lowercase();
192        let language = language_for_extension(&ext).to_string();
193        let imports = imports::extract_imports(&content, &language);
194        let mtime_ms = metadata
195            .modified()
196            .ok()
197            .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
198            .map(|d| d.as_millis() as i64)
199            .unwrap_or(0);
200        let line_count = if content.is_empty() {
201            0
202        } else {
203            content.split('\n').count() as u32
204        };
205
206        let file = IndexedFile {
207            id,
208            relative_path: rel,
209            language,
210            size_bytes: content.len() as u64,
211            line_count,
212            content_hash: hash,
213            mtime_ms,
214            symbols: Vec::new(),
215            imports,
216        };
217        self.trigrams.index_file(id, &content);
218        self.words.index_file(id, &content);
219        self.files.insert(id, file);
220        Some(id)
221    }
222
223    fn rebuild_deps(&mut self, id: FileId, relative_path: &str) {
224        let Some(file) = self.files.get(&id).cloned() else {
225            return;
226        };
227        let resolved = imports::resolve(
228            &file.imports,
229            relative_path,
230            &file.language,
231            &self.path_to_id,
232        );
233        self.deps
234            .set_edges(id, resolved.resolved, resolved.unresolved);
235    }
236
237    /// Re-parse `id`'s source and replace its slice of the typed symbol
238    /// graph in [`Self::symbols`]. Cheap to call after a single-file
239    /// reindex; the full-rebuild loop calls this once per file. Files
240    /// with no recognised tree-sitter grammar (the index also handles
241    /// `.md`, `.json`, …) are skipped silently — `IndexedFile::symbols`
242    /// stays empty for those files. For grammar-recognised files the
243    /// same parse populates `IndexedFile::symbols` so the
244    /// `outline_get` builtin doesn't have to re-parse on every call
245    /// (issue #2456).
246    pub(super) fn rebuild_symbol_graph_for(&mut self, id: FileId) {
247        let Some(file) = self.files.get(&id).cloned() else {
248            return;
249        };
250        let abs = self.root.join(&file.relative_path);
251        let Ok(source) = std::fs::read_to_string(&abs) else {
252            return;
253        };
254        let Some(language) = AstLanguage::detect(std::path::Path::new(&file.relative_path), None)
255        else {
256            return;
257        };
258        let outcome =
259            self.symbols
260                .rebuild_file(id, &file.relative_path, language, &source, &file.imports);
261        if let Some(file_mut) = self.files.get_mut(&id) {
262            file_mut.symbols = outcome
263                .symbols
264                .iter()
265                .map(indexed_symbol_from_ast)
266                .collect();
267        }
268    }
269
270    /// Walk every file's import-resolution table and add the
271    /// corresponding Module→Module IMPORTS edges in the typed graph.
272    /// Idempotent; called once at end-of-rebuild and after every
273    /// per-file reindex.
274    pub(super) fn link_symbol_imports(&mut self) {
275        let mut resolved: HashMap<FileId, Vec<FileId>> = HashMap::new();
276        for id in self.files.keys() {
277            resolved.insert(*id, self.deps.imports_of(*id));
278        }
279        self.symbols.link_imports(&resolved);
280    }
281
282    /// Look up a file by either its workspace-relative path or its
283    /// absolute path inside the workspace root.
284    pub fn lookup_path(&self, raw: &str) -> Option<FileId> {
285        if let Some(id) = self.path_to_id.get(raw) {
286            return Some(*id);
287        }
288        let path = Path::new(raw);
289        if path.is_absolute() {
290            if let Some(rel) = relative_path(&self.root, path) {
291                if let Some(id) = self.path_to_id.get(&rel) {
292                    return Some(*id);
293                }
294            }
295        }
296        None
297    }
298
299    /// Estimate the resident memory footprint of every sub-index. Cheap
300    /// order-of-magnitude figure surfaced by the `stats` builtin.
301    pub fn estimated_bytes(&self) -> usize {
302        let file_bytes: usize = self
303            .files
304            .values()
305            .map(|f| f.relative_path.len() + f.imports.iter().map(|s| s.len()).sum::<usize>() + 64)
306            .sum();
307        self.trigrams.estimated_bytes() + self.words.estimated_bytes() + file_bytes
308    }
309
310    /// Resolve a workspace-relative path against the canonical root.
311    /// Used by host builtins that take a `path` argument and need to
312    /// open the underlying file (e.g. `read_range`, `file_hash`).
313    pub fn absolute_path(&self, rel_or_abs: &str) -> Option<PathBuf> {
314        let p = Path::new(rel_or_abs);
315        let candidate = if p.is_absolute() {
316            p.to_path_buf()
317        } else {
318            self.root.join(p)
319        };
320        let canonical = canonicalize_existing(&candidate);
321        if canonical.strip_prefix(&self.root).is_ok() {
322            Some(canonical)
323        } else {
324            None
325        }
326    }
327
328    /// Construct an empty [`IndexState`] anchored at `root`. Used by the
329    /// snapshot path which fills in the sub-indexes itself.
330    pub(crate) fn empty(root: PathBuf) -> Self {
331        Self {
332            root,
333            files: HashMap::new(),
334            path_to_id: HashMap::new(),
335            trigrams: TrigramIndex::new(),
336            words: WordIndex::new(),
337            deps: DepGraph::new(),
338            versions: VersionLog::new(),
339            agents: AgentRegistry::new(),
340            symbols: SymbolGraph::new(),
341            overlays: OverlayState::new(),
342            last_built_unix_ms: 0,
343            git_head: None,
344            next_id: 1,
345        }
346    }
347
348    /// Borrow the `next_id` counter — exposed for snapshot serialisation.
349    pub(crate) fn next_file_id_internal(&self) -> FileId {
350        self.next_id
351    }
352
353    /// Restore the `next_id` counter from a serialised snapshot.
354    pub(crate) fn set_next_file_id(&mut self, id: FileId) {
355        self.next_id = id.max(1);
356    }
357}
358
359/// Map an AST-level [`AstSymbol`] (0-based tree-sitter coordinates) into
360/// the flat [`IndexedSymbol`] (1-based outline coordinates) that the
361/// `outline_get` builtin returns. Pure, used by
362/// [`IndexState::rebuild_symbol_graph_for`].
363fn indexed_symbol_from_ast(sym: &AstSymbol) -> IndexedSymbol {
364    IndexedSymbol {
365        name: sym.name.clone(),
366        kind: sym.kind.as_str().to_string(),
367        start_line: sym.start_row.saturating_add(1),
368        end_line: sym.end_row.saturating_add(1),
369        signature: sym.signature.clone(),
370    }
371}
372
/// Return the current wall-clock time in milliseconds since the Unix
/// epoch.
///
/// Yields `0` if the system clock reports a time before the epoch, and
/// saturates at `i64::MAX` rather than wrapping if the millisecond
/// count does not fit in an `i64`.
pub(crate) fn now_unix_ms() -> i64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        // `as_millis` is a `u128`; convert with a checked cast so an
        // out-of-range value cannot silently wrap.
        Ok(elapsed) => i64::try_from(elapsed.as_millis()).unwrap_or(i64::MAX),
        Err(_) => 0,
    }
}
381
/// Canonicalise `root` via the filesystem, falling back to `root`
/// unchanged when canonicalisation fails (for example because the path
/// does not exist).
fn canonicalize(root: &Path) -> PathBuf {
    match std::fs::canonicalize(root) {
        Ok(resolved) => resolved,
        Err(_) => root.to_path_buf(),
    }
}
385
/// Compute `abs` relative to `root`, using `/` separators. Returns `None`
/// if `abs` is not inside `root`. Handles the missing-file case (where
/// `canonicalize` would fail) by canonicalising the longest existing
/// prefix and re-attaching the missing tail — so `remove_file_path` keeps
/// working when the underlying path has just been deleted.
///
/// The result is assembled from the path's components rather than by
/// replacing `\` in the rendered string, so a file name that itself
/// contains a backslash (legal on Unix) is preserved verbatim.
pub(crate) fn relative_path(root: &Path, abs: &Path) -> Option<String> {
    let canonical_abs = canonicalize_existing(abs);
    let stripped = canonical_abs.strip_prefix(root).ok()?;
    let parts: Vec<_> = stripped
        .components()
        .map(|part| part.as_os_str().to_string_lossy())
        .collect();
    Some(parts.join("/"))
}

/// Canonicalise `abs`, tolerating a path whose tail does not exist.
///
/// If the whole path canonicalises, that result is returned. Otherwise
/// the path is walked upward until an existing ancestor is found; that
/// ancestor is canonicalised and the missing trailing names are
/// appended to it in their original order. When no usable ancestor is
/// found (the walk runs out of parents, hits a component without a file
/// name, or the ancestor itself fails to canonicalise) `abs` is
/// returned unchanged.
fn canonicalize_existing(abs: &Path) -> PathBuf {
    if let Ok(resolved) = std::fs::canonicalize(abs) {
        return resolved;
    }
    // Names peeled off the end of the path, innermost first.
    let mut missing: Vec<&std::ffi::OsStr> = Vec::new();
    let mut probe = abs;
    while !probe.exists() {
        let (Some(parent), Some(name)) = (probe.parent(), probe.file_name()) else {
            return abs.to_path_buf();
        };
        // An empty parent means a relative path has been exhausted.
        if parent.as_os_str().is_empty() {
            return abs.to_path_buf();
        }
        missing.push(name);
        probe = parent;
    }
    match std::fs::canonicalize(probe) {
        Ok(base) => missing
            .iter()
            .rev()
            .fold(base, |acc, piece| acc.join(piece)),
        Err(_) => abs.to_path_buf(),
    }
}
426
/// Best-effort lookup of the commit that `HEAD` points at.
///
/// Reads `<workspace_root>/.git/HEAD`; returns `None` if that file
/// cannot be read. A detached `HEAD` (no `ref: ` prefix) is returned as
/// is, trimmed. For a symbolic `HEAD` the target is resolved first from
/// the loose ref file under `.git/`, then from `.git/packed-refs`
/// (loose ref files can be absent once refs have been packed). If
/// neither source knows the ref, the trimmed `HEAD` line itself is
/// returned.
fn read_git_head(workspace_root: &Path) -> Option<String> {
    let git_dir = workspace_root.join(".git");
    let txt = std::fs::read_to_string(git_dir.join("HEAD")).ok()?;
    let line = txt.trim();
    let Some(ref_target) = line.strip_prefix("ref: ") else {
        return Some(line.to_string());
    };

    if let Ok(sha) = std::fs::read_to_string(git_dir.join(ref_target)) {
        return Some(sha.trim().to_string());
    }

    // Packed refs are `<sha> <refname>` lines; `#` starts a header
    // comment and `^` a peeled-tag line, neither of which names a ref.
    if let Ok(packed) = std::fs::read_to_string(git_dir.join("packed-refs")) {
        let hit = packed
            .lines()
            .filter(|entry| !entry.starts_with('#') && !entry.starts_with('^'))
            .filter_map(|entry| entry.split_once(' '))
            .find(|(_, name)| name.trim() == ref_target);
        if let Some((sha, _)) = hit {
            return Some(sha.trim().to_string());
        }
    }

    Some(line.to_string())
}
439
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::tempdir;

    /// Write each `(relative path, contents)` pair under `root`,
    /// creating parent directories as needed.
    fn seed(root: &Path, files: &[(&str, &str)]) {
        for (rel, body) in files {
            let target = root.join(rel);
            fs::create_dir_all(target.parent().unwrap()).unwrap();
            fs::write(&target, body).unwrap();
        }
    }

    #[test]
    fn build_indexes_files_and_resolves_imports() {
        let dir = tempdir().unwrap();
        seed(
            dir.path(),
            &[
                ("src/main.rs", "use crate::util::helper;\nfn main() {}\n"),
                ("src/util.rs", "pub fn helper() {}"),
            ],
        );

        let (state, outcome) = IndexState::build_from_root(dir.path());
        assert_eq!(outcome.files_indexed, 2);
        assert_eq!(state.files.len(), 2);
        assert!(state.path_to_id.contains_key("src/util.rs"));
        let main_id = state.path_to_id["src/main.rs"];
        // Rust uses `noop` resolution, so dep graph is empty.
        assert_eq!(state.deps.imports_of(main_id), Vec::<FileId>::new());
    }

    #[test]
    fn typescript_imports_get_resolved() {
        let dir = tempdir().unwrap();
        seed(
            dir.path(),
            &[
                ("src/index.ts", "import { helper } from \"./util\";\n"),
                ("src/util.ts", "export function helper() {}"),
            ],
        );

        let (state, _) = IndexState::build_from_root(dir.path());
        let importer = state.path_to_id["src/index.ts"];
        let imported = state.path_to_id["src/util.ts"];
        assert_eq!(state.deps.imports_of(importer), vec![imported]);
        assert_eq!(state.deps.importers_of(imported), vec![importer]);
    }

    #[test]
    fn lookup_path_handles_absolute_paths() {
        let dir = tempdir().unwrap();
        seed(dir.path(), &[("a/b/c.py", "x = 1\n")]);

        let (state, _) = IndexState::build_from_root(dir.path());
        let absolute = dir.path().join("a/b/c.py");
        let found = state.lookup_path(absolute.to_str().unwrap()).unwrap();
        assert_eq!(state.path_to_id["a/b/c.py"], found);
    }

    #[test]
    fn reindex_file_picks_up_changes_in_place() {
        let dir = tempdir().unwrap();
        seed(dir.path(), &[("src/a.ts", "export const x = 1;\n")]);
        let target = dir.path().join("src/a.ts");

        let (mut state, _) = IndexState::build_from_root(dir.path());
        let id = state.path_to_id["src/a.ts"];
        let before_hash = state.files[&id].content_hash;

        fs::write(&target, "export const x = 2;\n").unwrap();
        let new_id = state.reindex_file(&target).unwrap();
        assert_eq!(new_id, id, "file id should be stable across reindex");
        let after_hash = state.files[&id].content_hash;
        assert_ne!(before_hash, after_hash);
    }

    #[test]
    fn reindex_file_removes_entry_when_path_disappears() {
        let dir = tempdir().unwrap();
        seed(dir.path(), &[("src/a.ts", "export const x = 1;\n")]);
        let target = dir.path().join("src/a.ts");

        let (mut state, _) = IndexState::build_from_root(dir.path());
        assert!(state.path_to_id.contains_key("src/a.ts"));

        fs::remove_file(&target).unwrap();
        let result = state.reindex_file(&target);
        assert!(result.is_none());
        assert!(!state.path_to_id.contains_key("src/a.ts"));
    }
}
530}