Skip to main content

harn_hostlib/code_index/
state.rs

1//! Per-workspace index state.
2//!
3//! Owns the file table, trigram index, word index, dep graph, version
4//! log, and agent registry for one workspace root. Construction is via
5//! [`IndexState::build_from_root`], which walks the workspace, reads
6//! every indexable file, and populates every sub-index in a single pass
7//! before resolving imports.
8//!
//! Single-file mutations (`reindex_file`, `remove_file_path`) flow through
//! the same paths so the sub-indexes stay consistent across the
//! incremental updates that host ops drive.
12
13use std::collections::HashMap;
14use std::path::{Path, PathBuf};
15use std::time::{SystemTime, UNIX_EPOCH};
16
17use super::agents::AgentRegistry;
18use super::file_table::{fnv1a64, FileId, IndexedFile, IndexedSymbol};
19use super::graph::DepGraph;
20use super::imports;
21use super::overlay::OverlayState;
22use super::symbol_graph::SymbolGraph;
23use super::trigram::TrigramIndex;
24use super::versions::VersionLog;
25use super::walker::{is_indexable_file, language_for_extension, walk_indexable, MAX_FILE_BYTES};
26use super::words::WordIndex;
27
28use crate::ast::{Language as AstLanguage, Symbol as AstSymbol};
29
30/// In-memory index for one workspace. Composed from the per-file table,
31/// the trigram + word sub-indexes, the dep graph, the append-only version
32/// log, and the agent registry.
33pub struct IndexState {
34    /// Canonicalised workspace root.
35    pub root: PathBuf,
36    /// File table keyed on stable id.
37    pub files: HashMap<FileId, IndexedFile>,
38    /// Workspace-relative path → stable id.
39    pub path_to_id: HashMap<String, FileId>,
40    /// Trigram posting list.
41    pub trigrams: TrigramIndex,
42    /// Identifier-token inverted index.
43    pub words: WordIndex,
44    /// Forward + reverse import graph.
45    pub deps: DepGraph,
46    /// Append-only log of file mutations.
47    pub versions: VersionLog,
48    /// Live agents + advisory locks.
49    pub agents: AgentRegistry,
50    /// Typed symbol graph (issue #2434). Populated lazily on rebuild.
51    pub symbols: SymbolGraph,
52    /// Per-branch overlay registry (issue #2434).
53    pub overlays: OverlayState,
54    /// Wall-clock timestamp (ms since epoch) of the most recent rebuild.
55    pub last_built_unix_ms: i64,
56    /// Best-effort `HEAD` SHA, or `None` if the workspace isn't a git repo.
57    pub git_head: Option<String>,
58    next_id: FileId,
59}
60
/// Summary returned from `IndexState::build_from_root`.
///
/// A plain pair of counters, so it is cheap to clone and can be compared
/// directly (for example in tests or when diffing two rebuilds).
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct BuildOutcome {
    /// Files that passed every filter and were ingested.
    pub files_indexed: u64,
    /// Files that matched the filename filter but couldn't be read or
    /// were too large.
    pub files_skipped: u64,
}
70
71impl IndexState {
72    /// Build a fresh index over `root`. Returns the populated state plus a
73    /// summary of how many files were indexed vs skipped.
74    pub fn build_from_root(root: &Path) -> (Self, BuildOutcome) {
75        let canonical_root = canonicalize(root);
76        let mut state = IndexState {
77            root: canonical_root.clone(),
78            files: HashMap::new(),
79            path_to_id: HashMap::new(),
80            trigrams: TrigramIndex::new(),
81            words: WordIndex::new(),
82            deps: DepGraph::new(),
83            versions: VersionLog::new(),
84            agents: AgentRegistry::new(),
85            symbols: SymbolGraph::new(),
86            overlays: OverlayState::new(),
87            last_built_unix_ms: now_unix_ms(),
88            git_head: read_git_head(&canonical_root),
89            next_id: 1,
90        };
91        let mut outcome = BuildOutcome::default();
92        let mut to_resolve: Vec<(FileId, String)> = Vec::new();
93        walk_indexable(&canonical_root, |abs| match state.ingest(abs) {
94            Some(file_id) => {
95                outcome.files_indexed += 1;
96                if let Some(file) = state.files.get(&file_id) {
97                    to_resolve.push((file_id, file.relative_path.clone()));
98                }
99            }
100            None => {
101                outcome.files_skipped += 1;
102            }
103        });
104        for (id, rel) in to_resolve {
105            state.rebuild_deps(id, &rel);
106            state.rebuild_symbol_graph_for(id);
107        }
108        // Second pass: every Module node exists now, so resolve IMPORTS.
109        state.link_symbol_imports();
110        (state, outcome)
111    }
112
113    /// Re-index a single file by its absolute path. Returns the id of the
114    /// affected file (newly assigned or existing). If the file no longer
115    /// exists or fails the indexability/sensitivity filter, any existing
116    /// entry under that path is removed and `None` is returned.
117    pub fn reindex_file(&mut self, abs: &Path) -> Option<FileId> {
118        if !abs.exists() {
119            self.remove_file_path(abs);
120            return None;
121        }
122        if !is_indexable_file(abs) || super::walker::is_sensitive_path(abs) {
123            self.remove_file_path(abs);
124            return None;
125        }
126        let id = self.ingest(abs)?;
127        let rel = self
128            .files
129            .get(&id)
130            .map(|f| f.relative_path.clone())
131            .unwrap_or_default();
132        if !rel.is_empty() {
133            self.rebuild_deps(id, &rel);
134            self.rebuild_symbol_graph_for(id);
135            self.link_symbol_imports();
136        }
137        Some(id)
138    }
139
140    /// Remove an existing file from every sub-index. No-op when the file
141    /// isn't tracked.
142    pub fn remove_file_path(&mut self, abs: &Path) {
143        let Some(rel) = relative_path(&self.root, abs) else {
144            return;
145        };
146        let Some(id) = self.path_to_id.remove(&rel) else {
147            return;
148        };
149        self.files.remove(&id);
150        self.trigrams.remove_file(id);
151        self.words.remove_file(id);
152        self.deps.remove_file(id);
153        self.symbols.remove_file(id);
154    }
155
156    fn ingest(&mut self, abs: &Path) -> Option<FileId> {
157        if !is_indexable_file(abs) {
158            return None;
159        }
160        let metadata = std::fs::metadata(abs).ok()?;
161        if metadata.len() > MAX_FILE_BYTES {
162            return None;
163        }
164        let content = std::fs::read_to_string(abs).ok()?;
165        if content.len() > MAX_FILE_BYTES as usize {
166            return None;
167        }
168        let rel = relative_path(&self.root, abs)?;
169        let hash = fnv1a64(content.as_bytes());
170        let id = match self.path_to_id.get(&rel) {
171            Some(existing_id) => {
172                if let Some(file) = self.files.get(existing_id) {
173                    if file.content_hash == hash {
174                        return Some(*existing_id);
175                    }
176                }
177                *existing_id
178            }
179            None => {
180                let id = self.next_id;
181                self.next_id = self.next_id.checked_add(1).expect("FileId overflow");
182                self.path_to_id.insert(rel.clone(), id);
183                id
184            }
185        };
186
187        let ext = abs
188            .extension()
189            .and_then(|s| s.to_str())
190            .unwrap_or("")
191            .to_ascii_lowercase();
192        let language = language_for_extension(&ext).to_string();
193        let imports = imports::extract_imports(&content, &language);
194        let mtime_ms = metadata
195            .modified()
196            .ok()
197            .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
198            .map(|d| d.as_millis() as i64)
199            .unwrap_or(0);
200        let line_count = crate::text::count_lines(content.as_bytes()) as u32;
201
202        let file = IndexedFile {
203            id,
204            relative_path: rel,
205            language,
206            size_bytes: content.len() as u64,
207            line_count,
208            content_hash: hash,
209            mtime_ms,
210            symbols: Vec::new(),
211            imports,
212        };
213        self.trigrams.index_file(id, &content);
214        self.words.index_file(id, &content);
215        self.files.insert(id, file);
216        Some(id)
217    }
218
219    fn rebuild_deps(&mut self, id: FileId, relative_path: &str) {
220        let Some(file) = self.files.get(&id).cloned() else {
221            return;
222        };
223        let resolved = imports::resolve(
224            &file.imports,
225            relative_path,
226            &file.language,
227            &self.path_to_id,
228        );
229        self.deps
230            .set_edges(id, resolved.resolved, resolved.unresolved);
231    }
232
233    /// Re-parse `id`'s source and replace its slice of the typed symbol
234    /// graph in [`Self::symbols`]. Cheap to call after a single-file
235    /// reindex; the full-rebuild loop calls this once per file. Files
236    /// with no recognised tree-sitter grammar (the index also handles
237    /// `.md`, `.json`, …) are skipped silently — `IndexedFile::symbols`
238    /// stays empty for those files. For grammar-recognised files the
239    /// same parse populates `IndexedFile::symbols` so the
240    /// `outline_get` builtin doesn't have to re-parse on every call
241    /// (issue #2456).
242    pub(super) fn rebuild_symbol_graph_for(&mut self, id: FileId) {
243        let Some(file) = self.files.get(&id).cloned() else {
244            return;
245        };
246        let abs = self.root.join(&file.relative_path);
247        let Ok(source) = std::fs::read_to_string(&abs) else {
248            return;
249        };
250        let Some(language) = AstLanguage::detect(std::path::Path::new(&file.relative_path), None)
251        else {
252            return;
253        };
254        let outcome =
255            self.symbols
256                .rebuild_file(id, &file.relative_path, language, &source, &file.imports);
257        if let Some(file_mut) = self.files.get_mut(&id) {
258            file_mut.symbols = outcome
259                .symbols
260                .iter()
261                .map(indexed_symbol_from_ast)
262                .collect();
263        }
264    }
265
266    /// Walk every file's import-resolution table and add the
267    /// corresponding Module→Module IMPORTS edges in the typed graph.
268    /// Idempotent; called once at end-of-rebuild and after every
269    /// per-file reindex.
270    pub(super) fn link_symbol_imports(&mut self) {
271        let mut resolved: HashMap<FileId, Vec<FileId>> = HashMap::new();
272        for id in self.files.keys() {
273            resolved.insert(*id, self.deps.imports_of(*id));
274        }
275        self.symbols.link_imports(&resolved);
276    }
277
278    /// Look up a file by either its workspace-relative path or its
279    /// absolute path inside the workspace root.
280    pub fn lookup_path(&self, raw: &str) -> Option<FileId> {
281        if let Some(id) = self.path_to_id.get(raw) {
282            return Some(*id);
283        }
284        let path = Path::new(raw);
285        if path.is_absolute() {
286            if let Some(rel) = relative_path(&self.root, path) {
287                if let Some(id) = self.path_to_id.get(&rel) {
288                    return Some(*id);
289                }
290            }
291        }
292        None
293    }
294
295    /// Estimate the resident memory footprint of every sub-index. Cheap
296    /// order-of-magnitude figure surfaced by the `stats` builtin.
297    pub fn estimated_bytes(&self) -> usize {
298        let file_bytes: usize = self
299            .files
300            .values()
301            .map(|f| f.relative_path.len() + f.imports.iter().map(|s| s.len()).sum::<usize>() + 64)
302            .sum();
303        self.trigrams.estimated_bytes() + self.words.estimated_bytes() + file_bytes
304    }
305
306    /// Resolve a workspace-relative path against the canonical root.
307    /// Used by host builtins that take a `path` argument and need to
308    /// open the underlying file (e.g. `read_range`, `file_hash`).
309    pub fn absolute_path(&self, rel_or_abs: &str) -> Option<PathBuf> {
310        let p = Path::new(rel_or_abs);
311        let candidate = if p.is_absolute() {
312            p.to_path_buf()
313        } else {
314            self.root.join(p)
315        };
316        let canonical = canonicalize_existing(&candidate);
317        if canonical.strip_prefix(&self.root).is_ok() {
318            Some(canonical)
319        } else {
320            None
321        }
322    }
323
324    /// Construct an empty [`IndexState`] anchored at `root`. Used by the
325    /// snapshot path which fills in the sub-indexes itself.
326    pub(crate) fn empty(root: PathBuf) -> Self {
327        Self {
328            root,
329            files: HashMap::new(),
330            path_to_id: HashMap::new(),
331            trigrams: TrigramIndex::new(),
332            words: WordIndex::new(),
333            deps: DepGraph::new(),
334            versions: VersionLog::new(),
335            agents: AgentRegistry::new(),
336            symbols: SymbolGraph::new(),
337            overlays: OverlayState::new(),
338            last_built_unix_ms: 0,
339            git_head: None,
340            next_id: 1,
341        }
342    }
343
344    /// Borrow the `next_id` counter — exposed for snapshot serialisation.
345    pub(crate) fn next_file_id_internal(&self) -> FileId {
346        self.next_id
347    }
348
349    /// Restore the `next_id` counter from a serialised snapshot.
350    pub(crate) fn set_next_file_id(&mut self, id: FileId) {
351        self.next_id = id.max(1);
352    }
353}
354
355/// Map an AST-level [`AstSymbol`] (0-based tree-sitter coordinates) into
356/// the flat [`IndexedSymbol`] (1-based outline coordinates) that the
357/// `outline_get` builtin returns. Pure, used by
358/// [`IndexState::rebuild_symbol_graph_for`].
359fn indexed_symbol_from_ast(sym: &AstSymbol) -> IndexedSymbol {
360    IndexedSymbol {
361        name: sym.name.clone(),
362        kind: sym.kind.as_str().to_string(),
363        start_line: sym.start_row.saturating_add(1),
364        end_line: sym.end_row.saturating_add(1),
365        signature: sym.signature.clone(),
366    }
367}
368
/// Current wall-clock time as milliseconds since the Unix epoch.
///
/// Yields `0` when the system clock reports a time before the epoch.
pub(crate) fn now_unix_ms() -> i64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_millis() as i64,
        Err(_) => 0,
    }
}
377
/// Canonicalise `root`, falling back to the path exactly as given when
/// canonicalisation fails (for example because it does not exist).
fn canonicalize(root: &Path) -> PathBuf {
    match std::fs::canonicalize(root) {
        Ok(resolved) => resolved,
        Err(_) => root.to_path_buf(),
    }
}
381
382/// Compute `abs` relative to `root`, using `/` separators. Returns `None`
383/// if `abs` is not inside `root`. Handles the missing-file case (where
384/// `canonicalize` would fail) by canonicalising the longest existing
385/// prefix and re-attaching the missing tail — so `remove_file_path` keeps
386/// working when the underlying path has just been deleted.
387pub(crate) fn relative_path(root: &Path, abs: &Path) -> Option<String> {
388    let canonical_abs = canonicalize_existing(abs);
389    let stripped = canonical_abs.strip_prefix(root).ok()?;
390    Some(stripped.to_string_lossy().replace('\\', "/"))
391}
392
/// Canonicalise `abs` even when its trailing components do not exist.
///
/// If the whole path canonicalises, that result is returned. Otherwise
/// the path is walked upward, remembering each missing file name, until
/// an existing ancestor is found; that ancestor is canonicalised and the
/// remembered names are re-attached in their original order. When no
/// usable ancestor is found (the parent is empty or absent, the path has
/// no final file name, or the ancestor itself fails to canonicalise),
/// `abs` is returned unchanged.
fn canonicalize_existing(abs: &Path) -> PathBuf {
    if let Ok(resolved) = std::fs::canonicalize(abs) {
        return resolved;
    }
    // Names of the missing components, innermost first.
    let mut missing: Vec<&std::ffi::OsStr> = Vec::new();
    let mut probe = abs;
    while !probe.exists() {
        let (Some(parent), Some(name)) = (probe.parent(), probe.file_name()) else {
            return abs.to_path_buf();
        };
        if parent.as_os_str().is_empty() {
            return abs.to_path_buf();
        }
        missing.push(name);
        probe = parent;
    }
    match std::fs::canonicalize(probe) {
        Ok(base) => missing
            .iter()
            .rev()
            .fold(base, |acc, piece| acc.join(piece)),
        Err(_) => abs.to_path_buf(),
    }
}
422
/// Best-effort lookup of the commit that `HEAD` points at.
///
/// Reads `<workspace_root>/.git/HEAD` and returns `None` when it cannot
/// be read (e.g. the workspace isn't a git repo). A detached `HEAD` is
/// returned as-is. For a symbolic `HEAD` (`ref: <name>`), the loose ref
/// file `.git/<name>` is tried first, then `.git/packed-refs` — refs
/// live only there after `git pack-refs`/`git gc`. If neither resolves
/// the ref, the trimmed `HEAD` line itself is returned.
fn read_git_head(workspace_root: &Path) -> Option<String> {
    let git_dir = workspace_root.join(".git");
    let head = std::fs::read_to_string(git_dir.join("HEAD")).ok()?;
    let line = head.trim();
    let Some(ref_target) = line.strip_prefix("ref: ") else {
        return Some(line.to_string());
    };
    if let Ok(sha) = std::fs::read_to_string(git_dir.join(ref_target)) {
        return Some(sha.trim().to_string());
    }
    if let Some(sha) = packed_ref_sha(&git_dir, ref_target) {
        return Some(sha);
    }
    Some(line.to_string())
}

/// Find `ref_name` in `<git_dir>/packed-refs` and return its SHA.
///
/// Entries have the form `<sha> <refname>`. The `#` header line and
/// peeled-tag lines (`^<sha>`) never match because their second field
/// is not a ref name or they contain no space at all.
fn packed_ref_sha(git_dir: &Path, ref_name: &str) -> Option<String> {
    let packed = std::fs::read_to_string(git_dir.join("packed-refs")).ok()?;
    packed.lines().find_map(|entry| {
        let (sha, name) = entry.split_once(' ')?;
        (name.trim() == ref_name).then(|| sha.to_string())
    })
}
435
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::{tempdir, TempDir};

    /// Create a temporary workspace holding `files`, given as
    /// `(workspace-relative path, contents)` pairs. Parent directories
    /// are created as needed.
    fn workspace_with(files: &[(&str, &str)]) -> TempDir {
        let dir = tempdir().unwrap();
        for (rel, body) in files {
            let target = dir.path().join(rel);
            if let Some(parent) = target.parent() {
                fs::create_dir_all(parent).unwrap();
            }
            fs::write(&target, body).unwrap();
        }
        dir
    }

    #[test]
    fn build_indexes_files_and_resolves_imports() {
        let ws = workspace_with(&[
            ("src/main.rs", "use crate::util::helper;\nfn main() {}\n"),
            ("src/util.rs", "pub fn helper() {}"),
        ]);

        let (state, outcome) = IndexState::build_from_root(ws.path());
        assert_eq!(outcome.files_indexed, 2);
        assert_eq!(state.files.len(), 2);
        let main_id = state.path_to_id["src/main.rs"];
        // Indexing panics if the path is missing, so this doubles as a
        // presence check for the second file.
        let _util_id = state.path_to_id["src/util.rs"];
        // Rust uses `noop` resolution, so dep graph is empty.
        assert_eq!(state.deps.imports_of(main_id), Vec::<FileId>::new());
    }

    #[test]
    fn typescript_imports_get_resolved() {
        let ws = workspace_with(&[
            ("src/index.ts", "import { helper } from \"./util\";\n"),
            ("src/util.ts", "export function helper() {}"),
        ]);

        let (state, _) = IndexState::build_from_root(ws.path());
        let importer = state.path_to_id["src/index.ts"];
        let imported = state.path_to_id["src/util.ts"];
        assert_eq!(state.deps.imports_of(importer), vec![imported]);
        assert_eq!(state.deps.importers_of(imported), vec![importer]);
    }

    #[test]
    fn lookup_path_handles_absolute_paths() {
        let ws = workspace_with(&[("a/b/c.py", "x = 1\n")]);
        let (state, _) = IndexState::build_from_root(ws.path());

        let absolute = ws.path().join("a/b/c.py");
        let found = state.lookup_path(absolute.to_str().unwrap()).unwrap();
        assert_eq!(state.path_to_id["a/b/c.py"], found);
    }

    #[test]
    fn reindex_file_picks_up_changes_in_place() {
        let ws = workspace_with(&[("src/a.ts", "export const x = 1;\n")]);
        let target = ws.path().join("src/a.ts");
        let (mut state, _) = IndexState::build_from_root(ws.path());
        let id = state.path_to_id["src/a.ts"];
        let before_hash = state.files[&id].content_hash;

        fs::write(&target, "export const x = 2;\n").unwrap();
        let new_id = state.reindex_file(&target).unwrap();
        assert_eq!(new_id, id, "file id should be stable across reindex");
        let after_hash = state.files[&id].content_hash;
        assert_ne!(before_hash, after_hash);
    }

    #[test]
    fn reindex_file_removes_entry_when_path_disappears() {
        let ws = workspace_with(&[("src/a.ts", "export const x = 1;\n")]);
        let target = ws.path().join("src/a.ts");
        let (mut state, _) = IndexState::build_from_root(ws.path());
        assert!(state.path_to_id.contains_key("src/a.ts"));

        fs::remove_file(&target).unwrap();
        let result = state.reindex_file(&target);
        assert!(result.is_none());
        assert!(!state.path_to_id.contains_key("src/a.ts"));
    }
}