Skip to main content

harn_hostlib/code_index/
state.rs

1//! Per-workspace index state.
2//!
3//! Owns the file table, trigram index, word index, dep graph, version
4//! log, and agent registry for one workspace root. Construction is via
5//! [`IndexState::build_from_root`], which walks the workspace, reads
6//! every indexable file, and populates every sub-index in a single pass
7//! before resolving imports.
8//!
//! Single-file mutations (`reindex_file`, `remove_file_path`) flow through
//! the same paths so the sub-indexes stay consistent across the
//! incremental updates that host ops drive.
12
13use std::collections::HashMap;
14use std::path::{Path, PathBuf};
15use std::time::{SystemTime, UNIX_EPOCH};
16
17use super::agents::AgentRegistry;
18use super::file_table::{fnv1a64, FileId, IndexedFile};
19use super::graph::DepGraph;
20use super::imports;
21use super::trigram::TrigramIndex;
22use super::versions::VersionLog;
23use super::walker::{is_indexable_file, language_for_extension, walk_indexable, MAX_FILE_BYTES};
24use super::words::WordIndex;
25
/// In-memory index for one workspace. Composed from the per-file table,
/// the trigram + word sub-indexes, the dep graph, the append-only version
/// log, and the agent registry.
///
/// `files` and `path_to_id` are two views of the same set of tracked
/// files: `ingest` adds to both and `remove_file_path` removes from both.
pub struct IndexState {
    /// Canonicalised workspace root.
    pub root: PathBuf,
    /// File table keyed on stable id.
    pub files: HashMap<FileId, IndexedFile>,
    /// Workspace-relative path → stable id.
    pub path_to_id: HashMap<String, FileId>,
    /// Trigram posting list.
    pub trigrams: TrigramIndex,
    /// Identifier-token inverted index.
    pub words: WordIndex,
    /// Forward + reverse import graph.
    pub deps: DepGraph,
    /// Append-only log of file mutations.
    pub versions: VersionLog,
    /// Live agents + advisory locks.
    pub agents: AgentRegistry,
    /// Wall-clock timestamp (ms since epoch) of the most recent rebuild.
    pub last_built_unix_ms: i64,
    /// Best-effort `HEAD` SHA, or `None` if the workspace isn't a git repo.
    pub git_head: Option<String>,
    /// Id that `ingest` assigns to the next path it has not seen before.
    /// Starts at 1 and is bumped by one per new path; removing a file does
    /// not wind it back.
    next_id: FileId,
}
52
/// Summary returned from `IndexState::build_from_root`.
///
/// Every path the walker yields is counted in exactly one of the two
/// fields.
#[derive(Debug, Default)]
pub struct BuildOutcome {
    /// Files that passed every filter and were ingested.
    pub files_indexed: u64,
    /// Files that matched the filename filter but couldn't be read or
    /// were too large. More precisely: every walked path for which
    /// ingestion returned `None`.
    pub files_skipped: u64,
}
62
63impl IndexState {
64    /// Build a fresh index over `root`. Returns the populated state plus a
65    /// summary of how many files were indexed vs skipped.
66    pub fn build_from_root(root: &Path) -> (Self, BuildOutcome) {
67        let canonical_root = canonicalize(root);
68        let mut state = IndexState {
69            root: canonical_root.clone(),
70            files: HashMap::new(),
71            path_to_id: HashMap::new(),
72            trigrams: TrigramIndex::new(),
73            words: WordIndex::new(),
74            deps: DepGraph::new(),
75            versions: VersionLog::new(),
76            agents: AgentRegistry::new(),
77            last_built_unix_ms: now_unix_ms(),
78            git_head: read_git_head(&canonical_root),
79            next_id: 1,
80        };
81        let mut outcome = BuildOutcome::default();
82        let mut to_resolve: Vec<(FileId, String)> = Vec::new();
83        walk_indexable(&canonical_root, |abs| match state.ingest(abs) {
84            Some(file_id) => {
85                outcome.files_indexed += 1;
86                if let Some(file) = state.files.get(&file_id) {
87                    to_resolve.push((file_id, file.relative_path.clone()));
88                }
89            }
90            None => {
91                outcome.files_skipped += 1;
92            }
93        });
94        for (id, rel) in to_resolve {
95            state.rebuild_deps(id, &rel);
96        }
97        (state, outcome)
98    }
99
100    /// Re-index a single file by its absolute path. Returns the id of the
101    /// affected file (newly assigned or existing). If the file no longer
102    /// exists or fails the indexability/sensitivity filter, any existing
103    /// entry under that path is removed and `None` is returned.
104    pub fn reindex_file(&mut self, abs: &Path) -> Option<FileId> {
105        if !abs.exists() {
106            self.remove_file_path(abs);
107            return None;
108        }
109        if !is_indexable_file(abs) || super::walker::is_sensitive_path(abs) {
110            self.remove_file_path(abs);
111            return None;
112        }
113        let id = self.ingest(abs)?;
114        let rel = self
115            .files
116            .get(&id)
117            .map(|f| f.relative_path.clone())
118            .unwrap_or_default();
119        if !rel.is_empty() {
120            self.rebuild_deps(id, &rel);
121        }
122        Some(id)
123    }
124
125    /// Remove an existing file from every sub-index. No-op when the file
126    /// isn't tracked.
127    pub fn remove_file_path(&mut self, abs: &Path) {
128        let Some(rel) = relative_path(&self.root, abs) else {
129            return;
130        };
131        let Some(id) = self.path_to_id.remove(&rel) else {
132            return;
133        };
134        self.files.remove(&id);
135        self.trigrams.remove_file(id);
136        self.words.remove_file(id);
137        self.deps.remove_file(id);
138    }
139
140    fn ingest(&mut self, abs: &Path) -> Option<FileId> {
141        if !is_indexable_file(abs) {
142            return None;
143        }
144        let metadata = std::fs::metadata(abs).ok()?;
145        if metadata.len() > MAX_FILE_BYTES {
146            return None;
147        }
148        let content = std::fs::read_to_string(abs).ok()?;
149        if content.len() > MAX_FILE_BYTES as usize {
150            return None;
151        }
152        let rel = relative_path(&self.root, abs)?;
153        let hash = fnv1a64(content.as_bytes());
154        let id = match self.path_to_id.get(&rel) {
155            Some(existing_id) => {
156                if let Some(file) = self.files.get(existing_id) {
157                    if file.content_hash == hash {
158                        return Some(*existing_id);
159                    }
160                }
161                *existing_id
162            }
163            None => {
164                let id = self.next_id;
165                self.next_id = self.next_id.checked_add(1).expect("FileId overflow");
166                self.path_to_id.insert(rel.clone(), id);
167                id
168            }
169        };
170
171        let ext = abs
172            .extension()
173            .and_then(|s| s.to_str())
174            .unwrap_or("")
175            .to_ascii_lowercase();
176        let language = language_for_extension(&ext).to_string();
177        let imports = imports::extract_imports(&content, &language);
178        let mtime_ms = metadata
179            .modified()
180            .ok()
181            .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
182            .map(|d| d.as_millis() as i64)
183            .unwrap_or(0);
184        let line_count = if content.is_empty() {
185            0
186        } else {
187            content.split('\n').count() as u32
188        };
189
190        let file = IndexedFile {
191            id,
192            relative_path: rel,
193            language,
194            size_bytes: content.len() as u64,
195            line_count,
196            content_hash: hash,
197            mtime_ms,
198            symbols: Vec::new(),
199            imports,
200        };
201        self.trigrams.index_file(id, &content);
202        self.words.index_file(id, &content);
203        self.files.insert(id, file);
204        Some(id)
205    }
206
207    fn rebuild_deps(&mut self, id: FileId, relative_path: &str) {
208        let Some(file) = self.files.get(&id).cloned() else {
209            return;
210        };
211        let resolved = imports::resolve(
212            &file.imports,
213            relative_path,
214            &file.language,
215            &self.path_to_id,
216        );
217        self.deps
218            .set_edges(id, resolved.resolved, resolved.unresolved);
219    }
220
221    /// Look up a file by either its workspace-relative path or its
222    /// absolute path inside the workspace root.
223    pub fn lookup_path(&self, raw: &str) -> Option<FileId> {
224        if let Some(id) = self.path_to_id.get(raw) {
225            return Some(*id);
226        }
227        let path = Path::new(raw);
228        if path.is_absolute() {
229            if let Some(rel) = relative_path(&self.root, path) {
230                if let Some(id) = self.path_to_id.get(&rel) {
231                    return Some(*id);
232                }
233            }
234        }
235        None
236    }
237
238    /// Estimate the resident memory footprint of every sub-index. Cheap
239    /// order-of-magnitude figure surfaced by the `stats` builtin.
240    pub fn estimated_bytes(&self) -> usize {
241        let file_bytes: usize = self
242            .files
243            .values()
244            .map(|f| f.relative_path.len() + f.imports.iter().map(|s| s.len()).sum::<usize>() + 64)
245            .sum();
246        self.trigrams.estimated_bytes() + self.words.estimated_bytes() + file_bytes
247    }
248
249    /// Resolve a workspace-relative path against the canonical root.
250    /// Used by host builtins that take a `path` argument and need to
251    /// open the underlying file (e.g. `read_range`, `file_hash`).
252    pub fn absolute_path(&self, rel_or_abs: &str) -> Option<PathBuf> {
253        let p = Path::new(rel_or_abs);
254        let candidate = if p.is_absolute() {
255            p.to_path_buf()
256        } else {
257            self.root.join(p)
258        };
259        let canonical = canonicalize_existing(&candidate);
260        if canonical.strip_prefix(&self.root).is_ok() {
261            Some(canonical)
262        } else {
263            None
264        }
265    }
266
267    /// Construct an empty [`IndexState`] anchored at `root`. Used by the
268    /// snapshot path which fills in the sub-indexes itself.
269    pub(crate) fn empty(root: PathBuf) -> Self {
270        Self {
271            root,
272            files: HashMap::new(),
273            path_to_id: HashMap::new(),
274            trigrams: TrigramIndex::new(),
275            words: WordIndex::new(),
276            deps: DepGraph::new(),
277            versions: VersionLog::new(),
278            agents: AgentRegistry::new(),
279            last_built_unix_ms: 0,
280            git_head: None,
281            next_id: 1,
282        }
283    }
284
    /// Return the current value of the `next_id` counter (by value, not as
    /// a borrow) — exposed for snapshot serialisation.
    pub(crate) fn next_file_id_internal(&self) -> FileId {
        self.next_id
    }
289
290    /// Restore the `next_id` counter from a serialised snapshot.
291    pub(crate) fn set_next_file_id(&mut self, id: FileId) {
292        self.next_id = id.max(1);
293    }
294}
295
/// Current wall-clock time as milliseconds since the Unix epoch.
///
/// Yields 0 if the system clock reports a time before the epoch.
pub(crate) fn now_unix_ms() -> i64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_millis() as i64,
        Err(_) => 0,
    }
}
304
/// Canonicalise `root`, falling back to the path exactly as given when the
/// filesystem cannot resolve it.
fn canonicalize(root: &Path) -> PathBuf {
    match std::fs::canonicalize(root) {
        Ok(resolved) => resolved,
        Err(_) => root.to_path_buf(),
    }
}
308
309/// Compute `abs` relative to `root`, using `/` separators. Returns `None`
310/// if `abs` is not inside `root`. Handles the missing-file case (where
311/// `canonicalize` would fail) by canonicalising the longest existing
312/// prefix and re-attaching the missing tail — so `remove_file_path` keeps
313/// working when the underlying path has just been deleted.
314pub(crate) fn relative_path(root: &Path, abs: &Path) -> Option<String> {
315    let canonical_abs = canonicalize_existing(abs);
316    let stripped = canonical_abs.strip_prefix(root).ok()?;
317    Some(stripped.to_string_lossy().replace('\\', "/"))
318}
319
/// Canonicalise `abs` even when its final components do not exist.
///
/// If the whole path resolves, that result is returned. Otherwise the path
/// is walked upward to the nearest ancestor that exists; that ancestor is
/// canonicalised and the missing components are appended back in their
/// original order. If no usable ancestor is found, `abs` is returned
/// unchanged.
fn canonicalize_existing(abs: &Path) -> PathBuf {
    if let Ok(resolved) = std::fs::canonicalize(abs) {
        return resolved;
    }

    // Peel trailing components off until something on disk is reached,
    // remembering them innermost-first.
    let mut missing: Vec<&std::ffi::OsStr> = Vec::new();
    let mut cursor = abs;
    while !cursor.exists() {
        let (Some(parent), Some(name)) = (cursor.parent(), cursor.file_name()) else {
            return abs.to_path_buf();
        };
        if parent.as_os_str().is_empty() {
            return abs.to_path_buf();
        }
        missing.push(name);
        cursor = parent;
    }

    match std::fs::canonicalize(cursor) {
        Ok(mut rebuilt) => {
            for piece in missing.into_iter().rev() {
                rebuilt.push(piece);
            }
            rebuilt
        }
        Err(_) => abs.to_path_buf(),
    }
}
349
/// Best-effort read of the workspace's git `HEAD`.
///
/// Returns `None` when `<root>/.git/HEAD` cannot be read. When `HEAD` is a
/// symbolic ref (`ref: <target>`) and `<root>/.git/<target>` is readable,
/// the trimmed contents of that file are returned. In every other case the
/// trimmed `HEAD` line itself is returned.
fn read_git_head(workspace_root: &Path) -> Option<String> {
    let git_dir = workspace_root.join(".git");
    let raw = std::fs::read_to_string(git_dir.join("HEAD")).ok()?;
    let head = raw.trim();
    let via_ref = head
        .strip_prefix("ref: ")
        .and_then(|target| std::fs::read_to_string(git_dir.join(target)).ok())
        .map(|sha| sha.trim().to_string());
    Some(via_ref.unwrap_or_else(|| head.to_string()))
}
362
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::tempdir;

    /// Write `contents` to `rel` beneath `root`, creating any missing
    /// parent directories first.
    fn put(root: &Path, rel: &str, contents: &str) {
        let target = root.join(rel);
        if let Some(parent) = target.parent() {
            fs::create_dir_all(parent).unwrap();
        }
        fs::write(target, contents).unwrap();
    }

    #[test]
    fn build_indexes_files_and_resolves_imports() {
        let workspace = tempdir().unwrap();
        put(
            workspace.path(),
            "src/main.rs",
            "use crate::util::helper;\nfn main() {}\n",
        );
        put(workspace.path(), "src/util.rs", "pub fn helper() {}");

        let (state, outcome) = IndexState::build_from_root(workspace.path());
        assert_eq!(outcome.files_indexed, 2);
        assert_eq!(state.files.len(), 2);
        let main_id = state.path_to_id["src/main.rs"];
        // Indexing panics if util.rs was never registered.
        let _util_id = state.path_to_id["src/util.rs"];
        // Rust uses `noop` resolution, so dep graph is empty.
        assert_eq!(state.deps.imports_of(main_id), Vec::<FileId>::new());
    }

    #[test]
    fn typescript_imports_get_resolved() {
        let workspace = tempdir().unwrap();
        put(
            workspace.path(),
            "src/index.ts",
            "import { helper } from \"./util\";\n",
        );
        put(workspace.path(), "src/util.ts", "export function helper() {}");

        let (state, _) = IndexState::build_from_root(workspace.path());
        let index_id = state.path_to_id["src/index.ts"];
        let util_id = state.path_to_id["src/util.ts"];
        assert_eq!(state.deps.imports_of(index_id), vec![util_id]);
        assert_eq!(state.deps.importers_of(util_id), vec![index_id]);
    }

    #[test]
    fn lookup_path_handles_absolute_paths() {
        let workspace = tempdir().unwrap();
        put(workspace.path(), "a/b/c.py", "x = 1\n");

        let (state, _) = IndexState::build_from_root(workspace.path());
        let absolute = workspace.path().join("a/b/c.py");
        let found = state.lookup_path(absolute.to_str().unwrap()).unwrap();
        assert_eq!(state.path_to_id["a/b/c.py"], found);
    }

    #[test]
    fn reindex_file_picks_up_changes_in_place() {
        let workspace = tempdir().unwrap();
        put(workspace.path(), "src/a.ts", "export const x = 1;\n");
        let (mut state, _) = IndexState::build_from_root(workspace.path());
        let id = state.path_to_id["src/a.ts"];
        let before_hash = state.files[&id].content_hash;

        put(workspace.path(), "src/a.ts", "export const x = 2;\n");
        let new_id = state
            .reindex_file(&workspace.path().join("src/a.ts"))
            .unwrap();
        assert_eq!(new_id, id, "file id should be stable across reindex");
        let after_hash = state.files[&id].content_hash;
        assert_ne!(before_hash, after_hash);
    }

    #[test]
    fn reindex_file_removes_entry_when_path_disappears() {
        let workspace = tempdir().unwrap();
        put(workspace.path(), "src/a.ts", "export const x = 1;\n");
        let (mut state, _) = IndexState::build_from_root(workspace.path());
        assert!(state.path_to_id.contains_key("src/a.ts"));

        let gone = workspace.path().join("src/a.ts");
        fs::remove_file(&gone).unwrap();
        let result = state.reindex_file(&gone);
        assert!(result.is_none());
        assert!(!state.path_to_id.contains_key("src/a.ts"));
    }
}
453}