Skip to main content

harn_hostlib/code_index/
state.rs

1//! Per-workspace index state.
2//!
3//! Owns the file table, trigram index, word index, and dep graph for one
4//! workspace root. Construction is via [`IndexState::build_from_root`],
5//! which walks the workspace, reads every indexable file, and populates
6//! every sub-index in a single pass before resolving imports.
7
8use std::collections::HashMap;
9use std::path::{Path, PathBuf};
10use std::time::{SystemTime, UNIX_EPOCH};
11
12use super::file_table::{fnv1a64, FileId, IndexedFile};
13use super::graph::DepGraph;
14use super::imports;
15use super::trigram::TrigramIndex;
16use super::walker::{is_indexable_file, language_for_extension, walk_indexable, MAX_FILE_BYTES};
17use super::words::WordIndex;
18
/// In-memory index for one workspace. Composed from the per-file table,
/// the trigram + word sub-indexes, and the dep graph.
///
/// Every sub-index is keyed on the same [`FileId`] values, which are
/// handed out by this struct (see `next_id`).
pub struct IndexState {
    /// Canonicalised workspace root. Falls back to the path as supplied
    /// when canonicalisation fails.
    pub root: PathBuf,
    /// File table keyed on stable id.
    pub files: HashMap<FileId, IndexedFile>,
    /// Workspace-relative path (with `/` separators) → stable id.
    pub path_to_id: HashMap<String, FileId>,
    /// Trigram posting list.
    pub trigrams: TrigramIndex,
    /// Identifier-token inverted index.
    pub words: WordIndex,
    /// Forward + reverse import graph.
    pub deps: DepGraph,
    /// Wall-clock timestamp (ms since epoch) of the most recent rebuild.
    pub last_built_unix_ms: i64,
    /// Best-effort `HEAD` SHA, or `None` if `.git/HEAD` could not be read
    /// under the workspace root (e.g. the workspace isn't a git repo).
    pub git_head: Option<String>,
    /// Id that will be assigned to the next previously unseen path.
    /// Starts at 1 and advances by one for every new file.
    next_id: FileId,
}
40
/// Summary returned from `IndexState::build_from_root`.
///
/// Both counters are per path reported by the workspace walker.
#[derive(Debug, Default)]
pub struct BuildOutcome {
    /// Files that passed every filter and were ingested.
    pub files_indexed: u64,
    /// Paths the walker reported but that were not ingested: not
    /// indexable, unreadable (including non-UTF-8 content), too large,
    /// or not located under the workspace root.
    pub files_skipped: u64,
}
50
51impl IndexState {
52    /// Build a fresh index over `root`. Returns the populated state plus a
53    /// summary of how many files were indexed vs skipped.
54    pub fn build_from_root(root: &Path) -> (Self, BuildOutcome) {
55        let canonical_root = canonicalize(root);
56        let mut state = IndexState {
57            root: canonical_root.clone(),
58            files: HashMap::new(),
59            path_to_id: HashMap::new(),
60            trigrams: TrigramIndex::new(),
61            words: WordIndex::new(),
62            deps: DepGraph::new(),
63            last_built_unix_ms: now_unix_ms(),
64            git_head: read_git_head(&canonical_root),
65            next_id: 1,
66        };
67        let mut outcome = BuildOutcome::default();
68        let mut to_resolve: Vec<(FileId, String)> = Vec::new();
69        walk_indexable(&canonical_root, |abs| match state.ingest(abs) {
70            Some(file_id) => {
71                outcome.files_indexed += 1;
72                if let Some(file) = state.files.get(&file_id) {
73                    to_resolve.push((file_id, file.relative_path.clone()));
74                }
75            }
76            None => {
77                outcome.files_skipped += 1;
78            }
79        });
80        for (id, rel) in to_resolve {
81            state.rebuild_deps(id, &rel);
82        }
83        (state, outcome)
84    }
85
86    fn ingest(&mut self, abs: &Path) -> Option<FileId> {
87        if !is_indexable_file(abs) {
88            return None;
89        }
90        let metadata = std::fs::metadata(abs).ok()?;
91        if metadata.len() > MAX_FILE_BYTES {
92            return None;
93        }
94        let content = std::fs::read_to_string(abs).ok()?;
95        if content.len() > MAX_FILE_BYTES as usize {
96            return None;
97        }
98        let rel = relative_path(&self.root, abs)?;
99        let hash = fnv1a64(content.as_bytes());
100        let id = match self.path_to_id.get(&rel) {
101            Some(existing_id) => {
102                if let Some(file) = self.files.get(existing_id) {
103                    if file.content_hash == hash {
104                        return Some(*existing_id);
105                    }
106                }
107                *existing_id
108            }
109            None => {
110                let id = self.next_id;
111                self.next_id = self.next_id.checked_add(1).expect("FileId overflow");
112                self.path_to_id.insert(rel.clone(), id);
113                id
114            }
115        };
116
117        let ext = abs
118            .extension()
119            .and_then(|s| s.to_str())
120            .unwrap_or("")
121            .to_ascii_lowercase();
122        let language = language_for_extension(&ext).to_string();
123        let imports = imports::extract_imports(&content, &language);
124        let mtime_ms = metadata
125            .modified()
126            .ok()
127            .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
128            .map(|d| d.as_millis() as i64)
129            .unwrap_or(0);
130        let line_count = if content.is_empty() {
131            0
132        } else {
133            content.split('\n').count() as u32
134        };
135
136        let file = IndexedFile {
137            id,
138            relative_path: rel,
139            language,
140            size_bytes: content.len() as u64,
141            line_count,
142            content_hash: hash,
143            mtime_ms,
144            symbols: Vec::new(),
145            imports,
146        };
147        self.trigrams.index_file(id, &content);
148        self.words.index_file(id, &content);
149        self.files.insert(id, file);
150        Some(id)
151    }
152
153    fn rebuild_deps(&mut self, id: FileId, relative_path: &str) {
154        let Some(file) = self.files.get(&id).cloned() else {
155            return;
156        };
157        let resolved = imports::resolve(
158            &file.imports,
159            relative_path,
160            &file.language,
161            &self.path_to_id,
162        );
163        self.deps
164            .set_edges(id, resolved.resolved, resolved.unresolved);
165    }
166
167    /// Look up a file by either its workspace-relative path or its
168    /// absolute path inside the workspace root.
169    pub fn lookup_path(&self, raw: &str) -> Option<FileId> {
170        if let Some(id) = self.path_to_id.get(raw) {
171            return Some(*id);
172        }
173        let path = Path::new(raw);
174        if path.is_absolute() {
175            if let Some(rel) = relative_path(&self.root, path) {
176                if let Some(id) = self.path_to_id.get(&rel) {
177                    return Some(*id);
178                }
179            }
180        }
181        None
182    }
183
184    /// Estimate the resident memory footprint of every sub-index. Cheap
185    /// order-of-magnitude figure surfaced by the `stats` builtin.
186    pub fn estimated_bytes(&self) -> usize {
187        let file_bytes: usize = self
188            .files
189            .values()
190            .map(|f| f.relative_path.len() + f.imports.iter().map(|s| s.len()).sum::<usize>() + 64)
191            .sum();
192        self.trigrams.estimated_bytes() + self.words.estimated_bytes() + file_bytes
193    }
194}
195
/// Current wall-clock time as whole milliseconds since the Unix epoch.
///
/// Yields `0` if the system clock reports a time before the epoch.
fn now_unix_ms() -> i64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_millis() as i64,
        Err(_) => 0,
    }
}
202
/// Resolve `root` to its canonical absolute form. When the filesystem
/// cannot canonicalise it (for example because it does not exist), the
/// path is returned as given.
fn canonicalize(root: &Path) -> PathBuf {
    match std::fs::canonicalize(root) {
        Ok(resolved) => resolved,
        Err(_) => root.to_path_buf(),
    }
}
206
/// Compute `abs` relative to `root`, using `/` separators. Returns `None`
/// if `abs` is not inside `root`.
///
/// `abs` is canonicalised first, falling back to the path as given when
/// that fails (e.g. the file does not exist). The prefix test is a plain
/// component comparison, so `root` should itself be canonical.
///
/// The result is assembled by joining the remaining path components with
/// `/` rather than by rewriting `\` characters in the rendered path: on
/// Unix a backslash is an ordinary file-name character and must be kept,
/// otherwise `a\b.rs` would be reported as (and collide with) `a/b.rs`.
/// Non-UTF-8 components are converted lossily.
pub(crate) fn relative_path(root: &Path, abs: &Path) -> Option<String> {
    let canonical_abs = std::fs::canonicalize(abs).unwrap_or_else(|_| abs.to_path_buf());
    let stripped = canonical_abs.strip_prefix(root).ok()?;
    let parts: Vec<_> = stripped
        .components()
        .map(|component| component.as_os_str().to_string_lossy())
        .collect();
    Some(parts.join("/"))
}
214
/// Best-effort lookup of the commit that `HEAD` points at, read directly
/// from the files under `<workspace_root>/.git`.
///
/// * Returns `None` when `.git/HEAD` cannot be read.
/// * If `HEAD` is not a symbolic ref (detached head), its trimmed content
///   is returned as is.
/// * If `HEAD` is `ref: <name>`, the ref is looked up first as a loose
///   file `.git/<name>` and then in `.git/packed-refs`, where refs end up
///   once they have been packed and the loose file no longer exists.
/// * If the ref cannot be found in either place (e.g. a branch with no
///   commits yet), the raw `ref: <name>` line is returned.
fn read_git_head(workspace_root: &Path) -> Option<String> {
    let git_dir = workspace_root.join(".git");
    let txt = std::fs::read_to_string(git_dir.join("HEAD")).ok()?;
    let line = txt.trim();
    if let Some(ref_target) = line.strip_prefix("ref: ") {
        // Loose ref: one file per ref holding just the SHA.
        if let Ok(sha) = std::fs::read_to_string(git_dir.join(ref_target)) {
            return Some(sha.trim().to_string());
        }
        // Packed ref: `<sha> <refname>` per line. Lines starting with `#`
        // are headers and lines starting with `^` are peeled tag targets.
        if let Ok(packed) = std::fs::read_to_string(git_dir.join("packed-refs")) {
            let hit = packed
                .lines()
                .filter(|entry| !entry.starts_with('#') && !entry.starts_with('^'))
                .filter_map(|entry| entry.split_once(' '))
                .find(|(_, name)| name.trim() == ref_target);
            if let Some((sha, _)) = hit {
                return Some(sha.trim().to_string());
            }
        }
    }
    Some(line.to_string())
}
227
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::tempdir;

    /// Write `body` to `rel` under `root`, creating parent directories.
    fn write_source(root: &Path, rel: &str, body: &str) {
        let target = root.join(rel);
        if let Some(parent) = target.parent() {
            fs::create_dir_all(parent).unwrap();
        }
        fs::write(target, body).unwrap();
    }

    /// Both Rust files land in the file table; no import edges are
    /// produced for Rust sources.
    #[test]
    fn build_indexes_files_and_resolves_imports() {
        let workspace = tempdir().unwrap();
        let root = workspace.path();
        write_source(
            root,
            "src/main.rs",
            "use crate::util::helper;\nfn main() {}\n",
        );
        write_source(root, "src/util.rs", "pub fn helper() {}");

        let (state, outcome) = IndexState::build_from_root(root);
        assert_eq!(outcome.files_indexed, 2);
        assert_eq!(state.files.len(), 2);
        // Indexing the map panics if either path was not registered.
        let main_id = state.path_to_id["src/main.rs"];
        let _util_id = state.path_to_id["src/util.rs"];
        // Rust uses `noop` resolution, so dep graph is empty.
        assert_eq!(state.deps.imports_of(main_id), Vec::<FileId>::new());
    }

    /// A relative TypeScript import produces a forward and a reverse edge.
    #[test]
    fn typescript_imports_get_resolved() {
        let workspace = tempdir().unwrap();
        let root = workspace.path();
        write_source(
            root,
            "src/index.ts",
            "import { helper } from \"./util\";\n",
        );
        write_source(root, "src/util.ts", "export function helper() {}");

        let (state, _) = IndexState::build_from_root(root);
        let importer = state.path_to_id["src/index.ts"];
        let imported = state.path_to_id["src/util.ts"];
        assert_eq!(state.deps.imports_of(importer), vec![imported]);
        assert_eq!(state.deps.importers_of(imported), vec![importer]);
    }

    /// An absolute path inside the workspace maps to the same id as its
    /// workspace-relative form.
    #[test]
    fn lookup_path_handles_absolute_paths() {
        let workspace = tempdir().unwrap();
        let root = workspace.path();
        write_source(root, "a/b/c.py", "x = 1\n");

        let (state, _) = IndexState::build_from_root(root);
        let absolute = root.join("a/b/c.py");
        let found = state.lookup_path(absolute.to_str().unwrap()).unwrap();
        assert_eq!(state.path_to_id["a/b/c.py"], found);
    }
}