Skip to main content

harn_hostlib/code_index/
mod.rs

1//! Code index host capability.
2//!
3//! Deterministic trigram/word index plus live workspace state (agent
4//! registry, advisory locks, append-only version log, file id assignment,
5//! cached reads). The capability owns one [`SharedIndex`] cell per
6//! instance; cloning the capability shares state with every Harn VM that
7//! has been wired against it.
8//!
9//! Surface — every builtin is locked by `schemas/code_index/<method>.json`:
10//!
11//! ### Workspace queries (the original 5)
12//!
13//! | Builtin                          | What it does                                           |
14//! |----------------------------------|--------------------------------------------------------|
15//! | `hostlib_code_index_query`       | Trigram-accelerated literal substring search.          |
16//! | `hostlib_code_index_rebuild`     | Walk a workspace and (re)build the in-memory index.    |
17//! | `hostlib_code_index_stats`       | Count files/trigrams/words + last rebuild timestamp.   |
18//! | `hostlib_code_index_imports_for` | Imports declared by a single file (with resolutions).  |
19//! | `hostlib_code_index_importers_of`| Reverse lookup: who imports the given module/path?     |
20//!
21//! ### Live workspace state (added in #776)
22//!
23//! - **Agents**: `agent_register`, `agent_heartbeat`, `agent_unregister`,
24//!   `current_agent_id`, `status`.
25//! - **Locks**: `lock_try`, `lock_release`.
26//! - **Change log**: `current_seq`, `changes_since`, `version_record`.
27//! - **File table**: `path_to_id`, `id_to_path`, `file_ids`, `file_meta`,
28//!   `file_hash`.
29//! - **Cached reads**: `read_range`, `reindex_file`, `trigram_query`,
30//!   `extract_trigrams`, `word_get`, `deps_get`, `outline_get`.
31//!
32//! ### Typed symbol graph (added in #2434)
33//!
34//! - **`cypher`**: read-only Cypher executor over the typed graph
35//!   ([`SymbolGraph`]) — `MATCH ... WHERE ... RETURN` with typed
36//!   nodes (Function|Type|Module|Import|CallSite|Macro), typed edges
37//!   (CALLS|REFS|IMPORTS|CONTAINS|OVERRIDES, plus `_BY` inverses),
38//!   and variable-length hops up to depth 4.
39//! - **`branch_overlay`**: per-branch CDC overlay that layers a delta
40//!   on top of the base graph; reuses ≥95% of the main index in
41//!   storage/CPU for untouched files. See [`BranchOverlay`].
42//! - **`freshness`**: per-file hash + mtime comparison against the
43//!   indexed snapshot; consumers detect staleness without forcing a
44//!   rebuild.
45//! - **`repo_map`**: personalized PageRank over the typed graph, rendered
46//!   as a token-budgeted symbol map for agent grounding.
47//!
48//! ### Cross-file safe rename (added in #2508)
49//!
50//! - **`rename_symbol`**: rewrite a symbol across `file | module |
51//!   workspace` using the typed graph for symbol resolution and
52//!   tree-sitter identifier kinds for safe text spans. Detects
53//!   `new_name` shadowing in any rewritten file and aborts before any
54//!   write. Routes through staged-fs (#1722) when a `session_id` is
55//!   supplied so all touched files succeed or none do.
56//!
57//! ## Concurrency model
58//!
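//! All primary-index ops serialise through a single
//! `Arc<Mutex<Option<IndexState>>>` ([`SharedIndex`]) so the IDE editor,
//! eval, and live agent all see one consistent view. The read-only
//! secondary roots and the `current_agent` slot live in their own separate
//! mutex-guarded cells. The capability is `Send + Sync` so embedders can
//! share it across threads, but the index mutex still serialises actual work.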
63
64mod agents;
65mod builtins;
66mod cypher;
67mod file_table;
68mod graph;
69mod imports;
70mod overlay;
71mod readonly;
72mod rename;
73mod repo_map;
74mod snapshot;
75mod state;
76mod symbol_graph;
77mod trigram;
78mod versions;
79mod walker;
80mod words;
81
82use std::path::Path;
83use std::sync::{Arc, Mutex};
84
85use harn_vm::VmValue;
86
87use crate::error::HostlibError;
88use crate::registry::{BuiltinRegistry, HostlibCapability, RegisteredBuiltin, SyncHandler};
89
90pub use agents::{AgentId, AgentInfo, AgentRegistry, AgentState, RegistryConfig};
91pub use builtins::SharedIndex;
92pub use cypher::{CypherError, CypherRow, CypherValue};
93pub use file_table::{FileId, IndexedFile, IndexedSymbol};
94pub use graph::DepGraph;
95pub use overlay::{BranchOverlay, OverlayState};
96pub use readonly::ReadonlyRoots;
97pub use snapshot::{CodeIndexSnapshot, SnapshotMeta};
98pub use state::{BuildOutcome, IndexState};
99pub use symbol_graph::{Edge, EdgeKind, Node, NodeId, NodeKind, SymbolGraph};
100pub use trigram::TrigramIndex;
101pub use versions::{ChangeRecord, EditOp, VersionEntry, VersionLog, HISTORY_LIMIT};
102pub use words::{WordHit, WordIndex};
103
/// Code-index capability handle.
///
/// Holds the [`SharedIndex`] cell behind an `Arc<Mutex<...>>`; cloning
/// the capability shares state. The capability also threads a
/// `current_agent_id` slot used by the `current_agent_id` host builtin —
/// embedders update this slot from the request-handling layer so each
/// host call surfaces the right agent identity to scripts.
///
/// All three fields are shared handles: a clone of the capability refers
/// to the same index, the same read-only roots, and the same agent slot.
#[derive(Clone, Default)]
pub struct CodeIndexCapability {
    /// Primary workspace index cell. Starts out empty (`None`) and is
    /// filled by a rebuild or by [`CodeIndexCapability::restore_from_disk`].
    index: SharedIndex,
    /// Additive, read-only secondary roots (issue #2403 follow-up). Live
    /// beside the primary slot; query/read_range merge them in but no
    /// mutating builtin ever touches them, so indexing a dependency root
    /// never clobbers the project index.
    readonly: ReadonlyRoots,
    /// Identity of the agent on whose behalf the current host call runs;
    /// `None` when no agent is bound. Read by the `current_agent_id`
    /// builtin.
    current_agent: Arc<Mutex<Option<AgentId>>>,
}
121
122impl CodeIndexCapability {
123    /// Create a capability with an empty workspace slot. The first
124    /// `hostlib_code_index_rebuild` call populates it.
125    pub fn new() -> Self {
126        Self {
127            index: Arc::new(Mutex::new(None)),
128            readonly: Arc::new(Mutex::new(Vec::new())),
129            current_agent: Arc::new(Mutex::new(None)),
130        }
131    }
132
133    /// Borrow the underlying shared cell. Useful for tests and embedders
134    /// that want to introspect index state without going through the
135    /// builtins.
136    pub fn shared(&self) -> SharedIndex {
137        self.index.clone()
138    }
139
140    /// Borrow the current-agent slot. Embedders bind this slot before
141    /// dispatching a host call so that `current_agent_id` returns the
142    /// right value to the script.
143    pub fn current_agent_slot(&self) -> Arc<Mutex<Option<AgentId>>> {
144        self.current_agent.clone()
145    }
146
147    /// Convenience: set the current agent id. Returns the previous value
148    /// (so callers can restore on completion if they bind per-call).
149    pub fn set_current_agent(&self, id: Option<AgentId>) -> Option<AgentId> {
150        let mut guard = self.current_agent.lock().expect("current_agent poisoned");
151        std::mem::replace(&mut *guard, id)
152    }
153
154    /// Restore from a previously saved snapshot at the path returned by
155    /// [`CodeIndexSnapshot::path_for`]. After restoring, runs
156    /// [`IndexState::reap_after_recovery`] so stale agent records and
157    /// locks are dropped before the daemon serves traffic.
158    ///
159    /// Returns `true` on a successful restore, `false` if no snapshot
160    /// existed (or the format was unrecognised). Errors propagate I/O
161    /// problems verbatim so callers can decide whether to fall back to
162    /// `rebuild`.
163    pub fn restore_from_disk(&self, workspace_root: &Path) -> std::io::Result<bool> {
164        match CodeIndexSnapshot::load(workspace_root)? {
165            Some(snap) => {
166                let mut state = IndexState::from_snapshot(snap);
167                state.reap_after_recovery(state::now_unix_ms());
168                let mut guard = self.index.lock().expect("code_index mutex poisoned");
169                *guard = Some(state);
170                Ok(true)
171            }
172            None => Ok(false),
173        }
174    }
175
176    /// Persist the current in-memory state to the path returned by
177    /// [`CodeIndexSnapshot::path_for`]. Returns `Ok(false)` when the
178    /// capability is empty (nothing to save).
179    pub fn persist_to_disk(&self) -> std::io::Result<bool> {
180        let snap = {
181            let guard = self.index.lock().expect("code_index mutex poisoned");
182            guard
183                .as_ref()
184                .map(|state| (state.snapshot(), state.root.clone()))
185        };
186        match snap {
187            Some((snap, root)) => {
188                snap.save(&root)?;
189                Ok(true)
190            }
191            None => Ok(false),
192        }
193    }
194}
195
196impl HostlibCapability for CodeIndexCapability {
197    fn module_name(&self) -> &'static str {
198        "code_index"
199    }
200
201    fn register_builtins(&self, registry: &mut BuiltinRegistry) {
202        // Workspace queries (original 5). `query` and `read_range` merge in
203        // the read-only secondary roots (issue #2403 follow-up), so they
204        // capture both the primary and the read-only cells.
205        {
206            let index = self.index.clone();
207            let readonly = self.readonly.clone();
208            let handler: SyncHandler =
209                Arc::new(move |args| builtins::run_query_merged(&index, Some(&readonly), args));
210            registry.register(RegisteredBuiltin {
211                name: builtins::BUILTIN_QUERY,
212                module: "code_index",
213                method: "query",
214                handler,
215            });
216        }
217        register(
218            registry,
219            self.index.clone(),
220            builtins::BUILTIN_REBUILD,
221            "rebuild",
222            builtins::run_rebuild,
223        );
224        register(
225            registry,
226            self.index.clone(),
227            builtins::BUILTIN_STATS,
228            "stats",
229            builtins::run_stats,
230        );
231        register(
232            registry,
233            self.index.clone(),
234            builtins::BUILTIN_IMPORTS_FOR,
235            "imports_for",
236            builtins::run_imports_for,
237        );
238        register(
239            registry,
240            self.index.clone(),
241            builtins::BUILTIN_IMPORTERS_OF,
242            "importers_of",
243            builtins::run_importers_of,
244        );
245
246        // Additive read-only secondary roots (issue #2403 follow-up).
247        // Captures the read-only cell directly — it never touches the
248        // primary index slot.
249        {
250            let readonly = self.readonly.clone();
251            let handler: SyncHandler =
252                Arc::new(move |args| readonly::run_add_readonly_roots(&readonly, args));
253            registry.register(RegisteredBuiltin {
254                name: readonly::BUILTIN_ADD_READONLY_ROOTS,
255                module: "code_index",
256                method: "add_readonly_roots",
257                handler,
258            });
259        }
260
261        // File table accessors.
262        register(
263            registry,
264            self.index.clone(),
265            builtins::BUILTIN_PATH_TO_ID,
266            "path_to_id",
267            builtins::run_path_to_id,
268        );
269        register(
270            registry,
271            self.index.clone(),
272            builtins::BUILTIN_ID_TO_PATH,
273            "id_to_path",
274            builtins::run_id_to_path,
275        );
276        register(
277            registry,
278            self.index.clone(),
279            builtins::BUILTIN_FILE_IDS,
280            "file_ids",
281            builtins::run_file_ids,
282        );
283        register(
284            registry,
285            self.index.clone(),
286            builtins::BUILTIN_FILE_META,
287            "file_meta",
288            builtins::run_file_meta,
289        );
290        register(
291            registry,
292            self.index.clone(),
293            builtins::BUILTIN_FILE_HASH,
294            "file_hash",
295            builtins::run_file_hash,
296        );
297
298        // Cached read paths. `read_range` falls back to the read-only
299        // secondary roots (issue #2403 follow-up) so a symbol discovered in
300        // a dependency root can be read back.
301        {
302            let index = self.index.clone();
303            let readonly = self.readonly.clone();
304            let handler: SyncHandler = Arc::new(move |args| {
305                builtins::run_read_range_merged(&index, Some(&readonly), args)
306            });
307            registry.register(RegisteredBuiltin {
308                name: builtins::BUILTIN_READ_RANGE,
309                module: "code_index",
310                method: "read_range",
311                handler,
312            });
313        }
314        register(
315            registry,
316            self.index.clone(),
317            builtins::BUILTIN_REINDEX_FILE,
318            "reindex_file",
319            builtins::run_reindex_file,
320        );
321        register(
322            registry,
323            self.index.clone(),
324            builtins::BUILTIN_TRIGRAM_QUERY,
325            "trigram_query",
326            builtins::run_trigram_query,
327        );
328        register(
329            registry,
330            self.index.clone(),
331            builtins::BUILTIN_EXTRACT_TRIGRAMS,
332            "extract_trigrams",
333            builtins::run_extract_trigrams,
334        );
335        register(
336            registry,
337            self.index.clone(),
338            builtins::BUILTIN_WORD_GET,
339            "word_get",
340            builtins::run_word_get,
341        );
342        register(
343            registry,
344            self.index.clone(),
345            builtins::BUILTIN_DEPS_GET,
346            "deps_get",
347            builtins::run_deps_get,
348        );
349        register(
350            registry,
351            self.index.clone(),
352            builtins::BUILTIN_OUTLINE_GET,
353            "outline_get",
354            builtins::run_outline_get,
355        );
356
357        // Change log.
358        register(
359            registry,
360            self.index.clone(),
361            builtins::BUILTIN_CURRENT_SEQ,
362            "current_seq",
363            builtins::run_current_seq,
364        );
365        register(
366            registry,
367            self.index.clone(),
368            builtins::BUILTIN_CHANGES_SINCE,
369            "changes_since",
370            builtins::run_changes_since,
371        );
372        register(
373            registry,
374            self.index.clone(),
375            builtins::BUILTIN_VERSION_RECORD,
376            "version_record",
377            builtins::run_version_record,
378        );
379
380        // Agent registry + locks.
381        register(
382            registry,
383            self.index.clone(),
384            builtins::BUILTIN_AGENT_REGISTER,
385            "agent_register",
386            builtins::run_agent_register,
387        );
388        register(
389            registry,
390            self.index.clone(),
391            builtins::BUILTIN_AGENT_HEARTBEAT,
392            "agent_heartbeat",
393            builtins::run_agent_heartbeat,
394        );
395        register(
396            registry,
397            self.index.clone(),
398            builtins::BUILTIN_AGENT_UNREGISTER,
399            "agent_unregister",
400            builtins::run_agent_unregister,
401        );
402        register(
403            registry,
404            self.index.clone(),
405            builtins::BUILTIN_LOCK_TRY,
406            "lock_try",
407            builtins::run_lock_try,
408        );
409        register(
410            registry,
411            self.index.clone(),
412            builtins::BUILTIN_LOCK_RELEASE,
413            "lock_release",
414            builtins::run_lock_release,
415        );
416        register(
417            registry,
418            self.index.clone(),
419            builtins::BUILTIN_STATUS,
420            "status",
421            builtins::run_status,
422        );
423
424        // `current_agent_id` is the only handler that reads from the
425        // capability's per-call `current_agent` slot rather than the
426        // index state, so it gets its own closure.
427        let slot = self.current_agent.clone();
428        let handler: SyncHandler =
429            Arc::new(move |args| builtins::run_current_agent_id(&slot, args));
430        registry.register(RegisteredBuiltin {
431            name: builtins::BUILTIN_CURRENT_AGENT_ID,
432            module: "code_index",
433            method: "current_agent_id",
434            handler,
435        });
436
437        // Typed symbol graph builtins (issue #2434).
438        register(
439            registry,
440            self.index.clone(),
441            builtins::BUILTIN_CYPHER,
442            "cypher",
443            builtins::run_cypher,
444        );
445        register(
446            registry,
447            self.index.clone(),
448            repo_map::BUILTIN,
449            "repo_map",
450            repo_map::run,
451        );
452        register(
453            registry,
454            self.index.clone(),
455            builtins::BUILTIN_BRANCH_OVERLAY,
456            "branch_overlay",
457            builtins::run_branch_overlay,
458        );
459        register(
460            registry,
461            self.index.clone(),
462            builtins::BUILTIN_FRESHNESS,
463            "freshness",
464            builtins::run_freshness,
465        );
466
467        // Cross-file safe rename (issue #2508). Builds on the typed
468        // symbol graph (#2434) and routes writes through staged-fs
469        // (#1722) so all touched files succeed or none do.
470        register(
471            registry,
472            self.index.clone(),
473            rename::BUILTIN,
474            "rename_symbol",
475            rename::run,
476        );
477    }
478}
479
/// Programmatic entry point for callers that need to compose
/// `rename_symbol` with another hostlib capability while sharing the
/// same in-memory code-index state.
///
/// Delegates directly to `rename::run`, the same runner that
/// `register_builtins` wires up as the `rename_symbol` builtin, passing
/// `index` and `args` through unchanged and returning its result as-is.
pub(crate) fn run_rename_symbol(
    index: &SharedIndex,
    args: &[VmValue],
) -> Result<VmValue, HostlibError> {
    rename::run(index, args)
}
489
490fn register(
491    registry: &mut BuiltinRegistry,
492    index: SharedIndex,
493    name: &'static str,
494    method: &'static str,
495    runner: fn(&SharedIndex, &[VmValue]) -> Result<VmValue, HostlibError>,
496) {
497    let captured = index;
498    let handler: SyncHandler = Arc::new(move |args| runner(&captured, args));
499    registry.register(RegisteredBuiltin {
500        name,
501        module: "code_index",
502        method,
503        handler,
504    });
505}