harn_hostlib/code_index/mod.rs
1//! Code index host capability.
2//!
3//! Deterministic trigram/word index plus live workspace state (agent
4//! registry, advisory locks, append-only version log, file id assignment,
5//! cached reads). The capability owns one [`SharedIndex`] cell per
6//! instance; cloning the capability shares state with every Harn VM that
7//! has been wired against it.
8//!
9//! Surface — every builtin is locked by `schemas/code_index/<method>.json`:
10//!
11//! ### Workspace queries (the original 5)
12//!
13//! | Builtin | What it does |
14//! |----------------------------------|--------------------------------------------------------|
15//! | `hostlib_code_index_query` | Trigram-accelerated literal substring search. |
16//! | `hostlib_code_index_rebuild` | Walk a workspace and (re)build the in-memory index. |
17//! | `hostlib_code_index_stats` | Count files/trigrams/words + last rebuild timestamp. |
18//! | `hostlib_code_index_imports_for` | Imports declared by a single file (with resolutions). |
19//! | `hostlib_code_index_importers_of`| Reverse lookup: who imports the given module/path? |
20//!
21//! ### Live workspace state (added in #776)
22//!
23//! - **Agents**: `agent_register`, `agent_heartbeat`, `agent_unregister`,
24//! `current_agent_id`, `status`.
25//! - **Locks**: `lock_try`, `lock_release`.
26//! - **Change log**: `current_seq`, `changes_since`, `version_record`.
27//! - **File table**: `path_to_id`, `id_to_path`, `file_ids`, `file_meta`,
28//! `file_hash`.
29//! - **Cached reads**: `read_range`, `reindex_file`, `trigram_query`,
30//! `extract_trigrams`, `word_get`, `deps_get`, `outline_get`.
31//!
32//! ### Typed symbol graph (added in #2434)
33//!
34//! - **`cypher`**: read-only Cypher executor over the typed graph
35//! ([`SymbolGraph`]) — `MATCH ... WHERE ... RETURN` with typed
36//! nodes (Function|Type|Module|Import|CallSite|Macro), typed edges
37//! (CALLS|REFS|IMPORTS|CONTAINS|OVERRIDES, plus `_BY` inverses),
38//! and variable-length hops up to depth 4.
39//! - **`branch_overlay`**: per-branch CDC overlay that layers a delta
40//! on top of the base graph; reuses ≥95% of the main index in
41//! storage/CPU for untouched files. See [`BranchOverlay`].
42//! - **`freshness`**: per-file hash + mtime comparison against the
43//! indexed snapshot; consumers detect staleness without forcing a
44//! rebuild.
45//! - **`repo_map`**: personalized PageRank over the typed graph, rendered
46//! as a token-budgeted symbol map for agent grounding.
47//!
48//! ### Cross-file safe rename (added in #2508)
49//!
50//! - **`rename_symbol`**: rewrite a symbol across `file | module |
51//! workspace` using the typed graph for symbol resolution and
52//! tree-sitter identifier kinds for safe text spans. Detects
53//! `new_name` shadowing in any rewritten file and aborts before any
54//! write. Routes through staged-fs (#1722) when a `session_id` is
55//! supplied so all touched files succeed or none do.
56//!
57//! ## Concurrency model
58//!
59//! All ops serialise through a single `Arc<Mutex<Option<IndexState>>>` so
60//! the IDE editor, eval, and live agent all see one consistent view. The
61//! capability is `Send + Sync` so embedders can share it across threads,
62//! but the mutex still serialises actual work.
63
64mod agents;
65mod builtins;
66mod cypher;
67mod file_table;
68mod graph;
69mod imports;
70mod overlay;
71mod readonly;
72mod rename;
73mod repo_map;
74mod snapshot;
75mod state;
76mod symbol_graph;
77mod trigram;
78mod versions;
79mod walker;
80mod words;
81
82use std::path::Path;
83use std::sync::{Arc, Mutex};
84
85use harn_vm::VmValue;
86
87use crate::error::HostlibError;
88use crate::registry::{BuiltinRegistry, HostlibCapability, RegisteredBuiltin, SyncHandler};
89
90pub use agents::{AgentId, AgentInfo, AgentRegistry, AgentState, RegistryConfig};
91pub use builtins::SharedIndex;
92pub use cypher::{CypherError, CypherRow, CypherValue};
93pub use file_table::{FileId, IndexedFile, IndexedSymbol};
94pub use graph::DepGraph;
95pub use overlay::{BranchOverlay, OverlayState};
96pub use readonly::ReadonlyRoots;
97pub use snapshot::{CodeIndexSnapshot, SnapshotMeta};
98pub use state::{BuildOutcome, IndexState};
99pub use symbol_graph::{Edge, EdgeKind, Node, NodeId, NodeKind, SymbolGraph};
100pub use trigram::TrigramIndex;
101pub use versions::{ChangeRecord, EditOp, VersionEntry, VersionLog, HISTORY_LIMIT};
102pub use words::{WordHit, WordIndex};
103
104/// Code-index capability handle.
105///
106/// Holds the [`SharedIndex`] cell behind an `Arc<Mutex<...>>`; cloning
107/// the capability shares state. The capability also threads a
108/// `current_agent_id` slot used by the `current_agent_id` host builtin —
109/// embedders update this slot from the request-handling layer so each
110/// host call surfaces the right agent identity to scripts.
111#[derive(Clone, Default)]
112pub struct CodeIndexCapability {
113 index: SharedIndex,
114 /// Additive, read-only secondary roots (issue #2403 follow-up). Live
115 /// beside the primary slot; query/read_range merge them in but no
116 /// mutating builtin ever touches them, so indexing a dependency root
117 /// never clobbers the project index.
118 readonly: ReadonlyRoots,
119 current_agent: Arc<Mutex<Option<AgentId>>>,
120}
121
122impl CodeIndexCapability {
123 /// Create a capability with an empty workspace slot. The first
124 /// `hostlib_code_index_rebuild` call populates it.
125 pub fn new() -> Self {
126 Self {
127 index: Arc::new(Mutex::new(None)),
128 readonly: Arc::new(Mutex::new(Vec::new())),
129 current_agent: Arc::new(Mutex::new(None)),
130 }
131 }
132
133 /// Borrow the underlying shared cell. Useful for tests and embedders
134 /// that want to introspect index state without going through the
135 /// builtins.
136 pub fn shared(&self) -> SharedIndex {
137 self.index.clone()
138 }
139
140 /// Borrow the current-agent slot. Embedders bind this slot before
141 /// dispatching a host call so that `current_agent_id` returns the
142 /// right value to the script.
143 pub fn current_agent_slot(&self) -> Arc<Mutex<Option<AgentId>>> {
144 self.current_agent.clone()
145 }
146
147 /// Convenience: set the current agent id. Returns the previous value
148 /// (so callers can restore on completion if they bind per-call).
149 pub fn set_current_agent(&self, id: Option<AgentId>) -> Option<AgentId> {
150 let mut guard = self.current_agent.lock().expect("current_agent poisoned");
151 std::mem::replace(&mut *guard, id)
152 }
153
154 /// Restore from a previously saved snapshot at the path returned by
155 /// [`CodeIndexSnapshot::path_for`]. After restoring, runs
156 /// [`IndexState::reap_after_recovery`] so stale agent records and
157 /// locks are dropped before the daemon serves traffic.
158 ///
159 /// Returns `true` on a successful restore, `false` if no snapshot
160 /// existed (or the format was unrecognised). Errors propagate I/O
161 /// problems verbatim so callers can decide whether to fall back to
162 /// `rebuild`.
163 pub fn restore_from_disk(&self, workspace_root: &Path) -> std::io::Result<bool> {
164 match CodeIndexSnapshot::load(workspace_root)? {
165 Some(snap) => {
166 let mut state = IndexState::from_snapshot(snap);
167 state.reap_after_recovery(state::now_unix_ms());
168 let mut guard = self.index.lock().expect("code_index mutex poisoned");
169 *guard = Some(state);
170 Ok(true)
171 }
172 None => Ok(false),
173 }
174 }
175
176 /// Persist the current in-memory state to the path returned by
177 /// [`CodeIndexSnapshot::path_for`]. Returns `Ok(false)` when the
178 /// capability is empty (nothing to save).
179 pub fn persist_to_disk(&self) -> std::io::Result<bool> {
180 let snap = {
181 let guard = self.index.lock().expect("code_index mutex poisoned");
182 guard
183 .as_ref()
184 .map(|state| (state.snapshot(), state.root.clone()))
185 };
186 match snap {
187 Some((snap, root)) => {
188 snap.save(&root)?;
189 Ok(true)
190 }
191 None => Ok(false),
192 }
193 }
194}
195
196impl HostlibCapability for CodeIndexCapability {
197 fn module_name(&self) -> &'static str {
198 "code_index"
199 }
200
201 fn register_builtins(&self, registry: &mut BuiltinRegistry) {
202 // Workspace queries (original 5). `query` and `read_range` merge in
203 // the read-only secondary roots (issue #2403 follow-up), so they
204 // capture both the primary and the read-only cells.
205 {
206 let index = self.index.clone();
207 let readonly = self.readonly.clone();
208 let handler: SyncHandler =
209 Arc::new(move |args| builtins::run_query_merged(&index, Some(&readonly), args));
210 registry.register(RegisteredBuiltin {
211 name: builtins::BUILTIN_QUERY,
212 module: "code_index",
213 method: "query",
214 handler,
215 });
216 }
217 register(
218 registry,
219 self.index.clone(),
220 builtins::BUILTIN_REBUILD,
221 "rebuild",
222 builtins::run_rebuild,
223 );
224 register(
225 registry,
226 self.index.clone(),
227 builtins::BUILTIN_STATS,
228 "stats",
229 builtins::run_stats,
230 );
231 register(
232 registry,
233 self.index.clone(),
234 builtins::BUILTIN_IMPORTS_FOR,
235 "imports_for",
236 builtins::run_imports_for,
237 );
238 register(
239 registry,
240 self.index.clone(),
241 builtins::BUILTIN_IMPORTERS_OF,
242 "importers_of",
243 builtins::run_importers_of,
244 );
245
246 // Additive read-only secondary roots (issue #2403 follow-up).
247 // Captures the read-only cell directly — it never touches the
248 // primary index slot.
249 {
250 let readonly = self.readonly.clone();
251 let handler: SyncHandler =
252 Arc::new(move |args| readonly::run_add_readonly_roots(&readonly, args));
253 registry.register(RegisteredBuiltin {
254 name: readonly::BUILTIN_ADD_READONLY_ROOTS,
255 module: "code_index",
256 method: "add_readonly_roots",
257 handler,
258 });
259 }
260
261 // File table accessors.
262 register(
263 registry,
264 self.index.clone(),
265 builtins::BUILTIN_PATH_TO_ID,
266 "path_to_id",
267 builtins::run_path_to_id,
268 );
269 register(
270 registry,
271 self.index.clone(),
272 builtins::BUILTIN_ID_TO_PATH,
273 "id_to_path",
274 builtins::run_id_to_path,
275 );
276 register(
277 registry,
278 self.index.clone(),
279 builtins::BUILTIN_FILE_IDS,
280 "file_ids",
281 builtins::run_file_ids,
282 );
283 register(
284 registry,
285 self.index.clone(),
286 builtins::BUILTIN_FILE_META,
287 "file_meta",
288 builtins::run_file_meta,
289 );
290 register(
291 registry,
292 self.index.clone(),
293 builtins::BUILTIN_FILE_HASH,
294 "file_hash",
295 builtins::run_file_hash,
296 );
297
298 // Cached read paths. `read_range` falls back to the read-only
299 // secondary roots (issue #2403 follow-up) so a symbol discovered in
300 // a dependency root can be read back.
301 {
302 let index = self.index.clone();
303 let readonly = self.readonly.clone();
304 let handler: SyncHandler = Arc::new(move |args| {
305 builtins::run_read_range_merged(&index, Some(&readonly), args)
306 });
307 registry.register(RegisteredBuiltin {
308 name: builtins::BUILTIN_READ_RANGE,
309 module: "code_index",
310 method: "read_range",
311 handler,
312 });
313 }
314 register(
315 registry,
316 self.index.clone(),
317 builtins::BUILTIN_REINDEX_FILE,
318 "reindex_file",
319 builtins::run_reindex_file,
320 );
321 register(
322 registry,
323 self.index.clone(),
324 builtins::BUILTIN_TRIGRAM_QUERY,
325 "trigram_query",
326 builtins::run_trigram_query,
327 );
328 register(
329 registry,
330 self.index.clone(),
331 builtins::BUILTIN_EXTRACT_TRIGRAMS,
332 "extract_trigrams",
333 builtins::run_extract_trigrams,
334 );
335 register(
336 registry,
337 self.index.clone(),
338 builtins::BUILTIN_WORD_GET,
339 "word_get",
340 builtins::run_word_get,
341 );
342 register(
343 registry,
344 self.index.clone(),
345 builtins::BUILTIN_DEPS_GET,
346 "deps_get",
347 builtins::run_deps_get,
348 );
349 register(
350 registry,
351 self.index.clone(),
352 builtins::BUILTIN_OUTLINE_GET,
353 "outline_get",
354 builtins::run_outline_get,
355 );
356
357 // Change log.
358 register(
359 registry,
360 self.index.clone(),
361 builtins::BUILTIN_CURRENT_SEQ,
362 "current_seq",
363 builtins::run_current_seq,
364 );
365 register(
366 registry,
367 self.index.clone(),
368 builtins::BUILTIN_CHANGES_SINCE,
369 "changes_since",
370 builtins::run_changes_since,
371 );
372 register(
373 registry,
374 self.index.clone(),
375 builtins::BUILTIN_VERSION_RECORD,
376 "version_record",
377 builtins::run_version_record,
378 );
379
380 // Agent registry + locks.
381 register(
382 registry,
383 self.index.clone(),
384 builtins::BUILTIN_AGENT_REGISTER,
385 "agent_register",
386 builtins::run_agent_register,
387 );
388 register(
389 registry,
390 self.index.clone(),
391 builtins::BUILTIN_AGENT_HEARTBEAT,
392 "agent_heartbeat",
393 builtins::run_agent_heartbeat,
394 );
395 register(
396 registry,
397 self.index.clone(),
398 builtins::BUILTIN_AGENT_UNREGISTER,
399 "agent_unregister",
400 builtins::run_agent_unregister,
401 );
402 register(
403 registry,
404 self.index.clone(),
405 builtins::BUILTIN_LOCK_TRY,
406 "lock_try",
407 builtins::run_lock_try,
408 );
409 register(
410 registry,
411 self.index.clone(),
412 builtins::BUILTIN_LOCK_RELEASE,
413 "lock_release",
414 builtins::run_lock_release,
415 );
416 register(
417 registry,
418 self.index.clone(),
419 builtins::BUILTIN_STATUS,
420 "status",
421 builtins::run_status,
422 );
423
424 // `current_agent_id` is the only handler that reads from the
425 // capability's per-call `current_agent` slot rather than the
426 // index state, so it gets its own closure.
427 let slot = self.current_agent.clone();
428 let handler: SyncHandler =
429 Arc::new(move |args| builtins::run_current_agent_id(&slot, args));
430 registry.register(RegisteredBuiltin {
431 name: builtins::BUILTIN_CURRENT_AGENT_ID,
432 module: "code_index",
433 method: "current_agent_id",
434 handler,
435 });
436
437 // Typed symbol graph builtins (issue #2434).
438 register(
439 registry,
440 self.index.clone(),
441 builtins::BUILTIN_CYPHER,
442 "cypher",
443 builtins::run_cypher,
444 );
445 register(
446 registry,
447 self.index.clone(),
448 repo_map::BUILTIN,
449 "repo_map",
450 repo_map::run,
451 );
452 register(
453 registry,
454 self.index.clone(),
455 builtins::BUILTIN_BRANCH_OVERLAY,
456 "branch_overlay",
457 builtins::run_branch_overlay,
458 );
459 register(
460 registry,
461 self.index.clone(),
462 builtins::BUILTIN_FRESHNESS,
463 "freshness",
464 builtins::run_freshness,
465 );
466
467 // Cross-file safe rename (issue #2508). Builds on the typed
468 // symbol graph (#2434) and routes writes through staged-fs
469 // (#1722) so all touched files succeed or none do.
470 register(
471 registry,
472 self.index.clone(),
473 rename::BUILTIN,
474 "rename_symbol",
475 rename::run,
476 );
477 }
478}
479
480/// Programmatic entry point for callers that need to compose
481/// `rename_symbol` with another hostlib capability while sharing the
482/// same in-memory code-index state.
483pub(crate) fn run_rename_symbol(
484 index: &SharedIndex,
485 args: &[VmValue],
486) -> Result<VmValue, HostlibError> {
487 rename::run(index, args)
488}
489
490fn register(
491 registry: &mut BuiltinRegistry,
492 index: SharedIndex,
493 name: &'static str,
494 method: &'static str,
495 runner: fn(&SharedIndex, &[VmValue]) -> Result<VmValue, HostlibError>,
496) {
497 let captured = index;
498 let handler: SyncHandler = Arc::new(move |args| runner(&captured, args));
499 registry.register(RegisteredBuiltin {
500 name,
501 module: "code_index",
502 method,
503 handler,
504 });
505}