Skip to main content

harn_hostlib/scanner/
mod.rs

//! Repo scanner host capability.
//!
//! Deterministic project-wide file enumeration honoring `.gitignore` and
//! the [`extensions::EXCLUDED_DIRS`] table, symbol extraction,
//! import-derived dependency graph, reference + churn + importance
//! scoring, source/test pairing, folder aggregates, project metadata
//! (language stats + detected test commands + code-pattern hints),
//! sub-project detection, and a token-budgeted text repo map.
//!
//! `scan_project` returns the full [`result::ScanResult`] alongside an
//! opaque `snapshot_token` derived from the canonicalized root path. The
//! result is persisted to `<root>/.harn/hostlib/scanner-snapshot.json` so
//! that `scan_incremental` can diff against it later — without forcing the
//! caller to pass the previous result back over the wire.
15
16use std::path::{Path, PathBuf};
17use std::process::Command;
18use std::sync::Arc;
19use std::time::{SystemTime, UNIX_EPOCH};
20
21use harn_vm::VmValue;
22
23use crate::error::HostlibError;
24use crate::registry::{BuiltinRegistry, HostlibCapability};
25use crate::tools::args::{
26    build_dict, dict_arg, optional_bool, optional_int, require_string, str_value,
27};
28
29mod commands;
30mod discover;
31mod extensions;
32mod folders;
33mod git;
34mod imports;
35mod manifest;
36mod result;
37mod scoring;
38mod snapshot;
39mod subproject;
40mod symbols;
41mod test_mapping;
42
/// Remove every ambient `GIT_*` variable from the environment `cmd` will be
/// spawned with.
///
/// Git exports repository-specific `GIT_*` variables while running hooks.
/// Scanner probes must honor their explicit `-C <root>` argument instead.
fn strip_ambient_git_env(cmd: &mut Command) {
    // `vars_os` instead of `vars`: the latter panics as soon as any key or
    // value in the process environment is not valid Unicode, which would
    // take the whole scan down over a variable we do not even care about.
    for (key, _) in std::env::vars_os() {
        // The prefix is pure ASCII, so a lossy conversion cannot hide it.
        if key.to_string_lossy().starts_with("GIT_") {
            cmd.env_remove(&key);
        }
    }
}
52
53pub use git::GitCapabilities;
54pub use result::{
55    DependencyEdge, FileRecord, FolderRecord, LanguageStat, ProjectMetadata, ScanDelta, ScanResult,
56    SubProject, SymbolKind, SymbolRecord,
57};
58
59const SCAN_PROJECT_BUILTIN: &str = "hostlib_scanner_scan_project";
60const SCAN_INCREMENTAL_BUILTIN: &str = "hostlib_scanner_scan_incremental";
61
/// Scanner capability handle.
///
/// A stateless unit type: it only serves as the receiver for the
/// [`HostlibCapability`] implementation that registers the scanner builtins.
#[derive(Debug, Default, Clone, Copy)]
pub struct ScannerCapability;
65
66impl HostlibCapability for ScannerCapability {
67    fn module_name(&self) -> &'static str {
68        "scanner"
69    }
70
71    fn register_builtins(&self, registry: &mut BuiltinRegistry) {
72        registry.register_fn(
73            "scanner",
74            SCAN_PROJECT_BUILTIN,
75            "scan_project",
76            scan_project_handler,
77        );
78        registry.register_fn(
79            "scanner",
80            SCAN_INCREMENTAL_BUILTIN,
81            "scan_incremental",
82            scan_incremental_handler,
83        );
84    }
85}
86
87// MARK: - Public Rust API (used by tests + by harn-cli embedders).
88
/// Options controlling a [`scan_project`] run.
#[derive(Debug, Clone)]
pub struct ScanProjectOptions {
    /// Whether hidden (`.`-prefixed) entries are walked.
    pub include_hidden: bool,
    /// Whether `.gitignore` rules are honored.
    pub respect_gitignore: bool,
    /// Upper bound on the number of files kept; `0` disables the cap.
    pub max_files: usize,
    /// Whether `git log` is consulted to compute churn scores.
    pub include_git_history: bool,
    /// Approximate token budget for the textual repo map.
    pub repo_map_token_budget: usize,
}

impl Default for ScanProjectOptions {
    /// Defaults: visible files only, `.gitignore` honored, no file cap,
    /// git history included, and a 1200-token repo map budget.
    fn default() -> Self {
        let include_hidden = false;
        let respect_gitignore = true;
        let include_git_history = true;
        let max_files = 0;
        let repo_map_token_budget = 1200;
        Self {
            include_hidden,
            respect_gitignore,
            max_files,
            include_git_history,
            repo_map_token_budget,
        }
    }
}
115
116/// Run a full scan of `root`, persist a snapshot, and return the result.
117pub fn scan_project(root: &Path, opts: ScanProjectOptions) -> ScanResult {
118    scan_project_with_git(root, opts, &git::CliGitCapabilities)
119}
120
121/// Run a full scan using caller-supplied Git data.
122///
123/// Embedders normally call [`scan_project`]. Tests and hosts that already
124/// virtualize Git can use this entry point to keep scanner behavior
125/// deterministic without depending on ambient process state.
126pub fn scan_project_with_git(
127    root: &Path,
128    opts: ScanProjectOptions,
129    git: &dyn GitCapabilities,
130) -> ScanResult {
131    let canonical = canonicalize(root);
132    let discover_opts = discover::DiscoverOptions {
133        include_hidden: opts.include_hidden,
134        respect_gitignore: opts.respect_gitignore,
135    };
136    let mut discovered = discover::discover_files(&canonical, discover_opts, git);
137    let truncated = if opts.max_files > 0 && discovered.len() > opts.max_files {
138        discovered.truncate(opts.max_files);
139        true
140    } else {
141        false
142    };
143
144    let (mut files, mut symbols, mut dependencies) = extract_per_file(&discovered);
145
146    scoring::compute_reference_counts(&mut symbols, &files);
147
148    if opts.include_git_history {
149        let churn = git.churn_scores(&canonical);
150        scoring::apply_churn(&mut files, &churn);
151    }
152    scoring::compute_importance_scores(&mut symbols, &files);
153
154    test_mapping::map_test_files(&mut files);
155
156    let folder_records = folders::build_folder_records(&files, &symbols);
157    let test_commands = commands::detect_test_commands(&canonical);
158    let code_patterns = commands::detect_code_patterns(&files, &canonical);
159    let mut project = folders::build_project_metadata(
160        &canonical,
161        &files,
162        test_commands,
163        code_patterns,
164        now_iso8601(),
165    );
166    let repo_map = folders::build_repo_map(&symbols, &files, opts.repo_map_token_budget);
167    let mut sub_projects = subproject::detect_subprojects(&canonical, 2);
168    attach_manifest_dependencies(&canonical, &mut project, &mut sub_projects);
169
170    sort_for_output(&mut files, &mut symbols, &mut dependencies);
171
172    let token = snapshot::root_to_token(&canonical);
173    let result = ScanResult {
174        snapshot_token: token,
175        truncated,
176        project,
177        folders: folder_records,
178        files,
179        symbols,
180        dependencies,
181        sub_projects,
182        repo_map,
183    };
184    snapshot::save(&canonical, &result);
185    result
186}
187
188/// Result returned by [`scan_incremental`].
189#[derive(Clone, Debug)]
190pub struct IncrementalScan {
191    /// Refreshed scan result.
192    pub result: ScanResult,
193    /// Path delta computed against the snapshot.
194    pub delta: ScanDelta,
195}
196
197/// Refresh the snapshot named by `token`. If the snapshot is missing, the
198/// diff is too large (>30%), or `changed_paths` is empty after `>30%` of
199/// the workspace mtime-mismatched, falls back to a full rescan.
200pub fn scan_incremental(
201    token: &str,
202    explicit_changed: Option<&[String]>,
203    opts: ScanProjectOptions,
204) -> IncrementalScan {
205    scan_incremental_with_git(token, explicit_changed, opts, &git::CliGitCapabilities)
206}
207
208/// Refresh a snapshot using caller-supplied Git data.
209pub fn scan_incremental_with_git(
210    token: &str,
211    explicit_changed: Option<&[String]>,
212    opts: ScanProjectOptions,
213    git: &dyn GitCapabilities,
214) -> IncrementalScan {
215    let root = snapshot::token_to_root(token);
216    let canonical = canonicalize(&root);
217
218    let cached = snapshot::load(&canonical);
219    let cached = match cached {
220        Some(c) => c,
221        None => {
222            let result = scan_project_with_git(&canonical, opts, git);
223            return IncrementalScan {
224                result,
225                delta: ScanDelta {
226                    full_rescan: true,
227                    ..ScanDelta::default()
228                },
229            };
230        }
231    };
232
233    let discover_opts = discover::DiscoverOptions {
234        include_hidden: opts.include_hidden,
235        respect_gitignore: opts.respect_gitignore,
236    };
237    let mut current = discover::discover_files(&canonical, discover_opts, git);
238    if opts.max_files > 0 && current.len() > opts.max_files {
239        current.truncate(opts.max_files);
240    }
241
242    let delta = compute_delta(&current, &cached, explicit_changed);
243    let total = current.len();
244    let needs_full_rescan =
245        total > 0 && (delta.added.len() + delta.modified.len()) * 10 > total * 3;
246
247    if needs_full_rescan {
248        let result = scan_project_with_git(&canonical, opts, git);
249        return IncrementalScan {
250            result,
251            delta: ScanDelta {
252                full_rescan: true,
253                ..delta
254            },
255        };
256    }
257
258    if delta.added.is_empty() && delta.modified.is_empty() && delta.removed.is_empty() {
259        return IncrementalScan {
260            result: cached,
261            delta,
262        };
263    }
264
265    // Incremental path: rebuild only the touched files, then re-finalize.
266    let mut files = cached.files;
267    let mut symbols = cached.symbols;
268    let mut dependencies = cached.dependencies;
269
270    let removed_set: std::collections::HashSet<&str> =
271        delta.removed.iter().map(|s| s.as_str()).collect();
272    let touched_set: std::collections::HashSet<&str> = delta
273        .added
274        .iter()
275        .chain(delta.modified.iter())
276        .map(|s| s.as_str())
277        .collect();
278
279    files.retain(|f| !removed_set.contains(f.relative_path.as_str()));
280    symbols.retain(|s| {
281        !removed_set.contains(s.file_path.as_str()) && !touched_set.contains(s.file_path.as_str())
282    });
283    dependencies.retain(|d| {
284        !removed_set.contains(d.from_file.as_str()) && !touched_set.contains(d.from_file.as_str())
285    });
286
287    let touched_entries: Vec<discover::DiscoveredFile> = current
288        .iter()
289        .filter(|e| touched_set.contains(e.relative_path.as_str()))
290        .cloned()
291        .collect();
292    let (new_files, new_symbols, new_deps) = extract_per_file(&touched_entries);
293
294    let mut by_path: std::collections::BTreeMap<String, FileRecord> = files
295        .into_iter()
296        .map(|f| (f.relative_path.clone(), f))
297        .collect();
298    for new_file in new_files {
299        by_path.insert(new_file.relative_path.clone(), new_file);
300    }
301    let mut files: Vec<FileRecord> = by_path.into_values().collect();
302    symbols.extend(new_symbols);
303    dependencies.extend(new_deps);
304
305    scoring::compute_reference_counts(&mut symbols, &files);
306    if opts.include_git_history {
307        let churn = git.churn_scores(&canonical);
308        scoring::apply_churn(&mut files, &churn);
309    }
310    scoring::compute_importance_scores(&mut symbols, &files);
311    test_mapping::map_test_files(&mut files);
312
313    let folder_records = folders::build_folder_records(&files, &symbols);
314    let test_commands = commands::detect_test_commands(&canonical);
315    let code_patterns = commands::detect_code_patterns(&files, &canonical);
316    let mut project = folders::build_project_metadata(
317        &canonical,
318        &files,
319        test_commands,
320        code_patterns,
321        now_iso8601(),
322    );
323    let repo_map = folders::build_repo_map(&symbols, &files, opts.repo_map_token_budget);
324    let mut sub_projects = subproject::detect_subprojects(&canonical, 2);
325    attach_manifest_dependencies(&canonical, &mut project, &mut sub_projects);
326
327    sort_for_output(&mut files, &mut symbols, &mut dependencies);
328
329    let token = snapshot::root_to_token(&canonical);
330    let result = ScanResult {
331        snapshot_token: token,
332        truncated: cached.truncated,
333        project,
334        folders: folder_records,
335        files,
336        symbols,
337        dependencies,
338        sub_projects,
339        repo_map,
340    };
341    snapshot::save(&canonical, &result);
342    IncrementalScan { result, delta }
343}
344
345// MARK: - Internals
346
/// Resolve `root` to its canonical form, falling back to `root` unchanged
/// when the filesystem cannot canonicalize it (for example, the path does
/// not exist).
fn canonicalize(root: &Path) -> PathBuf {
    match std::fs::canonicalize(root) {
        Ok(resolved) => resolved,
        Err(_) => root.to_path_buf(),
    }
}
350
351/// Compute package-manifest dependencies for the root and each detected
352/// sub-project. Centralized here so manifest parsing (in [`manifest`]) is
353/// invoked exactly once per project directory by both the full and the
354/// incremental scan paths.
355fn attach_manifest_dependencies(
356    canonical: &Path,
357    project: &mut ProjectMetadata,
358    sub_projects: &mut [SubProject],
359) {
360    project.available_dependencies = manifest::directory_dependencies(canonical);
361    for sp in sub_projects.iter_mut() {
362        sp.dependencies = manifest::directory_dependencies(Path::new(&sp.path));
363    }
364}
365
366fn extract_per_file(
367    discovered: &[discover::DiscoveredFile],
368) -> (Vec<FileRecord>, Vec<SymbolRecord>, Vec<DependencyEdge>) {
369    let mut files: Vec<FileRecord> = Vec::with_capacity(discovered.len());
370    let mut symbols: Vec<SymbolRecord> = Vec::new();
371    let mut dependencies: Vec<DependencyEdge> = Vec::new();
372
373    for entry in discovered {
374        let metadata = std::fs::metadata(&entry.absolute_path);
375        let size = metadata.as_ref().map(|m| m.len()).unwrap_or(0);
376        let modified = metadata
377            .as_ref()
378            .ok()
379            .and_then(|m| m.modified().ok())
380            .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
381            .map(|d| d.as_millis() as i64)
382            .unwrap_or(0);
383
384        let content = std::fs::read_to_string(&entry.absolute_path).unwrap_or_default();
385        if content.is_empty() && size != 0 {
386            // Likely a non-utf8 binary; skip symbol/import extraction but still record the file.
387        }
388        let language = extensions::file_extension(&entry.relative_path);
389        let imports = imports::extract_imports(&content, &language);
390        let file_symbols = symbols::extract_symbols(&content, &language, &entry.relative_path);
391        let line_count = crate::text::count_lines(content.as_bytes()) as usize;
392
393        for imp in &imports {
394            dependencies.push(DependencyEdge {
395                from_file: entry.relative_path.clone(),
396                to_module: imp.clone(),
397            });
398        }
399        symbols.extend(file_symbols);
400
401        files.push(FileRecord {
402            id: entry.relative_path.clone(),
403            relative_path: entry.relative_path.clone(),
404            file_name: extensions::file_name(&entry.relative_path).to_string(),
405            language,
406            line_count,
407            size_bytes: size,
408            last_modified_unix_ms: modified,
409            imports,
410            churn_score: 0.0,
411            corresponding_test_file: None,
412        });
413    }
414
415    (files, symbols, dependencies)
416}
417
418fn sort_for_output(
419    files: &mut [FileRecord],
420    symbols: &mut [SymbolRecord],
421    dependencies: &mut [DependencyEdge],
422) {
423    files.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
424    symbols.sort_by(|a, b| a.id.cmp(&b.id));
425    dependencies.sort_by(|a, b| {
426        a.from_file
427            .cmp(&b.from_file)
428            .then_with(|| a.to_module.cmp(&b.to_module))
429    });
430}
431
432fn compute_delta(
433    current: &[discover::DiscoveredFile],
434    cached: &ScanResult,
435    explicit_changed: Option<&[String]>,
436) -> ScanDelta {
437    let cached_files: std::collections::BTreeMap<&str, &FileRecord> = cached
438        .files
439        .iter()
440        .map(|f| (f.relative_path.as_str(), f))
441        .collect();
442    let current_paths: std::collections::HashSet<&str> =
443        current.iter().map(|e| e.relative_path.as_str()).collect();
444
445    let added: Vec<String> = current
446        .iter()
447        .filter(|e| !cached_files.contains_key(e.relative_path.as_str()))
448        .map(|e| e.relative_path.clone())
449        .collect();
450    let removed: Vec<String> = cached
451        .files
452        .iter()
453        .filter(|f| !current_paths.contains(f.relative_path.as_str()))
454        .map(|f| f.relative_path.clone())
455        .collect();
456
457    let modified: Vec<String> = if let Some(explicit) = explicit_changed {
458        explicit
459            .iter()
460            .filter(|p| cached_files.contains_key(p.as_str()) && current_paths.contains(p.as_str()))
461            .cloned()
462            .collect()
463    } else {
464        let mut out = Vec::new();
465        for entry in current {
466            if let Some(prev) = cached_files.get(entry.relative_path.as_str()) {
467                let meta = std::fs::metadata(&entry.absolute_path).ok();
468                let mtime = meta
469                    .as_ref()
470                    .and_then(|m| m.modified().ok())
471                    .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
472                    .map(|d| d.as_millis() as i64)
473                    .unwrap_or(0);
474                let size = meta.as_ref().map(|m| m.len()).unwrap_or(prev.size_bytes);
475                // A newer mtime is the cheap common signal, but mtime
476                // granularity collides on same-turn/same-second edits (and on
477                // coarse-granularity filesystems), silently dropping the edit.
478                // A changed byte size is an mtime-independent modification
479                // signal that catches the overwhelmingly common add/remove edit
480                // for free — `meta.len()` is already in hand. Without it, an
481                // agent that writes a file and re-scans in the same instant
482                // keeps reading the pre-edit symbol facts.
483                if mtime > prev.last_modified_unix_ms || size != prev.size_bytes {
484                    out.push(entry.relative_path.clone());
485                }
486            }
487        }
488        out
489    };
490
491    ScanDelta {
492        added,
493        modified,
494        removed,
495        full_rescan: false,
496    }
497}
498
499fn now_iso8601() -> String {
500    let now = SystemTime::now()
501        .duration_since(UNIX_EPOCH)
502        .unwrap_or_default();
503    let secs = now.as_secs() as i64;
504    let nanos = now.subsec_nanos();
505    let (year, month, day, hour, minute, second) = unix_to_civil(secs);
506    format!(
507        "{year:04}-{month:02}-{day:02}T{hour:02}:{minute:02}:{second:02}.{millis:03}Z",
508        millis = nanos / 1_000_000
509    )
510}
511
/// Split a Unix timestamp (seconds, UTC) into
/// `(year, month, day, hour, minute, second)`.
///
/// Implements Howard Hinnant's days-to-civil algorithm so the crate does not
/// need `chrono` for a single formatter. Euclidean division keeps timestamps
/// before 1970 on the correct day.
fn unix_to_civil(secs: i64) -> (i64, u32, u32, u32, u32, u32) {
    const SECS_PER_DAY: i64 = 86_400;
    const DAYS_PER_ERA: i64 = 146_097; // one 400-year Gregorian cycle

    let days = secs.div_euclid(SECS_PER_DAY);
    let secs_of_day = secs.rem_euclid(SECS_PER_DAY);
    let hour = (secs_of_day / 3600) as u32;
    let minute = (secs_of_day % 3600 / 60) as u32;
    let second = (secs_of_day % 60) as u32;

    // Shift the epoch from 1970-01-01 to 0000-03-01 so leap days fall at the
    // end of the (March-based) year.
    let shifted = days + 719_468;
    let era = shifted.div_euclid(DAYS_PER_ERA);
    let day_of_era = shifted.rem_euclid(DAYS_PER_ERA) as u64;
    let year_of_era =
        (day_of_era - day_of_era / 1460 + day_of_era / 36_524 - day_of_era / 146_096) / 365;
    let day_of_year = day_of_era - (365 * year_of_era + year_of_era / 4 - year_of_era / 100);
    // Month index counted from March (0 = March … 11 = February).
    let month_index = (5 * day_of_year + 2) / 153;
    let day = (day_of_year - (153 * month_index + 2) / 5 + 1) as u32;
    let month = if month_index < 10 {
        (month_index + 3) as u32
    } else {
        (month_index - 9) as u32
    };
    // January and February belong to the following civil year.
    let year = year_of_era as i64 + era * 400 + i64::from(month <= 2);

    (year, month, day, hour, minute, second)
}
535
536// MARK: - Builtin handlers (Harn dict ↔ Rust struct).
537
538fn scan_project_handler(args: &[VmValue]) -> Result<VmValue, HostlibError> {
539    let raw = dict_arg(SCAN_PROJECT_BUILTIN, args)?;
540    let dict = raw.as_ref();
541    let root = require_string(SCAN_PROJECT_BUILTIN, dict, "root")?;
542    let opts = parse_options(SCAN_PROJECT_BUILTIN, dict)?;
543    let result = scan_project(Path::new(&root), opts);
544    Ok(scan_result_to_value(&result, None))
545}
546
547fn scan_incremental_handler(args: &[VmValue]) -> Result<VmValue, HostlibError> {
548    let raw = dict_arg(SCAN_INCREMENTAL_BUILTIN, args)?;
549    let dict = raw.as_ref();
550    let token = require_string(SCAN_INCREMENTAL_BUILTIN, dict, "snapshot_token")?;
551    let opts = parse_options(SCAN_INCREMENTAL_BUILTIN, dict)?;
552    let changed = parse_changed_paths(SCAN_INCREMENTAL_BUILTIN, dict)?;
553    let scan = scan_incremental(&token, changed.as_deref(), opts);
554    Ok(scan_result_to_value(&scan.result, Some(&scan.delta)))
555}
556
557fn parse_options(
558    builtin: &'static str,
559    dict: &harn_vm::value::DictMap,
560) -> Result<ScanProjectOptions, HostlibError> {
561    let include_hidden = optional_bool(builtin, dict, "include_hidden", false)?;
562    let respect_gitignore = optional_bool(builtin, dict, "respect_gitignore", true)?;
563    let max_files = optional_int(builtin, dict, "max_files", 0)?;
564    let include_git_history_default = builtin == SCAN_PROJECT_BUILTIN;
565    let include_git_history = optional_bool(
566        builtin,
567        dict,
568        "include_git_history",
569        include_git_history_default,
570    )?;
571    let repo_map_token_budget = optional_int(builtin, dict, "repo_map_token_budget", 1200)?;
572    if max_files < 0 {
573        return Err(HostlibError::InvalidParameter {
574            builtin,
575            param: "max_files",
576            message: "must be >= 0".to_string(),
577        });
578    }
579    if repo_map_token_budget < 0 {
580        return Err(HostlibError::InvalidParameter {
581            builtin,
582            param: "repo_map_token_budget",
583            message: "must be >= 0".to_string(),
584        });
585    }
586    Ok(ScanProjectOptions {
587        include_hidden,
588        respect_gitignore,
589        max_files: max_files as usize,
590        include_git_history,
591        repo_map_token_budget: repo_map_token_budget as usize,
592    })
593}
594
595fn parse_changed_paths(
596    builtin: &'static str,
597    dict: &harn_vm::value::DictMap,
598) -> Result<Option<Vec<String>>, HostlibError> {
599    let value = match dict.get("changed_paths") {
600        None | Some(VmValue::Nil) => return Ok(None),
601        Some(v) => v,
602    };
603    let list = match value {
604        VmValue::List(items) => items,
605        other => {
606            return Err(HostlibError::InvalidParameter {
607                builtin,
608                param: "changed_paths",
609                message: format!("expected list of strings, got {}", other.type_name()),
610            });
611        }
612    };
613    let mut out = Vec::with_capacity(list.len());
614    for item in list.iter() {
615        match item {
616            VmValue::String(s) => out.push(s.to_string()),
617            other => {
618                return Err(HostlibError::InvalidParameter {
619                    builtin,
620                    param: "changed_paths",
621                    message: format!("non-string entry: {}", other.type_name()),
622                });
623            }
624        }
625    }
626    Ok(Some(out))
627}
628
629fn scan_result_to_value(result: &ScanResult, delta: Option<&ScanDelta>) -> VmValue {
630    let mut entries: Vec<(&'static str, VmValue)> = vec![
631        ("snapshot_token", str_value(&result.snapshot_token)),
632        ("truncated", VmValue::Bool(result.truncated)),
633        ("project", project_to_value(&result.project)),
634        ("folders", list_of(&result.folders, folder_to_value)),
635        ("files", list_of(&result.files, file_to_value)),
636        ("symbols", list_of(&result.symbols, symbol_to_value)),
637        (
638            "dependencies",
639            list_of(&result.dependencies, dependency_to_value),
640        ),
641        (
642            "sub_projects",
643            list_of(&result.sub_projects, subproject_to_value),
644        ),
645        ("repo_map", str_value(&result.repo_map)),
646    ];
647    if let Some(d) = delta {
648        entries.push(("delta", delta_to_value(d)));
649    }
650    build_dict(entries)
651}
652
653fn list_of<T>(items: &[T], to_value: fn(&T) -> VmValue) -> VmValue {
654    let list: Vec<VmValue> = items.iter().map(to_value).collect();
655    VmValue::List(Arc::new(list))
656}
657
658fn project_to_value(project: &ProjectMetadata) -> VmValue {
659    let test_commands_entries: Vec<(String, VmValue)> = project
660        .test_commands
661        .iter()
662        .map(|(k, v)| (k.clone(), str_value(v)))
663        .collect();
664    let test_commands_dict = build_dict(test_commands_entries);
665
666    let detected: VmValue = project
667        .detected_test_command
668        .as_deref()
669        .map(str_value)
670        .unwrap_or(VmValue::Nil);
671
672    let code_patterns: Vec<VmValue> = project.code_patterns.iter().map(str_value).collect();
673    let available_dependencies: Vec<VmValue> = project
674        .available_dependencies
675        .iter()
676        .map(str_value)
677        .collect();
678
679    build_dict([
680        ("name", str_value(&project.name)),
681        ("root_path", str_value(&project.root_path)),
682        ("languages", list_of(&project.languages, language_to_value)),
683        ("test_commands", test_commands_dict),
684        ("detected_test_command", detected),
685        ("code_patterns", VmValue::List(Arc::new(code_patterns))),
686        ("total_files", VmValue::Int(project.total_files as i64)),
687        ("total_lines", VmValue::Int(project.total_lines as i64)),
688        ("last_scanned_at", str_value(&project.last_scanned_at)),
689        (
690            "available_dependencies",
691            VmValue::List(Arc::new(available_dependencies)),
692        ),
693    ])
694}
695
696fn language_to_value(stat: &LanguageStat) -> VmValue {
697    build_dict([
698        ("name", str_value(&stat.name)),
699        ("file_count", VmValue::Int(stat.file_count as i64)),
700        ("line_count", VmValue::Int(stat.line_count as i64)),
701        ("percentage", VmValue::Float(stat.percentage)),
702    ])
703}
704
705fn folder_to_value(folder: &FolderRecord) -> VmValue {
706    let names: Vec<VmValue> = folder.key_symbol_names.iter().map(str_value).collect();
707    build_dict([
708        ("id", str_value(&folder.id)),
709        ("relative_path", str_value(&folder.relative_path)),
710        ("file_count", VmValue::Int(folder.file_count as i64)),
711        ("line_count", VmValue::Int(folder.line_count as i64)),
712        ("dominant_language", str_value(&folder.dominant_language)),
713        ("key_symbol_names", VmValue::List(Arc::new(names))),
714    ])
715}
716
717fn file_to_value(file: &FileRecord) -> VmValue {
718    let imports: Vec<VmValue> = file.imports.iter().map(str_value).collect();
719    let test_pair = file
720        .corresponding_test_file
721        .as_deref()
722        .map(str_value)
723        .unwrap_or(VmValue::Nil);
724    build_dict([
725        ("id", str_value(&file.id)),
726        ("relative_path", str_value(&file.relative_path)),
727        ("file_name", str_value(&file.file_name)),
728        ("language", str_value(&file.language)),
729        ("line_count", VmValue::Int(file.line_count as i64)),
730        ("size_bytes", VmValue::Int(file.size_bytes as i64)),
731        (
732            "last_modified_unix_ms",
733            VmValue::Int(file.last_modified_unix_ms),
734        ),
735        ("imports", VmValue::List(Arc::new(imports))),
736        ("churn_score", VmValue::Float(file.churn_score)),
737        ("corresponding_test_file", test_pair),
738    ])
739}
740
741fn symbol_to_value(symbol: &SymbolRecord) -> VmValue {
742    let container = symbol
743        .container
744        .as_deref()
745        .map(str_value)
746        .unwrap_or(VmValue::Nil);
747    build_dict([
748        ("id", str_value(&symbol.id)),
749        ("name", str_value(&symbol.name)),
750        ("kind", str_value(symbol.kind.keyword())),
751        ("file_path", str_value(&symbol.file_path)),
752        ("line", VmValue::Int(symbol.line as i64)),
753        ("signature", str_value(&symbol.signature)),
754        ("container", container),
755        (
756            "reference_count",
757            VmValue::Int(symbol.reference_count as i64),
758        ),
759        ("importance_score", VmValue::Float(symbol.importance_score)),
760    ])
761}
762
763fn dependency_to_value(dep: &DependencyEdge) -> VmValue {
764    build_dict([
765        ("from_file", str_value(&dep.from_file)),
766        ("to_module", str_value(&dep.to_module)),
767    ])
768}
769
770fn subproject_to_value(sp: &SubProject) -> VmValue {
771    let dependencies: Vec<VmValue> = sp.dependencies.iter().map(str_value).collect();
772    build_dict([
773        ("path", str_value(&sp.path)),
774        ("name", str_value(&sp.name)),
775        ("language", str_value(&sp.language)),
776        ("project_marker", str_value(&sp.project_marker)),
777        ("dependencies", VmValue::List(Arc::new(dependencies))),
778    ])
779}
780
781fn delta_to_value(delta: &ScanDelta) -> VmValue {
782    let added: Vec<VmValue> = delta.added.iter().map(str_value).collect();
783    let modified: Vec<VmValue> = delta.modified.iter().map(str_value).collect();
784    let removed: Vec<VmValue> = delta.removed.iter().map(str_value).collect();
785    build_dict([
786        ("added", VmValue::List(Arc::new(added))),
787        ("modified", VmValue::List(Arc::new(modified))),
788        ("removed", VmValue::List(Arc::new(removed))),
789        ("full_rescan", VmValue::Bool(delta.full_rescan)),
790    ])
791}
792
#[cfg(test)]
mod tests {
    use super::*;
    use filetime::{set_file_mtime, FileTime};
    use std::fs;

    /// Path of the single seeded source file, relative to the temp root.
    const LIB_RS: &str = "src/lib.rs";

    /// Create a temp project containing `src/lib.rs` with `contents`.
    ///
    /// Returns the temp-dir guard (keep it alive for the test's duration),
    /// the absolute path of the seeded file, and the snapshot token. The root
    /// is canonicalized so the snapshot token matches across calls.
    fn seed_project(contents: &str) -> (tempfile::TempDir, std::path::PathBuf, String) {
        let dir = tempfile::tempdir().unwrap();
        fs::create_dir_all(dir.path().join("src")).unwrap();
        let file = dir.path().join(LIB_RS);
        fs::write(&file, contents).unwrap();

        let canonical = std::fs::canonicalize(dir.path()).unwrap();
        let token = canonical.to_string_lossy().to_string();
        (dir, file, token)
    }

    /// The mtime (unix ms) the scan recorded for the seeded file.
    fn indexed_mtime(scan: &IncrementalScan) -> i64 {
        scan.result
            .files
            .iter()
            .find(|r| r.relative_path == LIB_RS)
            .expect("seed file indexed")
            .last_modified_unix_ms
    }

    /// Force `file`'s mtime to `unix_ms`, simulating an edit the OS could not
    /// distinguish from the cached state by mtime.
    fn pin_mtime(file: &std::path::Path, unix_ms: i64) {
        let secs = unix_ms / 1000;
        let nanos = ((unix_ms % 1000) * 1_000_000) as u32;
        set_file_mtime(file, FileTime::from_unix_time(secs, nanos)).unwrap();
    }

    fn symbol_names(scan: &IncrementalScan) -> Vec<String> {
        scan.result.symbols.iter().map(|s| s.name.clone()).collect()
    }

    #[test]
    fn builtin_option_defaults_match_request_schemas() {
        let empty = harn_vm::value::DictMap::new();

        let full = parse_options(SCAN_PROJECT_BUILTIN, &empty).unwrap();
        let incremental = parse_options(SCAN_INCREMENTAL_BUILTIN, &empty).unwrap();

        assert!(full.include_git_history);
        assert!(!incremental.include_git_history);
    }

    /// Regression guard: an agent that writes a file and re-scans in the same
    /// instant must see its own edit. `compute_delta`'s mtime comparison
    /// collides on same-millisecond/same-second writes (and on
    /// coarse-granularity filesystems), so the size-change fallback is what
    /// keeps same-turn index freshness honest. Before the fallback this
    /// returned the pre-edit symbol set, feeding fuzzy-match-stale loops on
    /// cheap local models.
    #[test]
    fn scan_incremental_detects_same_mtime_size_changing_edit() {
        let (_dir, file, token) = seed_project("pub fn old_symbol() {}\n");
        let opts = ScanProjectOptions::default();

        let first = scan_incremental(&token, None, opts.clone());
        let cached_mtime = indexed_mtime(&first);
        assert!(symbol_names(&first).iter().any(|n| n == "old_symbol"));

        // Add a symbol (byte size grows), then pin the mtime back to the
        // cached value.
        fs::write(
            &file,
            "pub fn old_symbol() {}\npub fn brand_new_symbol() {}\n",
        )
        .unwrap();
        pin_mtime(&file, cached_mtime);

        let second = scan_incremental(&token, None, opts);
        let names = symbol_names(&second);
        assert!(
            names.iter().any(|n| n == "brand_new_symbol"),
            "same-mtime size-changing edit must be reindexed, got {names:?} (delta.modified={:?})",
            second.delta.modified,
        );
    }

    /// Companion guard: even when an edit changes nothing the scanner can
    /// cheaply detect (same mtime AND same byte size — e.g. a length-preserving
    /// one-character swap), passing the explicit `changed_paths` signal still
    /// forces the reindex. The agent loop threads its own write through this
    /// bypass, so freshness never depends on mtime/size heuristics for the
    /// agent's own edits.
    #[test]
    fn scan_incremental_changed_paths_bypasses_metadata_heuristics() {
        // 23 bytes.
        let (_dir, file, token) = seed_project("pub fn alpha_name() {}\n");
        let opts = ScanProjectOptions::default();

        let first = scan_incremental(&token, None, opts.clone());
        let cached_mtime = indexed_mtime(&first);
        assert!(symbol_names(&first).iter().any(|n| n == "alpha_name"));

        // Length-preserving rename (same 23 bytes), same forced mtime: neither
        // the size nor the mtime heuristic can see this.
        fs::write(&file, "pub fn omega_name() {}\n").unwrap();
        pin_mtime(&file, cached_mtime);

        // Without an explicit signal the heuristics legitimately miss this
        // rare length-preserving same-instant case...
        let heuristic_only = scan_incremental(&token, None, opts.clone());
        assert!(
            !heuristic_only
                .delta
                .modified
                .contains(&"src/lib.rs".to_string()),
            "documenting the heuristic's known blind spot",
        );

        // ...but the explicit changed-path signal the agent loop passes after
        // its own write forces the reindex regardless.
        let explicit = scan_incremental(&token, Some(&["src/lib.rs".to_string()]), opts);
        assert!(
            symbol_names(&explicit).iter().any(|n| n == "omega_name"),
            "explicit changed_paths must always reindex, got {:?}",
            symbol_names(&explicit),
        );
    }
}
915            "explicit changed_paths must always reindex, got {:?}",
916            symbol_names(&explicit),
917        );
918    }
919}