Skip to main content

harn_hostlib/scanner/
mod.rs

//! Repo scanner host capability.
//!
//! Deterministic project-wide file enumeration honoring `.gitignore` and
//! the [`extensions::EXCLUDED_DIRS`] table, symbol extraction,
//! import-derived dependency graph, reference + churn + importance
//! scoring, source/test pairing, folder aggregates, project metadata
//! (language stats + detected test commands + code-pattern hints),
//! sub-project detection, and a token-budgeted text repo map.
//!
//! `scan_project` returns the full [`result::ScanResult`] alongside an
//! opaque `snapshot_token` derived from the canonicalized root path. The
//! result is persisted to `<root>/.harn/hostlib/scanner-snapshot.json` so
//! that `scan_incremental` can diff against it later — without forcing the
//! caller to pass the previous result back over the wire.
15
16use std::path::{Path, PathBuf};
17use std::process::Command;
18use std::sync::Arc;
19use std::time::{SystemTime, UNIX_EPOCH};
20
21use harn_vm::VmValue;
22
23use crate::error::HostlibError;
24use crate::registry::{BuiltinRegistry, HostlibCapability, RegisteredBuiltin, SyncHandler};
25use crate::tools::args::{
26    build_dict, dict_arg, optional_bool, optional_int, require_string, str_value,
27};
28
29mod commands;
30mod discover;
31mod extensions;
32mod folders;
33mod git;
34mod imports;
35mod manifest;
36mod result;
37mod scoring;
38mod snapshot;
39mod subproject;
40mod symbols;
41mod test_mapping;
42
/// Remove every ambient `GIT_*` variable from the environment `cmd` will run with.
///
/// Git exports repository-specific `GIT_*` variables while running hooks.
/// Scanner probes must honor their explicit `-C <root>` argument instead.
///
/// The process environment is enumerated with `vars_os` rather than `vars`:
/// `std::env::vars` panics as soon as any (possibly unrelated) key or value is
/// not valid Unicode, which would take the whole probe down.
fn strip_ambient_git_env(cmd: &mut Command) {
    for (key, _) in std::env::vars_os() {
        // A lossy conversion is sufficient for a prefix test: an ASCII `GIT_`
        // prefix is unaffected by replacement of invalid sequences later on.
        if key.to_string_lossy().starts_with("GIT_") {
            cmd.env_remove(&key);
        }
    }
}
52
53pub use git::GitCapabilities;
54pub use result::{
55    DependencyEdge, FileRecord, FolderRecord, LanguageStat, ProjectMetadata, ScanDelta, ScanResult,
56    SubProject, SymbolKind, SymbolRecord,
57};
58
/// Registry name under which the full-scan builtin is registered; also passed
/// to the argument helpers so their errors name the builtin.
const SCAN_PROJECT_BUILTIN: &str = "hostlib_scanner_scan_project";
/// Registry name under which the incremental-scan builtin is registered; also
/// passed to the argument helpers so their errors name the builtin.
const SCAN_INCREMENTAL_BUILTIN: &str = "hostlib_scanner_scan_incremental";
61
62/// Scanner capability handle.
63#[derive(Default)]
64pub struct ScannerCapability;
65
66impl HostlibCapability for ScannerCapability {
67    fn module_name(&self) -> &'static str {
68        "scanner"
69    }
70
71    fn register_builtins(&self, registry: &mut BuiltinRegistry) {
72        let scan_project: SyncHandler = Arc::new(scan_project_handler);
73        registry.register(RegisteredBuiltin {
74            name: SCAN_PROJECT_BUILTIN,
75            module: "scanner",
76            method: "scan_project",
77            handler: scan_project,
78        });
79        let scan_incremental: SyncHandler = Arc::new(scan_incremental_handler);
80        registry.register(RegisteredBuiltin {
81            name: SCAN_INCREMENTAL_BUILTIN,
82            module: "scanner",
83            method: "scan_incremental",
84            handler: scan_incremental,
85        });
86    }
87}
88
89// MARK: - Public Rust API (used by tests + by harn-cli embedders).
90
/// Caller-tunable settings accepted by [`scan_project`].
#[derive(Clone, Debug)]
pub struct ScanProjectOptions {
    /// Walk hidden (dot-prefixed) entries as well.
    pub include_hidden: bool,
    /// Apply `.gitignore` rules during discovery.
    pub respect_gitignore: bool,
    /// Upper bound on the number of files kept; `0` disables the cap.
    pub max_files: usize,
    /// Query Git history so files receive churn scores.
    pub include_git_history: bool,
    /// Approximate token budget for the text repo map.
    pub repo_map_token_budget: usize,
}

impl Default for ScanProjectOptions {
    /// Defaults: hidden files skipped, `.gitignore` honored, no file cap,
    /// Git history included, and a 1200-token repo map budget.
    fn default() -> Self {
        ScanProjectOptions {
            repo_map_token_budget: 1200,
            include_git_history: true,
            max_files: 0,
            respect_gitignore: true,
            include_hidden: false,
        }
    }
}
117
118/// Run a full scan of `root`, persist a snapshot, and return the result.
119pub fn scan_project(root: &Path, opts: ScanProjectOptions) -> ScanResult {
120    scan_project_with_git(root, opts, &git::CliGitCapabilities)
121}
122
123/// Run a full scan using caller-supplied Git data.
124///
125/// Embedders normally call [`scan_project`]. Tests and hosts that already
126/// virtualize Git can use this entry point to keep scanner behavior
127/// deterministic without depending on ambient process state.
128pub fn scan_project_with_git(
129    root: &Path,
130    opts: ScanProjectOptions,
131    git: &dyn GitCapabilities,
132) -> ScanResult {
133    let canonical = canonicalize(root);
134    let discover_opts = discover::DiscoverOptions {
135        include_hidden: opts.include_hidden,
136        respect_gitignore: opts.respect_gitignore,
137    };
138    let mut discovered = discover::discover_files(&canonical, discover_opts, git);
139    let truncated = if opts.max_files > 0 && discovered.len() > opts.max_files {
140        discovered.truncate(opts.max_files);
141        true
142    } else {
143        false
144    };
145
146    let (mut files, mut symbols, mut dependencies) = extract_per_file(&discovered);
147
148    scoring::compute_reference_counts(&mut symbols, &files);
149
150    if opts.include_git_history {
151        let churn = git.churn_scores(&canonical);
152        scoring::apply_churn(&mut files, &churn);
153    }
154    scoring::compute_importance_scores(&mut symbols, &files);
155
156    test_mapping::map_test_files(&mut files);
157
158    let folder_records = folders::build_folder_records(&files, &symbols);
159    let test_commands = commands::detect_test_commands(&canonical);
160    let code_patterns = commands::detect_code_patterns(&files, &canonical);
161    let mut project = folders::build_project_metadata(
162        &canonical,
163        &files,
164        test_commands,
165        code_patterns,
166        now_iso8601(),
167    );
168    let repo_map = folders::build_repo_map(&symbols, &files, opts.repo_map_token_budget);
169    let mut sub_projects = subproject::detect_subprojects(&canonical, 2);
170    attach_manifest_dependencies(&canonical, &mut project, &mut sub_projects);
171
172    sort_for_output(&mut files, &mut symbols, &mut dependencies);
173
174    let token = snapshot::root_to_token(&canonical);
175    let result = ScanResult {
176        snapshot_token: token,
177        truncated,
178        project,
179        folders: folder_records,
180        files,
181        symbols,
182        dependencies,
183        sub_projects,
184        repo_map,
185    };
186    snapshot::save(&canonical, &result);
187    result
188}
189
/// Result returned by [`scan_incremental`].
///
/// Pairs the scan result with the path-level delta that was computed
/// against the stored snapshot.
#[derive(Clone, Debug)]
pub struct IncrementalScan {
    /// Refreshed scan result.
    pub result: ScanResult,
    /// Path delta computed against the snapshot.
    pub delta: ScanDelta,
}
198
199/// Refresh the snapshot named by `token`. If the snapshot is missing, the
200/// diff is too large (>30%), or `changed_paths` is empty after `>30%` of
201/// the workspace mtime-mismatched, falls back to a full rescan.
202pub fn scan_incremental(
203    token: &str,
204    explicit_changed: Option<&[String]>,
205    opts: ScanProjectOptions,
206) -> IncrementalScan {
207    scan_incremental_with_git(token, explicit_changed, opts, &git::CliGitCapabilities)
208}
209
210/// Refresh a snapshot using caller-supplied Git data.
211pub fn scan_incremental_with_git(
212    token: &str,
213    explicit_changed: Option<&[String]>,
214    opts: ScanProjectOptions,
215    git: &dyn GitCapabilities,
216) -> IncrementalScan {
217    let root = snapshot::token_to_root(token);
218    let canonical = canonicalize(&root);
219
220    let cached = snapshot::load(&canonical);
221    let cached = match cached {
222        Some(c) => c,
223        None => {
224            let result = scan_project_with_git(&canonical, opts, git);
225            return IncrementalScan {
226                result,
227                delta: ScanDelta {
228                    full_rescan: true,
229                    ..ScanDelta::default()
230                },
231            };
232        }
233    };
234
235    let discover_opts = discover::DiscoverOptions {
236        include_hidden: opts.include_hidden,
237        respect_gitignore: opts.respect_gitignore,
238    };
239    let mut current = discover::discover_files(&canonical, discover_opts, git);
240    if opts.max_files > 0 && current.len() > opts.max_files {
241        current.truncate(opts.max_files);
242    }
243
244    let delta = compute_delta(&current, &cached, explicit_changed);
245    let total = current.len();
246    let needs_full_rescan =
247        total > 0 && (delta.added.len() + delta.modified.len()) * 10 > total * 3;
248
249    if needs_full_rescan {
250        let result = scan_project_with_git(&canonical, opts, git);
251        return IncrementalScan {
252            result,
253            delta: ScanDelta {
254                full_rescan: true,
255                ..delta
256            },
257        };
258    }
259
260    if delta.added.is_empty() && delta.modified.is_empty() && delta.removed.is_empty() {
261        return IncrementalScan {
262            result: cached,
263            delta,
264        };
265    }
266
267    // Incremental path: rebuild only the touched files, then re-finalize.
268    let mut files = cached.files;
269    let mut symbols = cached.symbols;
270    let mut dependencies = cached.dependencies;
271
272    let removed_set: std::collections::HashSet<&str> =
273        delta.removed.iter().map(|s| s.as_str()).collect();
274    let touched_set: std::collections::HashSet<&str> = delta
275        .added
276        .iter()
277        .chain(delta.modified.iter())
278        .map(|s| s.as_str())
279        .collect();
280
281    files.retain(|f| !removed_set.contains(f.relative_path.as_str()));
282    symbols.retain(|s| {
283        !removed_set.contains(s.file_path.as_str()) && !touched_set.contains(s.file_path.as_str())
284    });
285    dependencies.retain(|d| {
286        !removed_set.contains(d.from_file.as_str()) && !touched_set.contains(d.from_file.as_str())
287    });
288
289    let touched_entries: Vec<discover::DiscoveredFile> = current
290        .iter()
291        .filter(|e| touched_set.contains(e.relative_path.as_str()))
292        .cloned()
293        .collect();
294    let (new_files, new_symbols, new_deps) = extract_per_file(&touched_entries);
295
296    let mut by_path: std::collections::BTreeMap<String, FileRecord> = files
297        .into_iter()
298        .map(|f| (f.relative_path.clone(), f))
299        .collect();
300    for new_file in new_files {
301        by_path.insert(new_file.relative_path.clone(), new_file);
302    }
303    let mut files: Vec<FileRecord> = by_path.into_values().collect();
304    symbols.extend(new_symbols);
305    dependencies.extend(new_deps);
306
307    scoring::compute_reference_counts(&mut symbols, &files);
308    if opts.include_git_history {
309        let churn = git.churn_scores(&canonical);
310        scoring::apply_churn(&mut files, &churn);
311    }
312    scoring::compute_importance_scores(&mut symbols, &files);
313    test_mapping::map_test_files(&mut files);
314
315    let folder_records = folders::build_folder_records(&files, &symbols);
316    let test_commands = commands::detect_test_commands(&canonical);
317    let code_patterns = commands::detect_code_patterns(&files, &canonical);
318    let mut project = folders::build_project_metadata(
319        &canonical,
320        &files,
321        test_commands,
322        code_patterns,
323        now_iso8601(),
324    );
325    let repo_map = folders::build_repo_map(&symbols, &files, opts.repo_map_token_budget);
326    let mut sub_projects = subproject::detect_subprojects(&canonical, 2);
327    attach_manifest_dependencies(&canonical, &mut project, &mut sub_projects);
328
329    sort_for_output(&mut files, &mut symbols, &mut dependencies);
330
331    let token = snapshot::root_to_token(&canonical);
332    let result = ScanResult {
333        snapshot_token: token,
334        truncated: cached.truncated,
335        project,
336        folders: folder_records,
337        files,
338        symbols,
339        dependencies,
340        sub_projects,
341        repo_map,
342    };
343    snapshot::save(&canonical, &result);
344    IncrementalScan { result, delta }
345}
346
347// MARK: - Internals
348
/// Resolve `root` to its canonical form, falling back to `root` unchanged
/// when the filesystem cannot canonicalize it (for example, it does not exist).
fn canonicalize(root: &Path) -> PathBuf {
    match std::fs::canonicalize(root) {
        Ok(resolved) => resolved,
        Err(_) => root.to_path_buf(),
    }
}
352
353/// Compute package-manifest dependencies for the root and each detected
354/// sub-project. Centralized here so manifest parsing (in [`manifest`]) is
355/// invoked exactly once per project directory by both the full and the
356/// incremental scan paths.
357fn attach_manifest_dependencies(
358    canonical: &Path,
359    project: &mut ProjectMetadata,
360    sub_projects: &mut [SubProject],
361) {
362    project.available_dependencies = manifest::directory_dependencies(canonical);
363    for sp in sub_projects.iter_mut() {
364        sp.dependencies = manifest::directory_dependencies(Path::new(&sp.path));
365    }
366}
367
368fn extract_per_file(
369    discovered: &[discover::DiscoveredFile],
370) -> (Vec<FileRecord>, Vec<SymbolRecord>, Vec<DependencyEdge>) {
371    let mut files: Vec<FileRecord> = Vec::with_capacity(discovered.len());
372    let mut symbols: Vec<SymbolRecord> = Vec::new();
373    let mut dependencies: Vec<DependencyEdge> = Vec::new();
374
375    for entry in discovered {
376        let metadata = std::fs::metadata(&entry.absolute_path);
377        let size = metadata.as_ref().map(|m| m.len()).unwrap_or(0);
378        let modified = metadata
379            .as_ref()
380            .ok()
381            .and_then(|m| m.modified().ok())
382            .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
383            .map(|d| d.as_millis() as i64)
384            .unwrap_or(0);
385
386        let content = std::fs::read_to_string(&entry.absolute_path).unwrap_or_default();
387        if content.is_empty() && size != 0 {
388            // Likely a non-utf8 binary; skip symbol/import extraction but still record the file.
389        }
390        let language = extensions::file_extension(&entry.relative_path);
391        let imports = imports::extract_imports(&content, &language);
392        let file_symbols = symbols::extract_symbols(&content, &language, &entry.relative_path);
393        let line_count = crate::text::count_lines(content.as_bytes()) as usize;
394
395        for imp in &imports {
396            dependencies.push(DependencyEdge {
397                from_file: entry.relative_path.clone(),
398                to_module: imp.clone(),
399            });
400        }
401        symbols.extend(file_symbols);
402
403        files.push(FileRecord {
404            id: entry.relative_path.clone(),
405            relative_path: entry.relative_path.clone(),
406            file_name: extensions::file_name(&entry.relative_path).to_string(),
407            language,
408            line_count,
409            size_bytes: size,
410            last_modified_unix_ms: modified,
411            imports,
412            churn_score: 0.0,
413            corresponding_test_file: None,
414        });
415    }
416
417    (files, symbols, dependencies)
418}
419
420fn sort_for_output(
421    files: &mut [FileRecord],
422    symbols: &mut [SymbolRecord],
423    dependencies: &mut [DependencyEdge],
424) {
425    files.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
426    symbols.sort_by(|a, b| a.id.cmp(&b.id));
427    dependencies.sort_by(|a, b| {
428        a.from_file
429            .cmp(&b.from_file)
430            .then_with(|| a.to_module.cmp(&b.to_module))
431    });
432}
433
434fn compute_delta(
435    current: &[discover::DiscoveredFile],
436    cached: &ScanResult,
437    explicit_changed: Option<&[String]>,
438) -> ScanDelta {
439    let cached_files: std::collections::BTreeMap<&str, &FileRecord> = cached
440        .files
441        .iter()
442        .map(|f| (f.relative_path.as_str(), f))
443        .collect();
444    let current_paths: std::collections::HashSet<&str> =
445        current.iter().map(|e| e.relative_path.as_str()).collect();
446
447    let added: Vec<String> = current
448        .iter()
449        .filter(|e| !cached_files.contains_key(e.relative_path.as_str()))
450        .map(|e| e.relative_path.clone())
451        .collect();
452    let removed: Vec<String> = cached
453        .files
454        .iter()
455        .filter(|f| !current_paths.contains(f.relative_path.as_str()))
456        .map(|f| f.relative_path.clone())
457        .collect();
458
459    let modified: Vec<String> = if let Some(explicit) = explicit_changed {
460        explicit
461            .iter()
462            .filter(|p| cached_files.contains_key(p.as_str()) && current_paths.contains(p.as_str()))
463            .cloned()
464            .collect()
465    } else {
466        let mut out = Vec::new();
467        for entry in current {
468            if let Some(prev) = cached_files.get(entry.relative_path.as_str()) {
469                let meta = std::fs::metadata(&entry.absolute_path).ok();
470                let mtime = meta
471                    .as_ref()
472                    .and_then(|m| m.modified().ok())
473                    .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
474                    .map(|d| d.as_millis() as i64)
475                    .unwrap_or(0);
476                let size = meta.as_ref().map(|m| m.len()).unwrap_or(prev.size_bytes);
477                // A newer mtime is the cheap common signal, but mtime
478                // granularity collides on same-turn/same-second edits (and on
479                // coarse-granularity filesystems), silently dropping the edit.
480                // A changed byte size is an mtime-independent modification
481                // signal that catches the overwhelmingly common add/remove edit
482                // for free — `meta.len()` is already in hand. Without it, an
483                // agent that writes a file and re-scans in the same instant
484                // keeps reading the pre-edit symbol facts.
485                if mtime > prev.last_modified_unix_ms || size != prev.size_bytes {
486                    out.push(entry.relative_path.clone());
487                }
488            }
489        }
490        out
491    };
492
493    ScanDelta {
494        added,
495        modified,
496        removed,
497        full_rescan: false,
498    }
499}
500
501fn now_iso8601() -> String {
502    let now = SystemTime::now()
503        .duration_since(UNIX_EPOCH)
504        .unwrap_or_default();
505    let secs = now.as_secs() as i64;
506    let nanos = now.subsec_nanos();
507    let (year, month, day, hour, minute, second) = unix_to_civil(secs);
508    format!(
509        "{year:04}-{month:02}-{day:02}T{hour:02}:{minute:02}:{second:02}.{millis:03}Z",
510        millis = nanos / 1_000_000
511    )
512}
513
/// Split a Unix timestamp (seconds, UTC) into
/// `(year, month, day, hour, minute, second)`.
///
/// Implements Howard Hinnant's days-to-civil algorithm so that `chrono` is
/// not needed for a single formatter. Euclidean division keeps timestamps
/// before the epoch correct.
fn unix_to_civil(secs: i64) -> (i64, u32, u32, u32, u32, u32) {
    const SECS_PER_DAY: i64 = 86_400;
    const DAYS_PER_ERA: i64 = 146_097;

    let day_index = secs.div_euclid(SECS_PER_DAY);
    let second_of_day = secs.rem_euclid(SECS_PER_DAY);
    let hour = (second_of_day / 3600) as u32;
    let minute = ((second_of_day % 3600) / 60) as u32;
    let second = (second_of_day % 60) as u32;

    // Shift the day count so that it is measured from 0000-03-01; years then
    // start in March and the leap day falls at the end of the year.
    let shifted = day_index + 719_468;
    let era = shifted.div_euclid(DAYS_PER_ERA);
    let day_of_era = shifted.rem_euclid(DAYS_PER_ERA) as u64;
    let year_of_era =
        (day_of_era - day_of_era / 1460 + day_of_era / 36_524 - day_of_era / 146_096) / 365;
    let day_of_year = day_of_era - (365 * year_of_era + year_of_era / 4 - year_of_era / 100);
    let march_based_month = (5 * day_of_year + 2) / 153;
    let day = (day_of_year - (153 * march_based_month + 2) / 5 + 1) as u32;

    let month_number = if march_based_month < 10 {
        march_based_month + 3
    } else {
        march_based_month - 9
    };
    let month = month_number as u32;

    // January and February belong to the following civil year.
    let mut year = year_of_era as i64 + era * 400;
    if month <= 2 {
        year += 1;
    }
    (year, month, day, hour, minute, second)
}
537
538// MARK: - Builtin handlers (Harn dict ↔ Rust struct).
539
540fn scan_project_handler(args: &[VmValue]) -> Result<VmValue, HostlibError> {
541    let raw = dict_arg(SCAN_PROJECT_BUILTIN, args)?;
542    let dict = raw.as_ref();
543    let root = require_string(SCAN_PROJECT_BUILTIN, dict, "root")?;
544    let opts = parse_options(SCAN_PROJECT_BUILTIN, dict)?;
545    let result = scan_project(Path::new(&root), opts);
546    Ok(scan_result_to_value(&result, None))
547}
548
549fn scan_incremental_handler(args: &[VmValue]) -> Result<VmValue, HostlibError> {
550    let raw = dict_arg(SCAN_INCREMENTAL_BUILTIN, args)?;
551    let dict = raw.as_ref();
552    let token = require_string(SCAN_INCREMENTAL_BUILTIN, dict, "snapshot_token")?;
553    let opts = parse_options(SCAN_INCREMENTAL_BUILTIN, dict)?;
554    let changed = parse_changed_paths(SCAN_INCREMENTAL_BUILTIN, dict)?;
555    let scan = scan_incremental(&token, changed.as_deref(), opts);
556    Ok(scan_result_to_value(&scan.result, Some(&scan.delta)))
557}
558
559fn parse_options(
560    builtin: &'static str,
561    dict: &harn_vm::value::DictMap,
562) -> Result<ScanProjectOptions, HostlibError> {
563    let include_hidden = optional_bool(builtin, dict, "include_hidden", false)?;
564    let respect_gitignore = optional_bool(builtin, dict, "respect_gitignore", true)?;
565    let max_files = optional_int(builtin, dict, "max_files", 0)?;
566    let include_git_history_default = builtin == SCAN_PROJECT_BUILTIN;
567    let include_git_history = optional_bool(
568        builtin,
569        dict,
570        "include_git_history",
571        include_git_history_default,
572    )?;
573    let repo_map_token_budget = optional_int(builtin, dict, "repo_map_token_budget", 1200)?;
574    if max_files < 0 {
575        return Err(HostlibError::InvalidParameter {
576            builtin,
577            param: "max_files",
578            message: "must be >= 0".to_string(),
579        });
580    }
581    if repo_map_token_budget < 0 {
582        return Err(HostlibError::InvalidParameter {
583            builtin,
584            param: "repo_map_token_budget",
585            message: "must be >= 0".to_string(),
586        });
587    }
588    Ok(ScanProjectOptions {
589        include_hidden,
590        respect_gitignore,
591        max_files: max_files as usize,
592        include_git_history,
593        repo_map_token_budget: repo_map_token_budget as usize,
594    })
595}
596
597fn parse_changed_paths(
598    builtin: &'static str,
599    dict: &harn_vm::value::DictMap,
600) -> Result<Option<Vec<String>>, HostlibError> {
601    let value = match dict.get("changed_paths") {
602        None | Some(VmValue::Nil) => return Ok(None),
603        Some(v) => v,
604    };
605    let list = match value {
606        VmValue::List(items) => items,
607        other => {
608            return Err(HostlibError::InvalidParameter {
609                builtin,
610                param: "changed_paths",
611                message: format!("expected list of strings, got {}", other.type_name()),
612            });
613        }
614    };
615    let mut out = Vec::with_capacity(list.len());
616    for item in list.iter() {
617        match item {
618            VmValue::String(s) => out.push(s.to_string()),
619            other => {
620                return Err(HostlibError::InvalidParameter {
621                    builtin,
622                    param: "changed_paths",
623                    message: format!("non-string entry: {}", other.type_name()),
624                });
625            }
626        }
627    }
628    Ok(Some(out))
629}
630
631fn scan_result_to_value(result: &ScanResult, delta: Option<&ScanDelta>) -> VmValue {
632    let mut entries: Vec<(&'static str, VmValue)> = vec![
633        ("snapshot_token", str_value(&result.snapshot_token)),
634        ("truncated", VmValue::Bool(result.truncated)),
635        ("project", project_to_value(&result.project)),
636        ("folders", list_of(&result.folders, folder_to_value)),
637        ("files", list_of(&result.files, file_to_value)),
638        ("symbols", list_of(&result.symbols, symbol_to_value)),
639        (
640            "dependencies",
641            list_of(&result.dependencies, dependency_to_value),
642        ),
643        (
644            "sub_projects",
645            list_of(&result.sub_projects, subproject_to_value),
646        ),
647        ("repo_map", str_value(&result.repo_map)),
648    ];
649    if let Some(d) = delta {
650        entries.push(("delta", delta_to_value(d)));
651    }
652    build_dict(entries)
653}
654
655fn list_of<T>(items: &[T], to_value: fn(&T) -> VmValue) -> VmValue {
656    let list: Vec<VmValue> = items.iter().map(to_value).collect();
657    VmValue::List(Arc::new(list))
658}
659
660fn project_to_value(project: &ProjectMetadata) -> VmValue {
661    let test_commands_entries: Vec<(String, VmValue)> = project
662        .test_commands
663        .iter()
664        .map(|(k, v)| (k.clone(), str_value(v)))
665        .collect();
666    let test_commands_dict = build_dict(test_commands_entries);
667
668    let detected: VmValue = project
669        .detected_test_command
670        .as_deref()
671        .map(str_value)
672        .unwrap_or(VmValue::Nil);
673
674    let code_patterns: Vec<VmValue> = project.code_patterns.iter().map(str_value).collect();
675    let available_dependencies: Vec<VmValue> = project
676        .available_dependencies
677        .iter()
678        .map(str_value)
679        .collect();
680
681    build_dict([
682        ("name", str_value(&project.name)),
683        ("root_path", str_value(&project.root_path)),
684        ("languages", list_of(&project.languages, language_to_value)),
685        ("test_commands", test_commands_dict),
686        ("detected_test_command", detected),
687        ("code_patterns", VmValue::List(Arc::new(code_patterns))),
688        ("total_files", VmValue::Int(project.total_files as i64)),
689        ("total_lines", VmValue::Int(project.total_lines as i64)),
690        ("last_scanned_at", str_value(&project.last_scanned_at)),
691        (
692            "available_dependencies",
693            VmValue::List(Arc::new(available_dependencies)),
694        ),
695    ])
696}
697
698fn language_to_value(stat: &LanguageStat) -> VmValue {
699    build_dict([
700        ("name", str_value(&stat.name)),
701        ("file_count", VmValue::Int(stat.file_count as i64)),
702        ("line_count", VmValue::Int(stat.line_count as i64)),
703        ("percentage", VmValue::Float(stat.percentage)),
704    ])
705}
706
707fn folder_to_value(folder: &FolderRecord) -> VmValue {
708    let names: Vec<VmValue> = folder.key_symbol_names.iter().map(str_value).collect();
709    build_dict([
710        ("id", str_value(&folder.id)),
711        ("relative_path", str_value(&folder.relative_path)),
712        ("file_count", VmValue::Int(folder.file_count as i64)),
713        ("line_count", VmValue::Int(folder.line_count as i64)),
714        ("dominant_language", str_value(&folder.dominant_language)),
715        ("key_symbol_names", VmValue::List(Arc::new(names))),
716    ])
717}
718
719fn file_to_value(file: &FileRecord) -> VmValue {
720    let imports: Vec<VmValue> = file.imports.iter().map(str_value).collect();
721    let test_pair = file
722        .corresponding_test_file
723        .as_deref()
724        .map(str_value)
725        .unwrap_or(VmValue::Nil);
726    build_dict([
727        ("id", str_value(&file.id)),
728        ("relative_path", str_value(&file.relative_path)),
729        ("file_name", str_value(&file.file_name)),
730        ("language", str_value(&file.language)),
731        ("line_count", VmValue::Int(file.line_count as i64)),
732        ("size_bytes", VmValue::Int(file.size_bytes as i64)),
733        (
734            "last_modified_unix_ms",
735            VmValue::Int(file.last_modified_unix_ms),
736        ),
737        ("imports", VmValue::List(Arc::new(imports))),
738        ("churn_score", VmValue::Float(file.churn_score)),
739        ("corresponding_test_file", test_pair),
740    ])
741}
742
743fn symbol_to_value(symbol: &SymbolRecord) -> VmValue {
744    let container = symbol
745        .container
746        .as_deref()
747        .map(str_value)
748        .unwrap_or(VmValue::Nil);
749    build_dict([
750        ("id", str_value(&symbol.id)),
751        ("name", str_value(&symbol.name)),
752        ("kind", str_value(symbol.kind.keyword())),
753        ("file_path", str_value(&symbol.file_path)),
754        ("line", VmValue::Int(symbol.line as i64)),
755        ("signature", str_value(&symbol.signature)),
756        ("container", container),
757        (
758            "reference_count",
759            VmValue::Int(symbol.reference_count as i64),
760        ),
761        ("importance_score", VmValue::Float(symbol.importance_score)),
762    ])
763}
764
765fn dependency_to_value(dep: &DependencyEdge) -> VmValue {
766    build_dict([
767        ("from_file", str_value(&dep.from_file)),
768        ("to_module", str_value(&dep.to_module)),
769    ])
770}
771
772fn subproject_to_value(sp: &SubProject) -> VmValue {
773    let dependencies: Vec<VmValue> = sp.dependencies.iter().map(str_value).collect();
774    build_dict([
775        ("path", str_value(&sp.path)),
776        ("name", str_value(&sp.name)),
777        ("language", str_value(&sp.language)),
778        ("project_marker", str_value(&sp.project_marker)),
779        ("dependencies", VmValue::List(Arc::new(dependencies))),
780    ])
781}
782
783fn delta_to_value(delta: &ScanDelta) -> VmValue {
784    let added: Vec<VmValue> = delta.added.iter().map(str_value).collect();
785    let modified: Vec<VmValue> = delta.modified.iter().map(str_value).collect();
786    let removed: Vec<VmValue> = delta.removed.iter().map(str_value).collect();
787    build_dict([
788        ("added", VmValue::List(Arc::new(added))),
789        ("modified", VmValue::List(Arc::new(modified))),
790        ("removed", VmValue::List(Arc::new(removed))),
791        ("full_rescan", VmValue::Bool(delta.full_rescan)),
792    ])
793}
794
#[cfg(test)]
mod tests {
    use super::*;
    use filetime::{set_file_mtime, FileTime};
    use std::fs;

    /// Root-relative path of the single source file every fixture project
    /// contains.
    const FIXTURE_REL_PATH: &str = "src/lib.rs";

    /// With an empty options dict, `scan_project` enables
    /// `include_git_history` while `scan_incremental` leaves it off.
    #[test]
    fn builtin_option_defaults_match_request_schemas() {
        let dict = harn_vm::value::DictMap::new();

        let scan_project = parse_options(SCAN_PROJECT_BUILTIN, &dict).unwrap();
        let scan_incremental = parse_options(SCAN_INCREMENTAL_BUILTIN, &dict).unwrap();

        assert!(scan_project.include_git_history);
        assert!(!scan_incremental.include_git_history);
    }

    /// Collects the name of every symbol in the scan result, in result order.
    fn symbol_names(scan: &IncrementalScan) -> Vec<String> {
        scan.result.symbols.iter().map(|s| s.name.clone()).collect()
    }

    /// Creates a temporary project whose only file is `src/lib.rs` holding
    /// `contents`.
    ///
    /// Returns the temp-dir guard (the directory is deleted when it drops, so
    /// callers must keep it bound for the whole test), the path of the fixture
    /// file, and the snapshot token for the project root.
    fn seed_project(contents: &str) -> (tempfile::TempDir, std::path::PathBuf, String) {
        let dir = tempfile::tempdir().unwrap();
        fs::create_dir_all(dir.path().join("src")).unwrap();
        let file = dir.path().join(FIXTURE_REL_PATH);
        fs::write(&file, contents).unwrap();

        // Canonicalize so the snapshot token matches across calls.
        let canonical = fs::canonicalize(dir.path()).unwrap();
        let token = canonical.to_string_lossy().to_string();
        (dir, file, token)
    }

    /// Forces `file`'s mtime back to the millisecond timestamp that `scan`
    /// recorded for the fixture file, simulating an edit the OS could not
    /// distinguish from the indexed version by mtime.
    ///
    /// Panics if `scan` did not index the fixture file.
    fn pin_mtime_to_indexed(scan: &IncrementalScan, file: &std::path::Path) {
        let cached_ms = scan
            .result
            .files
            .iter()
            .find(|r| r.relative_path == FIXTURE_REL_PATH)
            .expect("seed file indexed")
            .last_modified_unix_ms;

        // Euclidean div/rem keep the sub-second part in 0..1000 even for a
        // pre-epoch timestamp, so the `as u32` below is lossless and the
        // nanosecond product (at most 999_000_000) fits in a u32.
        let secs = cached_ms.div_euclid(1000);
        let nanos = cached_ms.rem_euclid(1000) as u32 * 1_000_000;
        set_file_mtime(file, FileTime::from_unix_time(secs, nanos)).unwrap();
    }

    /// Regression guard: an agent that writes a file and re-scans in the same
    /// instant must see its own edit. `compute_delta`'s mtime comparison
    /// collides on same-millisecond/same-second writes (and on
    /// coarse-granularity filesystems), so the size-change fallback is what
    /// keeps same-turn index freshness honest. Before the fallback this
    /// returned the pre-edit symbol set, feeding fuzzy-match-stale loops on
    /// cheap local models.
    #[test]
    fn scan_incremental_detects_same_mtime_size_changing_edit() {
        let (_dir, file, token) = seed_project("pub fn old_symbol() {}\n");
        let opts = ScanProjectOptions::default();

        let first = scan_incremental(&token, None, opts.clone());
        assert!(symbol_names(&first).iter().any(|n| n == "old_symbol"));

        // Add a symbol (byte size grows), then force the mtime back to the
        // cached value to simulate a same-instant edit the OS couldn't
        // distinguish by mtime.
        fs::write(
            &file,
            "pub fn old_symbol() {}\npub fn brand_new_symbol() {}\n",
        )
        .unwrap();
        pin_mtime_to_indexed(&first, &file);

        let second = scan_incremental(&token, None, opts);
        let names = symbol_names(&second);
        assert!(
            names.iter().any(|n| n == "brand_new_symbol"),
            "same-mtime size-changing edit must be reindexed, got {names:?} (delta.modified={:?})",
            second.delta.modified,
        );
    }

    /// Companion guard: even when an edit changes nothing the scanner can
    /// cheaply detect (same mtime AND same byte size — e.g. a length-preserving
    /// one-character swap), passing the explicit `changed_paths` signal still
    /// forces the reindex. The agent loop threads its own write through this
    /// bypass, so freshness never depends on mtime/size heuristics for the
    /// agent's own edits.
    #[test]
    fn scan_incremental_changed_paths_bypasses_metadata_heuristics() {
        // 23 bytes.
        let (_dir, file, token) = seed_project("pub fn alpha_name() {}\n");
        let opts = ScanProjectOptions::default();

        let first = scan_incremental(&token, None, opts.clone());
        assert!(symbol_names(&first).iter().any(|n| n == "alpha_name"));

        // Length-preserving rename (same 23 bytes), same forced mtime: neither
        // the size nor the mtime heuristic can see this.
        fs::write(&file, "pub fn omega_name() {}\n").unwrap();
        pin_mtime_to_indexed(&first, &file);

        // Without an explicit signal the heuristics legitimately miss this
        // rare length-preserving same-instant case...
        let heuristic_only = scan_incremental(&token, None, opts.clone());
        assert!(
            !heuristic_only
                .delta
                .modified
                .iter()
                .any(|p| p == FIXTURE_REL_PATH),
            "documenting the heuristic's known blind spot",
        );

        // ...but the explicit changed-path signal the agent loop passes after
        // its own write forces the reindex regardless.
        let explicit = scan_incremental(&token, Some(&[FIXTURE_REL_PATH.to_string()]), opts);
        assert!(
            symbol_names(&explicit).iter().any(|n| n == "omega_name"),
            "explicit changed_paths must always reindex, got {:?}",
            symbol_names(&explicit),
        );
    }
}