Skip to main content

harn_hostlib/scanner/
mod.rs

1//! Repo scanner host capability.
2//!
3//! Deterministic project-wide file enumeration honoring `.gitignore` and
4//! the [`extensions::EXCLUDED_DIRS`] table, symbol extraction,
5//! import-derived dependency graph, reference + churn + importance
6//! scoring, source/test pairing, folder aggregates, project metadata
7//! (language stats + detected test commands + code-pattern hints),
8//! sub-project detection, and a token-budgeted text repo map.
9//!
10//! `scan_project` returns the full [`result::ScanResult`] alongside an
11//! opaque `snapshot_token` derived from the canonicalized root path. The
12//! result is persisted to `<root>/.harn/hostlib/scanner-snapshot.json` so
13//! that `scan_incremental` can diff against it later — without forcing the
14//! caller to pass the previous result back over the wire.
15
16use std::path::{Path, PathBuf};
17use std::process::Command;
18use std::sync::Arc;
19use std::time::{SystemTime, UNIX_EPOCH};
20
21use harn_vm::VmValue;
22
23use crate::error::HostlibError;
24use crate::registry::{BuiltinRegistry, HostlibCapability, RegisteredBuiltin, SyncHandler};
25use crate::tools::args::{
26    build_dict, dict_arg, optional_bool, optional_int, require_string, str_value,
27};
28
29mod commands;
30mod discover;
31mod extensions;
32mod folders;
33mod git;
34mod imports;
35mod result;
36mod scoring;
37mod snapshot;
38mod subproject;
39mod symbols;
40mod test_mapping;
41
/// Remove every inherited `GIT_*` environment variable from `cmd`.
///
/// Git exports repository-specific `GIT_*` variables while running hooks.
/// Scanner probes must honor their explicit `-C <root>` argument instead,
/// so those ambient variables are stripped from the child's environment.
fn strip_ambient_git_env(cmd: &mut Command) {
    // `vars_os` rather than `vars`: the latter panics as soon as *any*
    // variable in the process environment (key or value, `GIT_*` or not)
    // is not valid Unicode.
    for (key, _) in std::env::vars_os() {
        // A lossy conversion leaves the ASCII prefix untouched, so keys with
        // a non-Unicode tail are still matched.
        if key.to_string_lossy().starts_with("GIT_") {
            cmd.env_remove(&key);
        }
    }
}
51
52pub use git::GitCapabilities;
53pub use result::{
54    DependencyEdge, FileRecord, FolderRecord, LanguageStat, ProjectMetadata, ScanDelta, ScanResult,
55    SubProject, SymbolKind, SymbolRecord,
56};
57
/// Registered name of the full-scan builtin; also passed to the argument
/// helpers so their errors name the builtin that was called.
const SCAN_PROJECT_BUILTIN: &str = "hostlib_scanner_scan_project";
/// Registered name of the incremental-scan builtin; also passed to the
/// argument helpers so their errors name the builtin that was called.
const SCAN_INCREMENTAL_BUILTIN: &str = "hostlib_scanner_scan_incremental";
60
61/// Scanner capability handle.
62#[derive(Default)]
63pub struct ScannerCapability;
64
65impl HostlibCapability for ScannerCapability {
66    fn module_name(&self) -> &'static str {
67        "scanner"
68    }
69
70    fn register_builtins(&self, registry: &mut BuiltinRegistry) {
71        let scan_project: SyncHandler = Arc::new(scan_project_handler);
72        registry.register(RegisteredBuiltin {
73            name: SCAN_PROJECT_BUILTIN,
74            module: "scanner",
75            method: "scan_project",
76            handler: scan_project,
77        });
78        let scan_incremental: SyncHandler = Arc::new(scan_incremental_handler);
79        registry.register(RegisteredBuiltin {
80            name: SCAN_INCREMENTAL_BUILTIN,
81            module: "scanner",
82            method: "scan_incremental",
83            handler: scan_incremental,
84        });
85    }
86}
87
88// MARK: - Public Rust API (used by tests + by harn-cli embedders).
89
/// Options controlling a scan; accepted by [`scan_project`] and the
/// incremental entry points.
#[derive(Debug, Clone)]
pub struct ScanProjectOptions {
    /// Whether hidden (`.`-prefixed) entries are walked.
    pub include_hidden: bool,
    /// Whether `.gitignore` rules are honored during discovery.
    pub respect_gitignore: bool,
    /// Upper bound on the number of discovered files; `0` disables the cap.
    pub max_files: usize,
    /// Whether Git churn scores are computed and applied to the files.
    pub include_git_history: bool,
    /// Approximate token budget handed to the text repo-map builder.
    pub repo_map_token_budget: usize,
}

impl Default for ScanProjectOptions {
    /// Hidden files skipped, `.gitignore` honored, no file cap, Git history
    /// enabled, and a 1200-token repo map.
    fn default() -> Self {
        ScanProjectOptions {
            respect_gitignore: true,
            include_git_history: true,
            repo_map_token_budget: 1200,
            include_hidden: false,
            max_files: 0,
        }
    }
}
116
117/// Run a full scan of `root`, persist a snapshot, and return the result.
118pub fn scan_project(root: &Path, opts: ScanProjectOptions) -> ScanResult {
119    scan_project_with_git(root, opts, &git::CliGitCapabilities)
120}
121
122/// Run a full scan using caller-supplied Git data.
123///
124/// Embedders normally call [`scan_project`]. Tests and hosts that already
125/// virtualize Git can use this entry point to keep scanner behavior
126/// deterministic without depending on ambient process state.
127pub fn scan_project_with_git(
128    root: &Path,
129    opts: ScanProjectOptions,
130    git: &dyn GitCapabilities,
131) -> ScanResult {
132    let canonical = canonicalize(root);
133    let discover_opts = discover::DiscoverOptions {
134        include_hidden: opts.include_hidden,
135        respect_gitignore: opts.respect_gitignore,
136    };
137    let mut discovered = discover::discover_files(&canonical, discover_opts, git);
138    let truncated = if opts.max_files > 0 && discovered.len() > opts.max_files {
139        discovered.truncate(opts.max_files);
140        true
141    } else {
142        false
143    };
144
145    let (mut files, mut symbols, mut dependencies) = extract_per_file(&discovered);
146
147    scoring::compute_reference_counts(&mut symbols, &files);
148
149    if opts.include_git_history {
150        let churn = git.churn_scores(&canonical);
151        scoring::apply_churn(&mut files, &churn);
152    }
153    scoring::compute_importance_scores(&mut symbols, &files);
154
155    test_mapping::map_test_files(&mut files);
156
157    let folder_records = folders::build_folder_records(&files, &symbols);
158    let test_commands = commands::detect_test_commands(&canonical);
159    let code_patterns = commands::detect_code_patterns(&files, &canonical);
160    let project = folders::build_project_metadata(
161        &canonical,
162        &files,
163        test_commands,
164        code_patterns,
165        now_iso8601(),
166    );
167    let repo_map = folders::build_repo_map(&symbols, &files, opts.repo_map_token_budget);
168    let sub_projects = subproject::detect_subprojects(&canonical, 2);
169
170    sort_for_output(&mut files, &mut symbols, &mut dependencies);
171
172    let token = snapshot::root_to_token(&canonical);
173    let result = ScanResult {
174        snapshot_token: token,
175        truncated,
176        project,
177        folders: folder_records,
178        files,
179        symbols,
180        dependencies,
181        sub_projects,
182        repo_map,
183    };
184    snapshot::save(&canonical, &result);
185    result
186}
187
/// Result returned by [`scan_incremental`] and
/// [`scan_incremental_with_git`].
#[derive(Clone, Debug)]
pub struct IncrementalScan {
    /// Refreshed scan result (the cached result itself when nothing changed).
    pub result: ScanResult,
    /// Added / modified / removed paths computed against the snapshot, plus
    /// a flag telling whether a full rescan was performed instead.
    pub delta: ScanDelta,
}
196
/// Refresh the snapshot named by `token`, using the default
/// [`git::CliGitCapabilities`] backend.
///
/// Falls back to a full rescan when no snapshot can be loaded for the
/// token's root, or when the added + modified files exceed 30% of the
/// currently discovered files. When `explicit_changed` is provided it
/// replaces the mtime/size heuristics for deciding which already-known
/// files count as modified.
pub fn scan_incremental(
    token: &str,
    explicit_changed: Option<&[String]>,
    opts: ScanProjectOptions,
) -> IncrementalScan {
    scan_incremental_with_git(token, explicit_changed, opts, &git::CliGitCapabilities)
}
207
208/// Refresh a snapshot using caller-supplied Git data.
209pub fn scan_incremental_with_git(
210    token: &str,
211    explicit_changed: Option<&[String]>,
212    opts: ScanProjectOptions,
213    git: &dyn GitCapabilities,
214) -> IncrementalScan {
215    let root = snapshot::token_to_root(token);
216    let canonical = canonicalize(&root);
217
218    let cached = snapshot::load(&canonical);
219    let cached = match cached {
220        Some(c) => c,
221        None => {
222            let result = scan_project_with_git(&canonical, opts, git);
223            return IncrementalScan {
224                result,
225                delta: ScanDelta {
226                    full_rescan: true,
227                    ..ScanDelta::default()
228                },
229            };
230        }
231    };
232
233    let discover_opts = discover::DiscoverOptions {
234        include_hidden: opts.include_hidden,
235        respect_gitignore: opts.respect_gitignore,
236    };
237    let mut current = discover::discover_files(&canonical, discover_opts, git);
238    if opts.max_files > 0 && current.len() > opts.max_files {
239        current.truncate(opts.max_files);
240    }
241
242    let delta = compute_delta(&current, &cached, explicit_changed);
243    let total = current.len();
244    let needs_full_rescan =
245        total > 0 && (delta.added.len() + delta.modified.len()) * 10 > total * 3;
246
247    if needs_full_rescan {
248        let result = scan_project_with_git(&canonical, opts, git);
249        return IncrementalScan {
250            result,
251            delta: ScanDelta {
252                full_rescan: true,
253                ..delta
254            },
255        };
256    }
257
258    if delta.added.is_empty() && delta.modified.is_empty() && delta.removed.is_empty() {
259        return IncrementalScan {
260            result: cached,
261            delta,
262        };
263    }
264
265    // Incremental path: rebuild only the touched files, then re-finalize.
266    let mut files = cached.files;
267    let mut symbols = cached.symbols;
268    let mut dependencies = cached.dependencies;
269
270    let removed_set: std::collections::HashSet<&str> =
271        delta.removed.iter().map(|s| s.as_str()).collect();
272    let touched_set: std::collections::HashSet<&str> = delta
273        .added
274        .iter()
275        .chain(delta.modified.iter())
276        .map(|s| s.as_str())
277        .collect();
278
279    files.retain(|f| !removed_set.contains(f.relative_path.as_str()));
280    symbols.retain(|s| {
281        !removed_set.contains(s.file_path.as_str()) && !touched_set.contains(s.file_path.as_str())
282    });
283    dependencies.retain(|d| {
284        !removed_set.contains(d.from_file.as_str()) && !touched_set.contains(d.from_file.as_str())
285    });
286
287    let touched_entries: Vec<discover::DiscoveredFile> = current
288        .iter()
289        .filter(|e| touched_set.contains(e.relative_path.as_str()))
290        .cloned()
291        .collect();
292    let (new_files, new_symbols, new_deps) = extract_per_file(&touched_entries);
293
294    let mut by_path: std::collections::BTreeMap<String, FileRecord> = files
295        .into_iter()
296        .map(|f| (f.relative_path.clone(), f))
297        .collect();
298    for new_file in new_files {
299        by_path.insert(new_file.relative_path.clone(), new_file);
300    }
301    let mut files: Vec<FileRecord> = by_path.into_values().collect();
302    symbols.extend(new_symbols);
303    dependencies.extend(new_deps);
304
305    scoring::compute_reference_counts(&mut symbols, &files);
306    if opts.include_git_history {
307        let churn = git.churn_scores(&canonical);
308        scoring::apply_churn(&mut files, &churn);
309    }
310    scoring::compute_importance_scores(&mut symbols, &files);
311    test_mapping::map_test_files(&mut files);
312
313    let folder_records = folders::build_folder_records(&files, &symbols);
314    let test_commands = commands::detect_test_commands(&canonical);
315    let code_patterns = commands::detect_code_patterns(&files, &canonical);
316    let project = folders::build_project_metadata(
317        &canonical,
318        &files,
319        test_commands,
320        code_patterns,
321        now_iso8601(),
322    );
323    let repo_map = folders::build_repo_map(&symbols, &files, opts.repo_map_token_budget);
324    let sub_projects = subproject::detect_subprojects(&canonical, 2);
325
326    sort_for_output(&mut files, &mut symbols, &mut dependencies);
327
328    let token = snapshot::root_to_token(&canonical);
329    let result = ScanResult {
330        snapshot_token: token,
331        truncated: cached.truncated,
332        project,
333        folders: folder_records,
334        files,
335        symbols,
336        dependencies,
337        sub_projects,
338        repo_map,
339    };
340    snapshot::save(&canonical, &result);
341    IncrementalScan { result, delta }
342}
343
344// MARK: - Internals
345
/// Resolve `root` to its canonical form, falling back to `root` unchanged
/// when canonicalization fails (e.g. the path does not exist).
fn canonicalize(root: &Path) -> PathBuf {
    match std::fs::canonicalize(root) {
        Ok(resolved) => resolved,
        Err(_) => root.to_path_buf(),
    }
}
349
350fn extract_per_file(
351    discovered: &[discover::DiscoveredFile],
352) -> (Vec<FileRecord>, Vec<SymbolRecord>, Vec<DependencyEdge>) {
353    let mut files: Vec<FileRecord> = Vec::with_capacity(discovered.len());
354    let mut symbols: Vec<SymbolRecord> = Vec::new();
355    let mut dependencies: Vec<DependencyEdge> = Vec::new();
356
357    for entry in discovered {
358        let metadata = std::fs::metadata(&entry.absolute_path);
359        let size = metadata.as_ref().map(|m| m.len()).unwrap_or(0);
360        let modified = metadata
361            .as_ref()
362            .ok()
363            .and_then(|m| m.modified().ok())
364            .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
365            .map(|d| d.as_millis() as i64)
366            .unwrap_or(0);
367
368        let content = std::fs::read_to_string(&entry.absolute_path).unwrap_or_default();
369        if content.is_empty() && size != 0 {
370            // Likely a non-utf8 binary; skip symbol/import extraction but still record the file.
371        }
372        let language = extensions::file_extension(&entry.relative_path);
373        let imports = imports::extract_imports(&content, &language);
374        let file_symbols = symbols::extract_symbols(&content, &language, &entry.relative_path);
375        let line_count = crate::text::count_lines(content.as_bytes()) as usize;
376
377        for imp in &imports {
378            dependencies.push(DependencyEdge {
379                from_file: entry.relative_path.clone(),
380                to_module: imp.clone(),
381            });
382        }
383        symbols.extend(file_symbols);
384
385        files.push(FileRecord {
386            id: entry.relative_path.clone(),
387            relative_path: entry.relative_path.clone(),
388            file_name: extensions::file_name(&entry.relative_path).to_string(),
389            language,
390            line_count,
391            size_bytes: size,
392            last_modified_unix_ms: modified,
393            imports,
394            churn_score: 0.0,
395            corresponding_test_file: None,
396        });
397    }
398
399    (files, symbols, dependencies)
400}
401
402fn sort_for_output(
403    files: &mut [FileRecord],
404    symbols: &mut [SymbolRecord],
405    dependencies: &mut [DependencyEdge],
406) {
407    files.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
408    symbols.sort_by(|a, b| a.id.cmp(&b.id));
409    dependencies.sort_by(|a, b| {
410        a.from_file
411            .cmp(&b.from_file)
412            .then_with(|| a.to_module.cmp(&b.to_module))
413    });
414}
415
416fn compute_delta(
417    current: &[discover::DiscoveredFile],
418    cached: &ScanResult,
419    explicit_changed: Option<&[String]>,
420) -> ScanDelta {
421    let cached_files: std::collections::BTreeMap<&str, &FileRecord> = cached
422        .files
423        .iter()
424        .map(|f| (f.relative_path.as_str(), f))
425        .collect();
426    let current_paths: std::collections::HashSet<&str> =
427        current.iter().map(|e| e.relative_path.as_str()).collect();
428
429    let added: Vec<String> = current
430        .iter()
431        .filter(|e| !cached_files.contains_key(e.relative_path.as_str()))
432        .map(|e| e.relative_path.clone())
433        .collect();
434    let removed: Vec<String> = cached
435        .files
436        .iter()
437        .filter(|f| !current_paths.contains(f.relative_path.as_str()))
438        .map(|f| f.relative_path.clone())
439        .collect();
440
441    let modified: Vec<String> = if let Some(explicit) = explicit_changed {
442        explicit
443            .iter()
444            .filter(|p| cached_files.contains_key(p.as_str()) && current_paths.contains(p.as_str()))
445            .cloned()
446            .collect()
447    } else {
448        let mut out = Vec::new();
449        for entry in current {
450            if let Some(prev) = cached_files.get(entry.relative_path.as_str()) {
451                let meta = std::fs::metadata(&entry.absolute_path).ok();
452                let mtime = meta
453                    .as_ref()
454                    .and_then(|m| m.modified().ok())
455                    .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
456                    .map(|d| d.as_millis() as i64)
457                    .unwrap_or(0);
458                let size = meta.as_ref().map(|m| m.len()).unwrap_or(prev.size_bytes);
459                // A newer mtime is the cheap common signal, but mtime
460                // granularity collides on same-turn/same-second edits (and on
461                // coarse-granularity filesystems), silently dropping the edit.
462                // A changed byte size is an mtime-independent modification
463                // signal that catches the overwhelmingly common add/remove edit
464                // for free — `meta.len()` is already in hand. Without it, an
465                // agent that writes a file and re-scans in the same instant
466                // keeps reading the pre-edit symbol facts.
467                if mtime > prev.last_modified_unix_ms || size != prev.size_bytes {
468                    out.push(entry.relative_path.clone());
469                }
470            }
471        }
472        out
473    };
474
475    ScanDelta {
476        added,
477        modified,
478        removed,
479        full_rescan: false,
480    }
481}
482
483fn now_iso8601() -> String {
484    let now = SystemTime::now()
485        .duration_since(UNIX_EPOCH)
486        .unwrap_or_default();
487    let secs = now.as_secs() as i64;
488    let nanos = now.subsec_nanos();
489    let (year, month, day, hour, minute, second) = unix_to_civil(secs);
490    format!(
491        "{year:04}-{month:02}-{day:02}T{hour:02}:{minute:02}:{second:02}.{millis:03}Z",
492        millis = nanos / 1_000_000
493    )
494}
495
/// Split a Unix timestamp (seconds, UTC) into
/// `(year, month, day, hour, minute, second)`.
///
/// Implements Howard Hinnant's days-to-civil algorithm so that a single
/// timestamp formatter does not require pulling in `chrono`.
fn unix_to_civil(secs: i64) -> (i64, u32, u32, u32, u32, u32) {
    const SECS_PER_DAY: i64 = 86_400;
    const DAYS_PER_ERA: i64 = 146_097; // one 400-year Gregorian cycle
    const EPOCH_SHIFT_DAYS: i64 = 719_468; // 0000-03-01 to 1970-01-01

    // Euclidean division keeps the time of day non-negative for pre-1970
    // timestamps.
    let day_index = secs.div_euclid(SECS_PER_DAY);
    let second_of_day = secs.rem_euclid(SECS_PER_DAY);
    let hour = (second_of_day / 3600) as u32;
    let minute = (second_of_day / 60 % 60) as u32;
    let second = (second_of_day % 60) as u32;

    // Work in years that start on March 1st, which puts the leap day last.
    let shifted = day_index + EPOCH_SHIFT_DAYS;
    let era = shifted.div_euclid(DAYS_PER_ERA);
    let day_of_era = shifted.rem_euclid(DAYS_PER_ERA) as u64;
    let year_of_era =
        (day_of_era - day_of_era / 1460 + day_of_era / 36_524 - day_of_era / 146_096) / 365;
    let day_of_year = day_of_era - (365 * year_of_era + year_of_era / 4 - year_of_era / 100);
    let shifted_month = (5 * day_of_year + 2) / 153;
    let day = (day_of_year - (153 * shifted_month + 2) / 5 + 1) as u32;

    // Map the March-based month back to the calendar month.
    let calendar_month = if shifted_month < 10 {
        shifted_month + 3
    } else {
        shifted_month - 9
    };
    let month = calendar_month as u32;

    // January and February belong to the following calendar year.
    let year = era * 400 + year_of_era as i64 + i64::from(month <= 2);
    (year, month, day, hour, minute, second)
}
519
520// MARK: - Builtin handlers (Harn dict ↔ Rust struct).
521
522fn scan_project_handler(args: &[VmValue]) -> Result<VmValue, HostlibError> {
523    let raw = dict_arg(SCAN_PROJECT_BUILTIN, args)?;
524    let dict = raw.as_ref();
525    let root = require_string(SCAN_PROJECT_BUILTIN, dict, "root")?;
526    let opts = parse_options(SCAN_PROJECT_BUILTIN, dict)?;
527    let result = scan_project(Path::new(&root), opts);
528    Ok(scan_result_to_value(&result, None))
529}
530
531fn scan_incremental_handler(args: &[VmValue]) -> Result<VmValue, HostlibError> {
532    let raw = dict_arg(SCAN_INCREMENTAL_BUILTIN, args)?;
533    let dict = raw.as_ref();
534    let token = require_string(SCAN_INCREMENTAL_BUILTIN, dict, "snapshot_token")?;
535    let opts = parse_options(SCAN_INCREMENTAL_BUILTIN, dict)?;
536    let changed = parse_changed_paths(SCAN_INCREMENTAL_BUILTIN, dict)?;
537    let scan = scan_incremental(&token, changed.as_deref(), opts);
538    Ok(scan_result_to_value(&scan.result, Some(&scan.delta)))
539}
540
541fn parse_options(
542    builtin: &'static str,
543    dict: &std::collections::BTreeMap<String, VmValue>,
544) -> Result<ScanProjectOptions, HostlibError> {
545    let include_hidden = optional_bool(builtin, dict, "include_hidden", false)?;
546    let respect_gitignore = optional_bool(builtin, dict, "respect_gitignore", true)?;
547    let max_files = optional_int(builtin, dict, "max_files", 0)?;
548    let include_git_history_default = builtin == SCAN_PROJECT_BUILTIN;
549    let include_git_history = optional_bool(
550        builtin,
551        dict,
552        "include_git_history",
553        include_git_history_default,
554    )?;
555    let repo_map_token_budget = optional_int(builtin, dict, "repo_map_token_budget", 1200)?;
556    if max_files < 0 {
557        return Err(HostlibError::InvalidParameter {
558            builtin,
559            param: "max_files",
560            message: "must be >= 0".to_string(),
561        });
562    }
563    if repo_map_token_budget < 0 {
564        return Err(HostlibError::InvalidParameter {
565            builtin,
566            param: "repo_map_token_budget",
567            message: "must be >= 0".to_string(),
568        });
569    }
570    Ok(ScanProjectOptions {
571        include_hidden,
572        respect_gitignore,
573        max_files: max_files as usize,
574        include_git_history,
575        repo_map_token_budget: repo_map_token_budget as usize,
576    })
577}
578
579fn parse_changed_paths(
580    builtin: &'static str,
581    dict: &std::collections::BTreeMap<String, VmValue>,
582) -> Result<Option<Vec<String>>, HostlibError> {
583    let value = match dict.get("changed_paths") {
584        None | Some(VmValue::Nil) => return Ok(None),
585        Some(v) => v,
586    };
587    let list = match value {
588        VmValue::List(items) => items,
589        other => {
590            return Err(HostlibError::InvalidParameter {
591                builtin,
592                param: "changed_paths",
593                message: format!("expected list of strings, got {}", other.type_name()),
594            });
595        }
596    };
597    let mut out = Vec::with_capacity(list.len());
598    for item in list.iter() {
599        match item {
600            VmValue::String(s) => out.push(s.to_string()),
601            other => {
602                return Err(HostlibError::InvalidParameter {
603                    builtin,
604                    param: "changed_paths",
605                    message: format!("non-string entry: {}", other.type_name()),
606                });
607            }
608        }
609    }
610    Ok(Some(out))
611}
612
613fn scan_result_to_value(result: &ScanResult, delta: Option<&ScanDelta>) -> VmValue {
614    let mut entries: Vec<(&'static str, VmValue)> = vec![
615        ("snapshot_token", str_value(&result.snapshot_token)),
616        ("truncated", VmValue::Bool(result.truncated)),
617        ("project", project_to_value(&result.project)),
618        ("folders", list_of(&result.folders, folder_to_value)),
619        ("files", list_of(&result.files, file_to_value)),
620        ("symbols", list_of(&result.symbols, symbol_to_value)),
621        (
622            "dependencies",
623            list_of(&result.dependencies, dependency_to_value),
624        ),
625        (
626            "sub_projects",
627            list_of(&result.sub_projects, subproject_to_value),
628        ),
629        ("repo_map", str_value(&result.repo_map)),
630    ];
631    if let Some(d) = delta {
632        entries.push(("delta", delta_to_value(d)));
633    }
634    build_dict(entries)
635}
636
637fn list_of<T>(items: &[T], to_value: fn(&T) -> VmValue) -> VmValue {
638    let list: Vec<VmValue> = items.iter().map(to_value).collect();
639    VmValue::List(Arc::new(list))
640}
641
642fn project_to_value(project: &ProjectMetadata) -> VmValue {
643    let test_commands_entries: Vec<(String, VmValue)> = project
644        .test_commands
645        .iter()
646        .map(|(k, v)| (k.clone(), str_value(v)))
647        .collect();
648    let test_commands_dict = build_dict(test_commands_entries);
649
650    let detected: VmValue = project
651        .detected_test_command
652        .as_deref()
653        .map(str_value)
654        .unwrap_or(VmValue::Nil);
655
656    let code_patterns: Vec<VmValue> = project.code_patterns.iter().map(str_value).collect();
657
658    build_dict([
659        ("name", str_value(&project.name)),
660        ("root_path", str_value(&project.root_path)),
661        ("languages", list_of(&project.languages, language_to_value)),
662        ("test_commands", test_commands_dict),
663        ("detected_test_command", detected),
664        ("code_patterns", VmValue::List(Arc::new(code_patterns))),
665        ("total_files", VmValue::Int(project.total_files as i64)),
666        ("total_lines", VmValue::Int(project.total_lines as i64)),
667        ("last_scanned_at", str_value(&project.last_scanned_at)),
668    ])
669}
670
671fn language_to_value(stat: &LanguageStat) -> VmValue {
672    build_dict([
673        ("name", str_value(&stat.name)),
674        ("file_count", VmValue::Int(stat.file_count as i64)),
675        ("line_count", VmValue::Int(stat.line_count as i64)),
676        ("percentage", VmValue::Float(stat.percentage)),
677    ])
678}
679
680fn folder_to_value(folder: &FolderRecord) -> VmValue {
681    let names: Vec<VmValue> = folder.key_symbol_names.iter().map(str_value).collect();
682    build_dict([
683        ("id", str_value(&folder.id)),
684        ("relative_path", str_value(&folder.relative_path)),
685        ("file_count", VmValue::Int(folder.file_count as i64)),
686        ("line_count", VmValue::Int(folder.line_count as i64)),
687        ("dominant_language", str_value(&folder.dominant_language)),
688        ("key_symbol_names", VmValue::List(Arc::new(names))),
689    ])
690}
691
692fn file_to_value(file: &FileRecord) -> VmValue {
693    let imports: Vec<VmValue> = file.imports.iter().map(str_value).collect();
694    let test_pair = file
695        .corresponding_test_file
696        .as_deref()
697        .map(str_value)
698        .unwrap_or(VmValue::Nil);
699    build_dict([
700        ("id", str_value(&file.id)),
701        ("relative_path", str_value(&file.relative_path)),
702        ("file_name", str_value(&file.file_name)),
703        ("language", str_value(&file.language)),
704        ("line_count", VmValue::Int(file.line_count as i64)),
705        ("size_bytes", VmValue::Int(file.size_bytes as i64)),
706        (
707            "last_modified_unix_ms",
708            VmValue::Int(file.last_modified_unix_ms),
709        ),
710        ("imports", VmValue::List(Arc::new(imports))),
711        ("churn_score", VmValue::Float(file.churn_score)),
712        ("corresponding_test_file", test_pair),
713    ])
714}
715
716fn symbol_to_value(symbol: &SymbolRecord) -> VmValue {
717    let container = symbol
718        .container
719        .as_deref()
720        .map(str_value)
721        .unwrap_or(VmValue::Nil);
722    build_dict([
723        ("id", str_value(&symbol.id)),
724        ("name", str_value(&symbol.name)),
725        ("kind", str_value(symbol.kind.keyword())),
726        ("file_path", str_value(&symbol.file_path)),
727        ("line", VmValue::Int(symbol.line as i64)),
728        ("signature", str_value(&symbol.signature)),
729        ("container", container),
730        (
731            "reference_count",
732            VmValue::Int(symbol.reference_count as i64),
733        ),
734        ("importance_score", VmValue::Float(symbol.importance_score)),
735    ])
736}
737
738fn dependency_to_value(dep: &DependencyEdge) -> VmValue {
739    build_dict([
740        ("from_file", str_value(&dep.from_file)),
741        ("to_module", str_value(&dep.to_module)),
742    ])
743}
744
745fn subproject_to_value(sp: &SubProject) -> VmValue {
746    build_dict([
747        ("path", str_value(&sp.path)),
748        ("name", str_value(&sp.name)),
749        ("language", str_value(&sp.language)),
750        ("project_marker", str_value(&sp.project_marker)),
751    ])
752}
753
754fn delta_to_value(delta: &ScanDelta) -> VmValue {
755    let added: Vec<VmValue> = delta.added.iter().map(str_value).collect();
756    let modified: Vec<VmValue> = delta.modified.iter().map(str_value).collect();
757    let removed: Vec<VmValue> = delta.removed.iter().map(str_value).collect();
758    build_dict([
759        ("added", VmValue::List(Arc::new(added))),
760        ("modified", VmValue::List(Arc::new(modified))),
761        ("removed", VmValue::List(Arc::new(removed))),
762        ("full_rescan", VmValue::Bool(delta.full_rescan)),
763    ])
764}
765
#[cfg(test)]
mod tests {
    use super::*;
    use filetime::{set_file_mtime, FileTime};
    use std::fs;

    /// Relative path of the single source file every fixture project holds.
    const SEED_FILE: &str = "src/lib.rs";

    #[test]
    fn builtin_option_defaults_match_request_schemas() {
        let dict = std::collections::BTreeMap::new();

        let scan_project = parse_options(SCAN_PROJECT_BUILTIN, &dict).unwrap();
        let scan_incremental = parse_options(SCAN_INCREMENTAL_BUILTIN, &dict).unwrap();

        assert!(scan_project.include_git_history);
        assert!(!scan_incremental.include_git_history);
    }

    /// Names of every symbol in the scan result.
    fn symbol_names(scan: &IncrementalScan) -> Vec<String> {
        scan.result.symbols.iter().map(|s| s.name.clone()).collect()
    }

    /// Create a temp project whose only file is `src/lib.rs` with `contents`.
    ///
    /// Returns the temp-dir guard (keep it alive for the whole test), the
    /// path of the seeded file, and the snapshot token for the project.
    fn seed_project(contents: &str) -> (tempfile::TempDir, std::path::PathBuf, String) {
        let dir = tempfile::tempdir().unwrap();
        fs::create_dir_all(dir.path().join("src")).unwrap();
        let file = dir.path().join(SEED_FILE);
        fs::write(&file, contents).unwrap();

        // Canonicalize so the snapshot token matches across calls.
        let canonical = std::fs::canonicalize(dir.path()).unwrap();
        let token = canonical.to_string_lossy().to_string();
        (dir, file, token)
    }

    /// The mtime (unix ms) the scan recorded for the seeded file.
    fn indexed_mtime_ms(scan: &IncrementalScan) -> i64 {
        scan.result
            .files
            .iter()
            .find(|r| r.relative_path == SEED_FILE)
            .expect("seed file indexed")
            .last_modified_unix_ms
    }

    /// Force `file`'s mtime to `unix_ms`, simulating an edit the OS could
    /// not distinguish from the cached state by mtime.
    fn pin_mtime(file: &std::path::Path, unix_ms: i64) {
        let secs = unix_ms / 1000;
        let nanos = ((unix_ms % 1000) * 1_000_000) as u32;
        set_file_mtime(file, FileTime::from_unix_time(secs, nanos)).unwrap();
    }

    /// Regression guard: an agent that writes a file and re-scans in the same
    /// instant must see its own edit. `compute_delta`'s mtime comparison
    /// collides on same-millisecond/same-second writes (and on
    /// coarse-granularity filesystems), so the size-change fallback is what
    /// keeps same-turn index freshness honest. Before the fallback this
    /// returned the pre-edit symbol set, feeding fuzzy-match-stale loops on
    /// cheap local models.
    #[test]
    fn scan_incremental_detects_same_mtime_size_changing_edit() {
        let (_dir, file, token) = seed_project("pub fn old_symbol() {}\n");
        let opts = ScanProjectOptions::default();

        let first = scan_incremental(&token, None, opts.clone());
        let cached_mtime = indexed_mtime_ms(&first);
        assert!(symbol_names(&first).iter().any(|n| n == "old_symbol"));

        // Add a symbol (byte size grows), then force the mtime back to the
        // cached value to simulate a same-instant edit.
        fs::write(
            &file,
            "pub fn old_symbol() {}\npub fn brand_new_symbol() {}\n",
        )
        .unwrap();
        pin_mtime(&file, cached_mtime);

        let second = scan_incremental(&token, None, opts);
        let names = symbol_names(&second);
        assert!(
            names.iter().any(|n| n == "brand_new_symbol"),
            "same-mtime size-changing edit must be reindexed, got {names:?} (delta.modified={:?})",
            second.delta.modified,
        );
    }

    /// Companion guard: even when an edit changes nothing the scanner can
    /// cheaply detect (same mtime AND same byte size — e.g. a length-preserving
    /// one-character swap), passing the explicit `changed_paths` signal still
    /// forces the reindex. The agent loop threads its own write through this
    /// bypass, so freshness never depends on mtime/size heuristics for the
    /// agent's own edits.
    #[test]
    fn scan_incremental_changed_paths_bypasses_metadata_heuristics() {
        // 23 bytes.
        let (_dir, file, token) = seed_project("pub fn alpha_name() {}\n");
        let opts = ScanProjectOptions::default();

        let first = scan_incremental(&token, None, opts.clone());
        let cached_mtime = indexed_mtime_ms(&first);
        assert!(symbol_names(&first).iter().any(|n| n == "alpha_name"));

        // Length-preserving rename (same 23 bytes), same forced mtime: neither
        // the size nor the mtime heuristic can see this.
        fs::write(&file, "pub fn omega_name() {}\n").unwrap();
        pin_mtime(&file, cached_mtime);

        // Without an explicit signal the heuristics legitimately miss this
        // rare length-preserving same-instant case...
        let heuristic_only = scan_incremental(&token, None, opts.clone());
        assert!(
            !heuristic_only
                .delta
                .modified
                .contains(&SEED_FILE.to_string()),
            "documenting the heuristic's known blind spot",
        );

        // ...but the explicit changed-path signal the agent loop passes after
        // its own write forces the reindex regardless.
        let explicit = scan_incremental(&token, Some(&[SEED_FILE.to_string()]), opts);
        assert!(
            symbol_names(&explicit).iter().any(|n| n == "omega_name"),
            "explicit changed_paths must always reindex, got {:?}",
            symbol_names(&explicit),
        );
    }
}