1use std::path::{Path, PathBuf};
17use std::rc::Rc;
18use std::sync::Arc;
19use std::time::{SystemTime, UNIX_EPOCH};
20
21use harn_vm::VmValue;
22
23use crate::error::HostlibError;
24use crate::registry::{BuiltinRegistry, HostlibCapability, RegisteredBuiltin, SyncHandler};
25use crate::tools::args::{
26 build_dict, dict_arg, optional_bool, optional_int, require_string, str_value,
27};
28
29mod commands;
30mod discover;
31mod extensions;
32mod folders;
33mod imports;
34mod result;
35mod scoring;
36mod snapshot;
37mod subproject;
38mod symbols;
39mod test_mapping;
40
41pub use result::{
42 DependencyEdge, FileRecord, FolderRecord, LanguageStat, ProjectMetadata, ScanDelta, ScanResult,
43 SubProject, SymbolKind, SymbolRecord,
44};
45
/// Registry name of the full-scan builtin; also passed to the argument helpers
/// so their errors name the builtin that rejected the input.
const SCAN_PROJECT_BUILTIN: &str = "hostlib_scanner_scan_project";
/// Registry name of the incremental-scan builtin; also passed to the argument
/// helpers so their errors name the builtin that rejected the input.
const SCAN_INCREMENTAL_BUILTIN: &str = "hostlib_scanner_scan_incremental";
48
49#[derive(Default)]
51pub struct ScannerCapability;
52
53impl HostlibCapability for ScannerCapability {
54 fn module_name(&self) -> &'static str {
55 "scanner"
56 }
57
58 fn register_builtins(&self, registry: &mut BuiltinRegistry) {
59 let scan_project: SyncHandler = Arc::new(scan_project_handler);
60 registry.register(RegisteredBuiltin {
61 name: SCAN_PROJECT_BUILTIN,
62 module: "scanner",
63 method: "scan_project",
64 handler: scan_project,
65 });
66 let scan_incremental: SyncHandler = Arc::new(scan_incremental_handler);
67 registry.register(RegisteredBuiltin {
68 name: SCAN_INCREMENTAL_BUILTIN,
69 module: "scanner",
70 method: "scan_incremental",
71 handler: scan_incremental,
72 });
73 }
74}
75
/// Knobs accepted by [`scan_project`] and [`scan_incremental`].
#[derive(Clone, Debug)]
pub struct ScanProjectOptions {
    /// Forwarded to file discovery as `DiscoverOptions::include_hidden`.
    pub include_hidden: bool,
    /// Forwarded to file discovery as `DiscoverOptions::respect_gitignore`.
    pub respect_gitignore: bool,
    /// Upper bound on the number of discovered files that are processed.
    /// `0` disables the cap.
    pub max_files: usize,
    /// When `true`, churn scores are computed for the root and applied to the
    /// file records.
    pub include_git_history: bool,
    /// Budget handed to the repo-map builder.
    pub repo_map_token_budget: usize,
}

impl Default for ScanProjectOptions {
    /// Hidden files excluded, `.gitignore` respected, no file cap, git history
    /// included, and a repo-map budget of 1200.
    fn default() -> Self {
        const NO_FILE_CAP: usize = 0;
        const DEFAULT_REPO_MAP_TOKEN_BUDGET: usize = 1200;

        Self {
            include_hidden: false,
            respect_gitignore: true,
            max_files: NO_FILE_CAP,
            include_git_history: true,
            repo_map_token_budget: DEFAULT_REPO_MAP_TOKEN_BUDGET,
        }
    }
}
104
105pub fn scan_project(root: &Path, opts: ScanProjectOptions) -> ScanResult {
107 let canonical = canonicalize(root);
108 let discover_opts = discover::DiscoverOptions {
109 include_hidden: opts.include_hidden,
110 respect_gitignore: opts.respect_gitignore,
111 };
112 let mut discovered = discover::discover_files(&canonical, discover_opts);
113 let truncated = if opts.max_files > 0 && discovered.len() > opts.max_files {
114 discovered.truncate(opts.max_files);
115 true
116 } else {
117 false
118 };
119
120 let (mut files, mut symbols, mut dependencies) = extract_per_file(&discovered);
121
122 scoring::compute_reference_counts(&mut symbols, &files);
123
124 if opts.include_git_history {
125 let churn = scoring::compute_churn_scores(&canonical);
126 scoring::apply_churn(&mut files, &churn);
127 }
128 scoring::compute_importance_scores(&mut symbols, &files);
129
130 test_mapping::map_test_files(&mut files);
131
132 let folder_records = folders::build_folder_records(&files, &symbols);
133 let test_commands = commands::detect_test_commands(&canonical);
134 let code_patterns = commands::detect_code_patterns(&files, &canonical);
135 let project = folders::build_project_metadata(
136 &canonical,
137 &files,
138 test_commands,
139 code_patterns,
140 now_iso8601(),
141 );
142 let repo_map = folders::build_repo_map(&symbols, &files, opts.repo_map_token_budget);
143 let sub_projects = subproject::detect_subprojects(&canonical, 2);
144
145 sort_for_output(&mut files, &mut symbols, &mut dependencies);
146
147 let token = snapshot::root_to_token(&canonical);
148 let result = ScanResult {
149 snapshot_token: token,
150 truncated,
151 project,
152 folders: folder_records,
153 files,
154 symbols,
155 dependencies,
156 sub_projects,
157 repo_map,
158 };
159 snapshot::save(&canonical, &result);
160 result
161}
162
/// Outcome of [`scan_incremental`]: the up-to-date scan plus a description of
/// what changed relative to the cached snapshot.
#[derive(Clone, Debug)]
pub struct IncrementalScan {
    /// The scan result after the incremental pass. This is the cached result
    /// itself when nothing changed, or a fresh full scan when one was needed.
    pub result: ScanResult,
    /// Added, modified and removed paths, and whether a full rescan was run.
    pub delta: ScanDelta,
}
171
172pub fn scan_incremental(
176 token: &str,
177 explicit_changed: Option<&[String]>,
178 opts: ScanProjectOptions,
179) -> IncrementalScan {
180 let root = snapshot::token_to_root(token);
181 let canonical = canonicalize(&root);
182
183 let cached = snapshot::load(&canonical);
184 let cached = match cached {
185 Some(c) => c,
186 None => {
187 let result = scan_project(&canonical, opts);
188 return IncrementalScan {
189 result,
190 delta: ScanDelta {
191 full_rescan: true,
192 ..ScanDelta::default()
193 },
194 };
195 }
196 };
197
198 let discover_opts = discover::DiscoverOptions {
199 include_hidden: opts.include_hidden,
200 respect_gitignore: opts.respect_gitignore,
201 };
202 let mut current = discover::discover_files(&canonical, discover_opts);
203 if opts.max_files > 0 && current.len() > opts.max_files {
204 current.truncate(opts.max_files);
205 }
206
207 let delta = compute_delta(¤t, &cached, explicit_changed);
208 let total = current.len();
209 let needs_full_rescan =
210 total > 0 && (delta.added.len() + delta.modified.len()) * 10 > total * 3;
211
212 if needs_full_rescan {
213 let result = scan_project(&canonical, opts);
214 return IncrementalScan {
215 result,
216 delta: ScanDelta {
217 full_rescan: true,
218 ..delta
219 },
220 };
221 }
222
223 if delta.added.is_empty() && delta.modified.is_empty() && delta.removed.is_empty() {
224 return IncrementalScan {
225 result: cached,
226 delta,
227 };
228 }
229
230 let mut files = cached.files;
232 let mut symbols = cached.symbols;
233 let mut dependencies = cached.dependencies;
234
235 let removed_set: std::collections::HashSet<&str> =
236 delta.removed.iter().map(|s| s.as_str()).collect();
237 let touched_set: std::collections::HashSet<&str> = delta
238 .added
239 .iter()
240 .chain(delta.modified.iter())
241 .map(|s| s.as_str())
242 .collect();
243
244 files.retain(|f| !removed_set.contains(f.relative_path.as_str()));
245 symbols.retain(|s| {
246 !removed_set.contains(s.file_path.as_str()) && !touched_set.contains(s.file_path.as_str())
247 });
248 dependencies.retain(|d| {
249 !removed_set.contains(d.from_file.as_str()) && !touched_set.contains(d.from_file.as_str())
250 });
251
252 let touched_entries: Vec<discover::DiscoveredFile> = current
253 .iter()
254 .filter(|e| touched_set.contains(e.relative_path.as_str()))
255 .cloned()
256 .collect();
257 let (new_files, new_symbols, new_deps) = extract_per_file(&touched_entries);
258
259 let mut by_path: std::collections::BTreeMap<String, FileRecord> = files
260 .into_iter()
261 .map(|f| (f.relative_path.clone(), f))
262 .collect();
263 for new_file in new_files {
264 by_path.insert(new_file.relative_path.clone(), new_file);
265 }
266 let mut files: Vec<FileRecord> = by_path.into_values().collect();
267 symbols.extend(new_symbols);
268 dependencies.extend(new_deps);
269
270 scoring::compute_reference_counts(&mut symbols, &files);
271 if opts.include_git_history {
272 let churn = scoring::compute_churn_scores(&canonical);
273 scoring::apply_churn(&mut files, &churn);
274 }
275 scoring::compute_importance_scores(&mut symbols, &files);
276 test_mapping::map_test_files(&mut files);
277
278 let folder_records = folders::build_folder_records(&files, &symbols);
279 let test_commands = commands::detect_test_commands(&canonical);
280 let code_patterns = commands::detect_code_patterns(&files, &canonical);
281 let project = folders::build_project_metadata(
282 &canonical,
283 &files,
284 test_commands,
285 code_patterns,
286 now_iso8601(),
287 );
288 let repo_map = folders::build_repo_map(&symbols, &files, opts.repo_map_token_budget);
289 let sub_projects = subproject::detect_subprojects(&canonical, 2);
290
291 sort_for_output(&mut files, &mut symbols, &mut dependencies);
292
293 let token = snapshot::root_to_token(&canonical);
294 let result = ScanResult {
295 snapshot_token: token,
296 truncated: cached.truncated,
297 project,
298 folders: folder_records,
299 files,
300 symbols,
301 dependencies,
302 sub_projects,
303 repo_map,
304 };
305 snapshot::save(&canonical, &result);
306 IncrementalScan { result, delta }
307}
308
/// Resolves `root` to a canonical absolute path, returning `root` unchanged
/// (as an owned `PathBuf`) when the filesystem cannot canonicalize it.
fn canonicalize(root: &Path) -> PathBuf {
    match std::fs::canonicalize(root) {
        Ok(resolved) => resolved,
        Err(_) => root.to_path_buf(),
    }
}
314
315fn extract_per_file(
316 discovered: &[discover::DiscoveredFile],
317) -> (Vec<FileRecord>, Vec<SymbolRecord>, Vec<DependencyEdge>) {
318 let mut files: Vec<FileRecord> = Vec::with_capacity(discovered.len());
319 let mut symbols: Vec<SymbolRecord> = Vec::new();
320 let mut dependencies: Vec<DependencyEdge> = Vec::new();
321
322 for entry in discovered {
323 let metadata = std::fs::metadata(&entry.absolute_path);
324 let size = metadata.as_ref().map(|m| m.len()).unwrap_or(0);
325 let modified = metadata
326 .as_ref()
327 .ok()
328 .and_then(|m| m.modified().ok())
329 .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
330 .map(|d| d.as_millis() as i64)
331 .unwrap_or(0);
332
333 let content = std::fs::read_to_string(&entry.absolute_path).unwrap_or_default();
334 if content.is_empty() && size != 0 {
335 }
337 let language = extensions::file_extension(&entry.relative_path);
338 let imports = imports::extract_imports(&content, &language);
339 let file_symbols = symbols::extract_symbols(&content, &language, &entry.relative_path);
340 let line_count = count_lines(&content);
341
342 for imp in &imports {
343 dependencies.push(DependencyEdge {
344 from_file: entry.relative_path.clone(),
345 to_module: imp.clone(),
346 });
347 }
348 symbols.extend(file_symbols);
349
350 files.push(FileRecord {
351 id: entry.relative_path.clone(),
352 relative_path: entry.relative_path.clone(),
353 file_name: extensions::file_name(&entry.relative_path).to_string(),
354 language,
355 line_count,
356 size_bytes: size,
357 last_modified_unix_ms: modified,
358 imports,
359 churn_score: 0.0,
360 corresponding_test_file: None,
361 });
362 }
363
364 (files, symbols, dependencies)
365}
366
/// Counts the lines in `content`.
///
/// Every `\n` terminates a line, and a final run of characters without a
/// trailing `\n` counts as one more line. Empty input has zero lines.
fn count_lines(content: &str) -> usize {
    // `str::lines` yields one item per `\n`-terminated segment plus one for an
    // unterminated tail, and nothing for an empty string.
    content.lines().count()
}
375
376fn sort_for_output(
377 files: &mut [FileRecord],
378 symbols: &mut [SymbolRecord],
379 dependencies: &mut [DependencyEdge],
380) {
381 files.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
382 symbols.sort_by(|a, b| a.id.cmp(&b.id));
383 dependencies.sort_by(|a, b| {
384 a.from_file
385 .cmp(&b.from_file)
386 .then_with(|| a.to_module.cmp(&b.to_module))
387 });
388}
389
390fn compute_delta(
391 current: &[discover::DiscoveredFile],
392 cached: &ScanResult,
393 explicit_changed: Option<&[String]>,
394) -> ScanDelta {
395 let cached_files: std::collections::BTreeMap<&str, &FileRecord> = cached
396 .files
397 .iter()
398 .map(|f| (f.relative_path.as_str(), f))
399 .collect();
400 let current_paths: std::collections::HashSet<&str> =
401 current.iter().map(|e| e.relative_path.as_str()).collect();
402
403 let added: Vec<String> = current
404 .iter()
405 .filter(|e| !cached_files.contains_key(e.relative_path.as_str()))
406 .map(|e| e.relative_path.clone())
407 .collect();
408 let removed: Vec<String> = cached
409 .files
410 .iter()
411 .filter(|f| !current_paths.contains(f.relative_path.as_str()))
412 .map(|f| f.relative_path.clone())
413 .collect();
414
415 let modified: Vec<String> = if let Some(explicit) = explicit_changed {
416 explicit
417 .iter()
418 .filter(|p| cached_files.contains_key(p.as_str()) && current_paths.contains(p.as_str()))
419 .cloned()
420 .collect()
421 } else {
422 let mut out = Vec::new();
423 for entry in current {
424 if let Some(prev) = cached_files.get(entry.relative_path.as_str()) {
425 let mtime = std::fs::metadata(&entry.absolute_path)
426 .ok()
427 .and_then(|m| m.modified().ok())
428 .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
429 .map(|d| d.as_millis() as i64)
430 .unwrap_or(0);
431 if mtime > prev.last_modified_unix_ms {
432 out.push(entry.relative_path.clone());
433 }
434 }
435 }
436 out
437 };
438
439 ScanDelta {
440 added,
441 modified,
442 removed,
443 full_rescan: false,
444 }
445}
446
447fn now_iso8601() -> String {
448 let now = SystemTime::now()
449 .duration_since(UNIX_EPOCH)
450 .unwrap_or_default();
451 let secs = now.as_secs() as i64;
452 let nanos = now.subsec_nanos();
453 let (year, month, day, hour, minute, second) = unix_to_civil(secs);
454 format!(
455 "{year:04}-{month:02}-{day:02}T{hour:02}:{minute:02}:{second:02}.{millis:03}Z",
456 millis = nanos / 1_000_000
457 )
458}
459
/// Converts seconds since the Unix epoch into
/// `(year, month, day, hour, minute, second)` in the proleptic Gregorian
/// calendar (UTC). Negative inputs (times before 1970) are supported.
///
/// The date part works on 400-year eras that start on 1 March, which places
/// the leap day at the very end of each shifted year.
fn unix_to_civil(secs: i64) -> (i64, u32, u32, u32, u32, u32) {
    const SECS_PER_DAY: i64 = 86_400;
    const DAYS_PER_ERA: i64 = 146_097;

    // Euclidean division keeps the time of day non-negative before the epoch.
    let days = secs.div_euclid(SECS_PER_DAY);
    let time_of_day = secs.rem_euclid(SECS_PER_DAY);
    let hour = time_of_day / 3600;
    let minute = time_of_day % 3600 / 60;
    let second = time_of_day % 60;

    // Re-base the day count onto 0000-03-01.
    let shifted = days + 719_468;
    let era = shifted.div_euclid(DAYS_PER_ERA);
    let day_of_era = shifted.rem_euclid(DAYS_PER_ERA);
    let year_of_era =
        (day_of_era - day_of_era / 1460 + day_of_era / 36_524 - day_of_era / 146_096) / 365;
    let day_of_year = day_of_era - (365 * year_of_era + year_of_era / 4 - year_of_era / 100);

    // Month index counted from March (0 = March, ..., 11 = February).
    let march_based_month = (5 * day_of_year + 2) / 153;
    let day = day_of_year - (153 * march_based_month + 2) / 5 + 1;

    // January and February belong to the following civil year.
    let (month, year_carry) = if march_based_month < 10 {
        (march_based_month + 3, 0)
    } else {
        (march_based_month - 9, 1)
    };
    let year = year_of_era + era * 400 + year_carry;

    (
        year,
        month as u32,
        day as u32,
        hour as u32,
        minute as u32,
        second as u32,
    )
}
483
484fn scan_project_handler(args: &[VmValue]) -> Result<VmValue, HostlibError> {
487 let raw = dict_arg(SCAN_PROJECT_BUILTIN, args)?;
488 let dict = raw.as_ref();
489 let root = require_string(SCAN_PROJECT_BUILTIN, dict, "root")?;
490 let opts = parse_options(SCAN_PROJECT_BUILTIN, dict)?;
491 let result = scan_project(Path::new(&root), opts);
492 Ok(scan_result_to_value(&result, None))
493}
494
495fn scan_incremental_handler(args: &[VmValue]) -> Result<VmValue, HostlibError> {
496 let raw = dict_arg(SCAN_INCREMENTAL_BUILTIN, args)?;
497 let dict = raw.as_ref();
498 let token = require_string(SCAN_INCREMENTAL_BUILTIN, dict, "snapshot_token")?;
499 let opts = parse_options(SCAN_INCREMENTAL_BUILTIN, dict)?;
500 let changed = parse_changed_paths(SCAN_INCREMENTAL_BUILTIN, dict)?;
501 let scan = scan_incremental(&token, changed.as_deref(), opts);
502 Ok(scan_result_to_value(&scan.result, Some(&scan.delta)))
503}
504
505fn parse_options(
506 builtin: &'static str,
507 dict: &std::collections::BTreeMap<String, VmValue>,
508) -> Result<ScanProjectOptions, HostlibError> {
509 let include_hidden = optional_bool(builtin, dict, "include_hidden", false)?;
510 let respect_gitignore = optional_bool(builtin, dict, "respect_gitignore", true)?;
511 let max_files = optional_int(builtin, dict, "max_files", 0)?;
512 let include_git_history = optional_bool(builtin, dict, "include_git_history", true)?;
513 let repo_map_token_budget = optional_int(builtin, dict, "repo_map_token_budget", 1200)?;
514 if max_files < 0 {
515 return Err(HostlibError::InvalidParameter {
516 builtin,
517 param: "max_files",
518 message: "must be >= 0".to_string(),
519 });
520 }
521 if repo_map_token_budget < 0 {
522 return Err(HostlibError::InvalidParameter {
523 builtin,
524 param: "repo_map_token_budget",
525 message: "must be >= 0".to_string(),
526 });
527 }
528 Ok(ScanProjectOptions {
529 include_hidden,
530 respect_gitignore,
531 max_files: max_files as usize,
532 include_git_history,
533 repo_map_token_budget: repo_map_token_budget as usize,
534 })
535}
536
537fn parse_changed_paths(
538 builtin: &'static str,
539 dict: &std::collections::BTreeMap<String, VmValue>,
540) -> Result<Option<Vec<String>>, HostlibError> {
541 let value = match dict.get("changed_paths") {
542 None | Some(VmValue::Nil) => return Ok(None),
543 Some(v) => v,
544 };
545 let list = match value {
546 VmValue::List(items) => items,
547 other => {
548 return Err(HostlibError::InvalidParameter {
549 builtin,
550 param: "changed_paths",
551 message: format!("expected list of strings, got {}", other.type_name()),
552 });
553 }
554 };
555 let mut out = Vec::with_capacity(list.len());
556 for item in list.iter() {
557 match item {
558 VmValue::String(s) => out.push(s.to_string()),
559 other => {
560 return Err(HostlibError::InvalidParameter {
561 builtin,
562 param: "changed_paths",
563 message: format!("non-string entry: {}", other.type_name()),
564 });
565 }
566 }
567 }
568 Ok(Some(out))
569}
570
571fn scan_result_to_value(result: &ScanResult, delta: Option<&ScanDelta>) -> VmValue {
572 let mut entries: Vec<(&'static str, VmValue)> = vec![
573 ("snapshot_token", str_value(&result.snapshot_token)),
574 ("truncated", VmValue::Bool(result.truncated)),
575 ("project", project_to_value(&result.project)),
576 ("folders", list_of(&result.folders, folder_to_value)),
577 ("files", list_of(&result.files, file_to_value)),
578 ("symbols", list_of(&result.symbols, symbol_to_value)),
579 (
580 "dependencies",
581 list_of(&result.dependencies, dependency_to_value),
582 ),
583 (
584 "sub_projects",
585 list_of(&result.sub_projects, subproject_to_value),
586 ),
587 ("repo_map", str_value(&result.repo_map)),
588 ];
589 if let Some(d) = delta {
590 entries.push(("delta", delta_to_value(d)));
591 }
592 build_dict(entries)
593}
594
595fn list_of<T>(items: &[T], to_value: fn(&T) -> VmValue) -> VmValue {
596 let list: Vec<VmValue> = items.iter().map(to_value).collect();
597 VmValue::List(Rc::new(list))
598}
599
600fn project_to_value(project: &ProjectMetadata) -> VmValue {
601 let test_commands_entries: Vec<(String, VmValue)> = project
602 .test_commands
603 .iter()
604 .map(|(k, v)| (k.clone(), str_value(v)))
605 .collect();
606 let test_commands_dict = build_dict(test_commands_entries);
607
608 let detected: VmValue = project
609 .detected_test_command
610 .as_deref()
611 .map(str_value)
612 .unwrap_or(VmValue::Nil);
613
614 let code_patterns: Vec<VmValue> = project.code_patterns.iter().map(str_value).collect();
615
616 build_dict([
617 ("name", str_value(&project.name)),
618 ("root_path", str_value(&project.root_path)),
619 ("languages", list_of(&project.languages, language_to_value)),
620 ("test_commands", test_commands_dict),
621 ("detected_test_command", detected),
622 ("code_patterns", VmValue::List(Rc::new(code_patterns))),
623 ("total_files", VmValue::Int(project.total_files as i64)),
624 ("total_lines", VmValue::Int(project.total_lines as i64)),
625 ("last_scanned_at", str_value(&project.last_scanned_at)),
626 ])
627}
628
629fn language_to_value(stat: &LanguageStat) -> VmValue {
630 build_dict([
631 ("name", str_value(&stat.name)),
632 ("file_count", VmValue::Int(stat.file_count as i64)),
633 ("line_count", VmValue::Int(stat.line_count as i64)),
634 ("percentage", VmValue::Float(stat.percentage)),
635 ])
636}
637
638fn folder_to_value(folder: &FolderRecord) -> VmValue {
639 let names: Vec<VmValue> = folder.key_symbol_names.iter().map(str_value).collect();
640 build_dict([
641 ("id", str_value(&folder.id)),
642 ("relative_path", str_value(&folder.relative_path)),
643 ("file_count", VmValue::Int(folder.file_count as i64)),
644 ("line_count", VmValue::Int(folder.line_count as i64)),
645 ("dominant_language", str_value(&folder.dominant_language)),
646 ("key_symbol_names", VmValue::List(Rc::new(names))),
647 ])
648}
649
650fn file_to_value(file: &FileRecord) -> VmValue {
651 let imports: Vec<VmValue> = file.imports.iter().map(str_value).collect();
652 let test_pair = file
653 .corresponding_test_file
654 .as_deref()
655 .map(str_value)
656 .unwrap_or(VmValue::Nil);
657 build_dict([
658 ("id", str_value(&file.id)),
659 ("relative_path", str_value(&file.relative_path)),
660 ("file_name", str_value(&file.file_name)),
661 ("language", str_value(&file.language)),
662 ("line_count", VmValue::Int(file.line_count as i64)),
663 ("size_bytes", VmValue::Int(file.size_bytes as i64)),
664 (
665 "last_modified_unix_ms",
666 VmValue::Int(file.last_modified_unix_ms),
667 ),
668 ("imports", VmValue::List(Rc::new(imports))),
669 ("churn_score", VmValue::Float(file.churn_score)),
670 ("corresponding_test_file", test_pair),
671 ])
672}
673
674fn symbol_to_value(symbol: &SymbolRecord) -> VmValue {
675 let container = symbol
676 .container
677 .as_deref()
678 .map(str_value)
679 .unwrap_or(VmValue::Nil);
680 build_dict([
681 ("id", str_value(&symbol.id)),
682 ("name", str_value(&symbol.name)),
683 ("kind", str_value(symbol.kind.keyword())),
684 ("file_path", str_value(&symbol.file_path)),
685 ("line", VmValue::Int(symbol.line as i64)),
686 ("signature", str_value(&symbol.signature)),
687 ("container", container),
688 (
689 "reference_count",
690 VmValue::Int(symbol.reference_count as i64),
691 ),
692 ("importance_score", VmValue::Float(symbol.importance_score)),
693 ])
694}
695
696fn dependency_to_value(dep: &DependencyEdge) -> VmValue {
697 build_dict([
698 ("from_file", str_value(&dep.from_file)),
699 ("to_module", str_value(&dep.to_module)),
700 ])
701}
702
703fn subproject_to_value(sp: &SubProject) -> VmValue {
704 build_dict([
705 ("path", str_value(&sp.path)),
706 ("name", str_value(&sp.name)),
707 ("language", str_value(&sp.language)),
708 ("project_marker", str_value(&sp.project_marker)),
709 ])
710}
711
712fn delta_to_value(delta: &ScanDelta) -> VmValue {
713 let added: Vec<VmValue> = delta.added.iter().map(str_value).collect();
714 let modified: Vec<VmValue> = delta.modified.iter().map(str_value).collect();
715 let removed: Vec<VmValue> = delta.removed.iter().map(str_value).collect();
716 build_dict([
717 ("added", VmValue::List(Rc::new(added))),
718 ("modified", VmValue::List(Rc::new(modified))),
719 ("removed", VmValue::List(Rc::new(removed))),
720 ("full_rescan", VmValue::Bool(delta.full_rescan)),
721 ])
722}