1use std::path::{Path, PathBuf};
19use std::rc::Rc;
20use std::sync::Arc;
21use std::time::{SystemTime, UNIX_EPOCH};
22
23use harn_vm::VmValue;
24
25use crate::error::HostlibError;
26use crate::registry::{BuiltinRegistry, HostlibCapability, RegisteredBuiltin, SyncHandler};
27use crate::tools::args::{
28 build_dict, dict_arg, optional_bool, optional_int, require_string, str_value,
29};
30
31mod commands;
32mod discover;
33mod extensions;
34mod folders;
35mod imports;
36mod result;
37mod scoring;
38mod snapshot;
39mod subproject;
40mod symbols;
41mod test_mapping;
42
43pub use result::{
44 DependencyEdge, FileRecord, FolderRecord, LanguageStat, ProjectMetadata, ScanDelta, ScanResult,
45 SubProject, SymbolKind, SymbolRecord,
46};
47
/// Name under which the full-project scan handler is registered and
/// reported in its parameter errors.
const SCAN_PROJECT_BUILTIN: &str = "hostlib_scanner_scan_project";
/// Name under which the incremental scan handler is registered and
/// reported in its parameter errors.
const SCAN_INCREMENTAL_BUILTIN: &str = "hostlib_scanner_scan_incremental";
50
51#[derive(Default)]
53pub struct ScannerCapability;
54
55impl HostlibCapability for ScannerCapability {
56 fn module_name(&self) -> &'static str {
57 "scanner"
58 }
59
60 fn register_builtins(&self, registry: &mut BuiltinRegistry) {
61 let scan_project: SyncHandler = Arc::new(scan_project_handler);
62 registry.register(RegisteredBuiltin {
63 name: SCAN_PROJECT_BUILTIN,
64 module: "scanner",
65 method: "scan_project",
66 handler: scan_project,
67 });
68 let scan_incremental: SyncHandler = Arc::new(scan_incremental_handler);
69 registry.register(RegisteredBuiltin {
70 name: SCAN_INCREMENTAL_BUILTIN,
71 module: "scanner",
72 method: "scan_incremental",
73 handler: scan_incremental,
74 });
75 }
76}
77
/// Tunables accepted by [`scan_project`] and [`scan_incremental`].
#[derive(Debug, Clone)]
pub struct ScanProjectOptions {
    /// Forwarded to file discovery as its `include_hidden` option.
    pub include_hidden: bool,
    /// Forwarded to file discovery as its `respect_gitignore` option.
    pub respect_gitignore: bool,
    /// Upper bound on the number of discovered files that are processed;
    /// `0` disables the cap.
    pub max_files: usize,
    /// When `true`, churn scores are computed and applied to the files.
    pub include_git_history: bool,
    /// Budget handed to the repo-map builder.
    pub repo_map_token_budget: usize,
}

impl Default for ScanProjectOptions {
    /// Defaults: hidden files excluded, gitignore respected, no file cap,
    /// git history included, repo-map budget of 1200.
    fn default() -> Self {
        ScanProjectOptions {
            max_files: 0,
            repo_map_token_budget: 1200,
            include_hidden: false,
            respect_gitignore: true,
            include_git_history: true,
        }
    }
}
106
107pub fn scan_project(root: &Path, opts: ScanProjectOptions) -> ScanResult {
109 let canonical = canonicalize(root);
110 let discover_opts = discover::DiscoverOptions {
111 include_hidden: opts.include_hidden,
112 respect_gitignore: opts.respect_gitignore,
113 };
114 let mut discovered = discover::discover_files(&canonical, discover_opts);
115 let truncated = if opts.max_files > 0 && discovered.len() > opts.max_files {
116 discovered.truncate(opts.max_files);
117 true
118 } else {
119 false
120 };
121
122 let (mut files, mut symbols, mut dependencies) = extract_per_file(&discovered);
123
124 scoring::compute_reference_counts(&mut symbols, &files);
125
126 if opts.include_git_history {
127 let churn = scoring::compute_churn_scores(&canonical);
128 scoring::apply_churn(&mut files, &churn);
129 }
130 scoring::compute_importance_scores(&mut symbols, &files);
131
132 test_mapping::map_test_files(&mut files);
133
134 let folder_records = folders::build_folder_records(&files, &symbols);
135 let test_commands = commands::detect_test_commands(&canonical);
136 let code_patterns = commands::detect_code_patterns(&files, &canonical);
137 let project = folders::build_project_metadata(
138 &canonical,
139 &files,
140 test_commands,
141 code_patterns,
142 now_iso8601(),
143 );
144 let repo_map = folders::build_repo_map(&symbols, &files, opts.repo_map_token_budget);
145 let sub_projects = subproject::detect_subprojects(&canonical, 2);
146
147 sort_for_output(&mut files, &mut symbols, &mut dependencies);
148
149 let token = snapshot::root_to_token(&canonical);
150 let result = ScanResult {
151 snapshot_token: token,
152 truncated,
153 project,
154 folders: folder_records,
155 files,
156 symbols,
157 dependencies,
158 sub_projects,
159 repo_map,
160 };
161 snapshot::save(&canonical, &result);
162 result
163}
164
165#[derive(Clone, Debug)]
167pub struct IncrementalScan {
168 pub result: ScanResult,
170 pub delta: ScanDelta,
172}
173
174pub fn scan_incremental(
178 token: &str,
179 explicit_changed: Option<&[String]>,
180 opts: ScanProjectOptions,
181) -> IncrementalScan {
182 let root = snapshot::token_to_root(token);
183 let canonical = canonicalize(&root);
184
185 let cached = snapshot::load(&canonical);
186 let cached = match cached {
187 Some(c) => c,
188 None => {
189 let result = scan_project(&canonical, opts);
190 return IncrementalScan {
191 result,
192 delta: ScanDelta {
193 full_rescan: true,
194 ..ScanDelta::default()
195 },
196 };
197 }
198 };
199
200 let discover_opts = discover::DiscoverOptions {
201 include_hidden: opts.include_hidden,
202 respect_gitignore: opts.respect_gitignore,
203 };
204 let mut current = discover::discover_files(&canonical, discover_opts);
205 if opts.max_files > 0 && current.len() > opts.max_files {
206 current.truncate(opts.max_files);
207 }
208
209 let delta = compute_delta(¤t, &cached, explicit_changed);
210 let total = current.len();
211 let needs_full_rescan =
212 total > 0 && (delta.added.len() + delta.modified.len()) * 10 > total * 3;
213
214 if needs_full_rescan {
215 let result = scan_project(&canonical, opts);
216 return IncrementalScan {
217 result,
218 delta: ScanDelta {
219 full_rescan: true,
220 ..delta
221 },
222 };
223 }
224
225 if delta.added.is_empty() && delta.modified.is_empty() && delta.removed.is_empty() {
226 return IncrementalScan {
227 result: cached,
228 delta,
229 };
230 }
231
232 let mut files = cached.files;
234 let mut symbols = cached.symbols;
235 let mut dependencies = cached.dependencies;
236
237 let removed_set: std::collections::HashSet<&str> =
238 delta.removed.iter().map(|s| s.as_str()).collect();
239 let touched_set: std::collections::HashSet<&str> = delta
240 .added
241 .iter()
242 .chain(delta.modified.iter())
243 .map(|s| s.as_str())
244 .collect();
245
246 files.retain(|f| !removed_set.contains(f.relative_path.as_str()));
247 symbols.retain(|s| {
248 !removed_set.contains(s.file_path.as_str()) && !touched_set.contains(s.file_path.as_str())
249 });
250 dependencies.retain(|d| {
251 !removed_set.contains(d.from_file.as_str()) && !touched_set.contains(d.from_file.as_str())
252 });
253
254 let touched_entries: Vec<discover::DiscoveredFile> = current
255 .iter()
256 .filter(|e| touched_set.contains(e.relative_path.as_str()))
257 .cloned()
258 .collect();
259 let (new_files, new_symbols, new_deps) = extract_per_file(&touched_entries);
260
261 let mut by_path: std::collections::BTreeMap<String, FileRecord> = files
262 .into_iter()
263 .map(|f| (f.relative_path.clone(), f))
264 .collect();
265 for new_file in new_files {
266 by_path.insert(new_file.relative_path.clone(), new_file);
267 }
268 let mut files: Vec<FileRecord> = by_path.into_values().collect();
269 symbols.extend(new_symbols);
270 dependencies.extend(new_deps);
271
272 scoring::compute_reference_counts(&mut symbols, &files);
273 if opts.include_git_history {
274 let churn = scoring::compute_churn_scores(&canonical);
275 scoring::apply_churn(&mut files, &churn);
276 }
277 scoring::compute_importance_scores(&mut symbols, &files);
278 test_mapping::map_test_files(&mut files);
279
280 let folder_records = folders::build_folder_records(&files, &symbols);
281 let test_commands = commands::detect_test_commands(&canonical);
282 let code_patterns = commands::detect_code_patterns(&files, &canonical);
283 let project = folders::build_project_metadata(
284 &canonical,
285 &files,
286 test_commands,
287 code_patterns,
288 now_iso8601(),
289 );
290 let repo_map = folders::build_repo_map(&symbols, &files, opts.repo_map_token_budget);
291 let sub_projects = subproject::detect_subprojects(&canonical, 2);
292
293 sort_for_output(&mut files, &mut symbols, &mut dependencies);
294
295 let token = snapshot::root_to_token(&canonical);
296 let result = ScanResult {
297 snapshot_token: token,
298 truncated: cached.truncated,
299 project,
300 folders: folder_records,
301 files,
302 symbols,
303 dependencies,
304 sub_projects,
305 repo_map,
306 };
307 snapshot::save(&canonical, &result);
308 IncrementalScan { result, delta }
309}
310
/// Resolves `root` to its canonical form, returning `root` unchanged when
/// the filesystem cannot canonicalize it (for example, a missing path).
fn canonicalize(root: &Path) -> PathBuf {
    match std::fs::canonicalize(root) {
        Ok(resolved) => resolved,
        Err(_) => root.to_path_buf(),
    }
}
316
317fn extract_per_file(
318 discovered: &[discover::DiscoveredFile],
319) -> (Vec<FileRecord>, Vec<SymbolRecord>, Vec<DependencyEdge>) {
320 let mut files: Vec<FileRecord> = Vec::with_capacity(discovered.len());
321 let mut symbols: Vec<SymbolRecord> = Vec::new();
322 let mut dependencies: Vec<DependencyEdge> = Vec::new();
323
324 for entry in discovered {
325 let metadata = std::fs::metadata(&entry.absolute_path);
326 let size = metadata.as_ref().map(|m| m.len()).unwrap_or(0);
327 let modified = metadata
328 .as_ref()
329 .ok()
330 .and_then(|m| m.modified().ok())
331 .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
332 .map(|d| d.as_millis() as i64)
333 .unwrap_or(0);
334
335 let content = std::fs::read_to_string(&entry.absolute_path).unwrap_or_default();
336 if content.is_empty() && size != 0 {
337 }
339 let language = extensions::file_extension(&entry.relative_path);
340 let imports = imports::extract_imports(&content, &language);
341 let file_symbols = symbols::extract_symbols(&content, &language, &entry.relative_path);
342 let line_count = count_lines(&content);
343
344 for imp in &imports {
345 dependencies.push(DependencyEdge {
346 from_file: entry.relative_path.clone(),
347 to_module: imp.clone(),
348 });
349 }
350 symbols.extend(file_symbols);
351
352 files.push(FileRecord {
353 id: entry.relative_path.clone(),
354 relative_path: entry.relative_path.clone(),
355 file_name: extensions::file_name(&entry.relative_path).to_string(),
356 language,
357 line_count,
358 size_bytes: size,
359 last_modified_unix_ms: modified,
360 imports,
361 churn_score: 0.0,
362 corresponding_test_file: None,
363 });
364 }
365
366 (files, symbols, dependencies)
367}
368
/// Counts the lines in `content`.
///
/// Each `\n` terminates a line, and a non-empty final segment without a
/// trailing `\n` counts as one more line. Empty input has zero lines.
fn count_lines(content: &str) -> usize {
    // `split_inclusive` yields one piece per `\n`-terminated line plus a
    // final piece only when the text does not end with `\n`.
    content.split_inclusive('\n').count()
}
377
378fn sort_for_output(
379 files: &mut [FileRecord],
380 symbols: &mut [SymbolRecord],
381 dependencies: &mut [DependencyEdge],
382) {
383 files.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
384 symbols.sort_by(|a, b| a.id.cmp(&b.id));
385 dependencies.sort_by(|a, b| {
386 a.from_file
387 .cmp(&b.from_file)
388 .then_with(|| a.to_module.cmp(&b.to_module))
389 });
390}
391
392fn compute_delta(
393 current: &[discover::DiscoveredFile],
394 cached: &ScanResult,
395 explicit_changed: Option<&[String]>,
396) -> ScanDelta {
397 let cached_files: std::collections::BTreeMap<&str, &FileRecord> = cached
398 .files
399 .iter()
400 .map(|f| (f.relative_path.as_str(), f))
401 .collect();
402 let current_paths: std::collections::HashSet<&str> =
403 current.iter().map(|e| e.relative_path.as_str()).collect();
404
405 let added: Vec<String> = current
406 .iter()
407 .filter(|e| !cached_files.contains_key(e.relative_path.as_str()))
408 .map(|e| e.relative_path.clone())
409 .collect();
410 let removed: Vec<String> = cached
411 .files
412 .iter()
413 .filter(|f| !current_paths.contains(f.relative_path.as_str()))
414 .map(|f| f.relative_path.clone())
415 .collect();
416
417 let modified: Vec<String> = if let Some(explicit) = explicit_changed {
418 explicit
419 .iter()
420 .filter(|p| cached_files.contains_key(p.as_str()) && current_paths.contains(p.as_str()))
421 .cloned()
422 .collect()
423 } else {
424 let mut out = Vec::new();
425 for entry in current {
426 if let Some(prev) = cached_files.get(entry.relative_path.as_str()) {
427 let mtime = std::fs::metadata(&entry.absolute_path)
428 .ok()
429 .and_then(|m| m.modified().ok())
430 .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
431 .map(|d| d.as_millis() as i64)
432 .unwrap_or(0);
433 if mtime > prev.last_modified_unix_ms {
434 out.push(entry.relative_path.clone());
435 }
436 }
437 }
438 out
439 };
440
441 ScanDelta {
442 added,
443 modified,
444 removed,
445 full_rescan: false,
446 }
447}
448
449fn now_iso8601() -> String {
450 let now = SystemTime::now()
451 .duration_since(UNIX_EPOCH)
452 .unwrap_or_default();
453 let secs = now.as_secs() as i64;
454 let nanos = now.subsec_nanos();
455 let (year, month, day, hour, minute, second) = unix_to_civil(secs);
456 format!(
457 "{year:04}-{month:02}-{day:02}T{hour:02}:{minute:02}:{second:02}.{millis:03}Z",
458 millis = nanos / 1_000_000
459 )
460}
461
/// Splits a Unix timestamp (seconds) into
/// `(year, month, day, hour, minute, second)` in the proleptic Gregorian
/// calendar. Euclidean division keeps timestamps before 1970 correct.
fn unix_to_civil(secs: i64) -> (i64, u32, u32, u32, u32, u32) {
    const SECS_PER_DAY: i64 = 86_400;
    const DAYS_PER_ERA: i64 = 146_097; // days in one 400-year cycle

    let days = secs.div_euclid(SECS_PER_DAY);
    let secs_of_day = secs.rem_euclid(SECS_PER_DAY);
    let hour = (secs_of_day / 3600) as u32;
    let minute = (secs_of_day / 60 % 60) as u32;
    let second = (secs_of_day % 60) as u32;

    // Shift the day count so that eras begin on March 1st of year 0; the
    // leap day then falls at the very end of each shifted year.
    let shifted = days + 719_468;
    let era = shifted.div_euclid(DAYS_PER_ERA);
    let day_of_era = shifted.rem_euclid(DAYS_PER_ERA) as u64;
    let year_of_era =
        (day_of_era - day_of_era / 1460 + day_of_era / 36_524 - day_of_era / 146_096) / 365;
    let day_of_year = day_of_era - (365 * year_of_era + year_of_era / 4 - year_of_era / 100);

    // Month index counted from March (0 = March, ..., 11 = February).
    let march_based = (5 * day_of_year + 2) / 153;
    let day = (day_of_year - (153 * march_based + 2) / 5 + 1) as u32;
    let march_based = march_based as u32;
    let month = if march_based < 10 {
        march_based + 3
    } else {
        march_based - 9
    };

    // January and February belong to the following civil year.
    let year = year_of_era as i64 + era * 400 + i64::from(month <= 2);
    (year, month, day, hour, minute, second)
}
485
486fn scan_project_handler(args: &[VmValue]) -> Result<VmValue, HostlibError> {
489 let raw = dict_arg(SCAN_PROJECT_BUILTIN, args)?;
490 let dict = raw.as_ref();
491 let root = require_string(SCAN_PROJECT_BUILTIN, dict, "root")?;
492 let opts = parse_options(SCAN_PROJECT_BUILTIN, dict)?;
493 let result = scan_project(Path::new(&root), opts);
494 Ok(scan_result_to_value(&result, None))
495}
496
497fn scan_incremental_handler(args: &[VmValue]) -> Result<VmValue, HostlibError> {
498 let raw = dict_arg(SCAN_INCREMENTAL_BUILTIN, args)?;
499 let dict = raw.as_ref();
500 let token = require_string(SCAN_INCREMENTAL_BUILTIN, dict, "snapshot_token")?;
501 let opts = parse_options(SCAN_INCREMENTAL_BUILTIN, dict)?;
502 let changed = parse_changed_paths(SCAN_INCREMENTAL_BUILTIN, dict)?;
503 let scan = scan_incremental(&token, changed.as_deref(), opts);
504 Ok(scan_result_to_value(&scan.result, Some(&scan.delta)))
505}
506
507fn parse_options(
508 builtin: &'static str,
509 dict: &std::collections::BTreeMap<String, VmValue>,
510) -> Result<ScanProjectOptions, HostlibError> {
511 let include_hidden = optional_bool(builtin, dict, "include_hidden", false)?;
512 let respect_gitignore = optional_bool(builtin, dict, "respect_gitignore", true)?;
513 let max_files = optional_int(builtin, dict, "max_files", 0)?;
514 let include_git_history = optional_bool(builtin, dict, "include_git_history", true)?;
515 let repo_map_token_budget = optional_int(builtin, dict, "repo_map_token_budget", 1200)?;
516 if max_files < 0 {
517 return Err(HostlibError::InvalidParameter {
518 builtin,
519 param: "max_files",
520 message: "must be >= 0".to_string(),
521 });
522 }
523 if repo_map_token_budget < 0 {
524 return Err(HostlibError::InvalidParameter {
525 builtin,
526 param: "repo_map_token_budget",
527 message: "must be >= 0".to_string(),
528 });
529 }
530 Ok(ScanProjectOptions {
531 include_hidden,
532 respect_gitignore,
533 max_files: max_files as usize,
534 include_git_history,
535 repo_map_token_budget: repo_map_token_budget as usize,
536 })
537}
538
539fn parse_changed_paths(
540 builtin: &'static str,
541 dict: &std::collections::BTreeMap<String, VmValue>,
542) -> Result<Option<Vec<String>>, HostlibError> {
543 let value = match dict.get("changed_paths") {
544 None | Some(VmValue::Nil) => return Ok(None),
545 Some(v) => v,
546 };
547 let list = match value {
548 VmValue::List(items) => items,
549 other => {
550 return Err(HostlibError::InvalidParameter {
551 builtin,
552 param: "changed_paths",
553 message: format!("expected list of strings, got {}", other.type_name()),
554 });
555 }
556 };
557 let mut out = Vec::with_capacity(list.len());
558 for item in list.iter() {
559 match item {
560 VmValue::String(s) => out.push(s.to_string()),
561 other => {
562 return Err(HostlibError::InvalidParameter {
563 builtin,
564 param: "changed_paths",
565 message: format!("non-string entry: {}", other.type_name()),
566 });
567 }
568 }
569 }
570 Ok(Some(out))
571}
572
573fn scan_result_to_value(result: &ScanResult, delta: Option<&ScanDelta>) -> VmValue {
574 let mut entries: Vec<(&'static str, VmValue)> = vec![
575 ("snapshot_token", str_value(&result.snapshot_token)),
576 ("truncated", VmValue::Bool(result.truncated)),
577 ("project", project_to_value(&result.project)),
578 ("folders", list_of(&result.folders, folder_to_value)),
579 ("files", list_of(&result.files, file_to_value)),
580 ("symbols", list_of(&result.symbols, symbol_to_value)),
581 (
582 "dependencies",
583 list_of(&result.dependencies, dependency_to_value),
584 ),
585 (
586 "sub_projects",
587 list_of(&result.sub_projects, subproject_to_value),
588 ),
589 ("repo_map", str_value(&result.repo_map)),
590 ];
591 if let Some(d) = delta {
592 entries.push(("delta", delta_to_value(d)));
593 }
594 build_dict(entries)
595}
596
597fn list_of<T>(items: &[T], to_value: fn(&T) -> VmValue) -> VmValue {
598 let list: Vec<VmValue> = items.iter().map(to_value).collect();
599 VmValue::List(Rc::new(list))
600}
601
602fn project_to_value(project: &ProjectMetadata) -> VmValue {
603 let test_commands_entries: Vec<(String, VmValue)> = project
604 .test_commands
605 .iter()
606 .map(|(k, v)| (k.clone(), str_value(v)))
607 .collect();
608 let test_commands_dict = build_dict(test_commands_entries);
609
610 let detected: VmValue = project
611 .detected_test_command
612 .as_deref()
613 .map(str_value)
614 .unwrap_or(VmValue::Nil);
615
616 let code_patterns: Vec<VmValue> = project.code_patterns.iter().map(str_value).collect();
617
618 build_dict([
619 ("name", str_value(&project.name)),
620 ("root_path", str_value(&project.root_path)),
621 ("languages", list_of(&project.languages, language_to_value)),
622 ("test_commands", test_commands_dict),
623 ("detected_test_command", detected),
624 ("code_patterns", VmValue::List(Rc::new(code_patterns))),
625 ("total_files", VmValue::Int(project.total_files as i64)),
626 ("total_lines", VmValue::Int(project.total_lines as i64)),
627 ("last_scanned_at", str_value(&project.last_scanned_at)),
628 ])
629}
630
631fn language_to_value(stat: &LanguageStat) -> VmValue {
632 build_dict([
633 ("name", str_value(&stat.name)),
634 ("file_count", VmValue::Int(stat.file_count as i64)),
635 ("line_count", VmValue::Int(stat.line_count as i64)),
636 ("percentage", VmValue::Float(stat.percentage)),
637 ])
638}
639
640fn folder_to_value(folder: &FolderRecord) -> VmValue {
641 let names: Vec<VmValue> = folder.key_symbol_names.iter().map(str_value).collect();
642 build_dict([
643 ("id", str_value(&folder.id)),
644 ("relative_path", str_value(&folder.relative_path)),
645 ("file_count", VmValue::Int(folder.file_count as i64)),
646 ("line_count", VmValue::Int(folder.line_count as i64)),
647 ("dominant_language", str_value(&folder.dominant_language)),
648 ("key_symbol_names", VmValue::List(Rc::new(names))),
649 ])
650}
651
652fn file_to_value(file: &FileRecord) -> VmValue {
653 let imports: Vec<VmValue> = file.imports.iter().map(str_value).collect();
654 let test_pair = file
655 .corresponding_test_file
656 .as_deref()
657 .map(str_value)
658 .unwrap_or(VmValue::Nil);
659 build_dict([
660 ("id", str_value(&file.id)),
661 ("relative_path", str_value(&file.relative_path)),
662 ("file_name", str_value(&file.file_name)),
663 ("language", str_value(&file.language)),
664 ("line_count", VmValue::Int(file.line_count as i64)),
665 ("size_bytes", VmValue::Int(file.size_bytes as i64)),
666 (
667 "last_modified_unix_ms",
668 VmValue::Int(file.last_modified_unix_ms),
669 ),
670 ("imports", VmValue::List(Rc::new(imports))),
671 ("churn_score", VmValue::Float(file.churn_score)),
672 ("corresponding_test_file", test_pair),
673 ])
674}
675
676fn symbol_to_value(symbol: &SymbolRecord) -> VmValue {
677 let container = symbol
678 .container
679 .as_deref()
680 .map(str_value)
681 .unwrap_or(VmValue::Nil);
682 build_dict([
683 ("id", str_value(&symbol.id)),
684 ("name", str_value(&symbol.name)),
685 ("kind", str_value(symbol.kind.keyword())),
686 ("file_path", str_value(&symbol.file_path)),
687 ("line", VmValue::Int(symbol.line as i64)),
688 ("signature", str_value(&symbol.signature)),
689 ("container", container),
690 (
691 "reference_count",
692 VmValue::Int(symbol.reference_count as i64),
693 ),
694 ("importance_score", VmValue::Float(symbol.importance_score)),
695 ])
696}
697
698fn dependency_to_value(dep: &DependencyEdge) -> VmValue {
699 build_dict([
700 ("from_file", str_value(&dep.from_file)),
701 ("to_module", str_value(&dep.to_module)),
702 ])
703}
704
705fn subproject_to_value(sp: &SubProject) -> VmValue {
706 build_dict([
707 ("path", str_value(&sp.path)),
708 ("name", str_value(&sp.name)),
709 ("language", str_value(&sp.language)),
710 ("project_marker", str_value(&sp.project_marker)),
711 ])
712}
713
714fn delta_to_value(delta: &ScanDelta) -> VmValue {
715 let added: Vec<VmValue> = delta.added.iter().map(str_value).collect();
716 let modified: Vec<VmValue> = delta.modified.iter().map(str_value).collect();
717 let removed: Vec<VmValue> = delta.removed.iter().map(str_value).collect();
718 build_dict([
719 ("added", VmValue::List(Rc::new(added))),
720 ("modified", VmValue::List(Rc::new(modified))),
721 ("removed", VmValue::List(Rc::new(removed))),
722 ("full_rescan", VmValue::Bool(delta.full_rescan)),
723 ])
724}