1use std::collections::{HashMap, HashSet};
30use std::num::NonZeroUsize;
31use std::path::{Path, PathBuf};
32
33use anyhow::{Context, Result};
34use lru::LruCache;
35use petgraph::graph::DiGraph;
36use rayon::prelude::*;
37use tracing::{debug, info};
38
39use crate::db::Database;
40use crate::git_scanner;
41use crate::graph_builder;
42use crate::models::{CommitInfo, DependencyEdge, GraphSnapshot};
43use crate::parser;
44use crate::scoring;
45
/// Returns `true` when `path` looks like test/fixture/example material that
/// should be excluded from the dependency graph.
///
/// Matching is case-insensitive and treats `\` as `/` so Windows-style paths
/// behave the same as POSIX ones. Three checks are applied in order:
/// a test-ish directory anywhere in the path, a test-ish directory as the
/// first component, and finally test-suffixed / `test_`-prefixed file names.
fn is_test_path(path: &Path) -> bool {
    // Normalized full path: lowercase, forward slashes only.
    let normalized = path.to_string_lossy().to_ascii_lowercase().replace('\\', "/");

    // Test-ish directories appearing anywhere (as an interior component).
    const TEST_DIRS: &[&str] = &[
        "/test/",
        "/tests/",
        "/testdata/",
        "/test_data/",
        "/__tests__/",
        "/spec/",
        "/fixtures/",
        "/fixture/",
        "/examples/",
        "/example/",
        "/benchmarks/",
        "/bench/",
        "/testutil/",
        "/testing/",
        "/mock/",
        "/mocks/",
        "/snapshots/",
        "/e2e/",
    ];
    if TEST_DIRS.iter().any(|dir| normalized.contains(dir)) {
        return true;
    }

    // The same directories when they are the leading path component
    // (the interior check above requires a slash on both sides).
    const TEST_DIR_PREFIXES: &[&str] = &[
        "test/",
        "tests/",
        "testdata/",
        "test_data/",
        "__tests__/",
        "spec/",
        "fixtures/",
        "fixture/",
        "examples/",
        "example/",
        "benchmarks/",
        "bench/",
        ".github/",
    ];
    if TEST_DIR_PREFIXES.iter().any(|pre| normalized.starts_with(pre)) {
        return true;
    }

    // File-name conventions: `foo_test.go`, `foo.test.ts`, `foo.spec.tsx`,
    // `test_foo.py`, and friends.
    const TEST_FILE_SUFFIXES: &[&str] = &[
        "_test.ts",
        "_test.tsx",
        "_test.rs",
        "_test.go",
        "_test.py",
        ".test.ts",
        ".test.tsx",
        ".test.js",
        ".spec.ts",
        ".spec.tsx",
    ];
    let file_name = path
        .file_name()
        .unwrap_or_default()
        .to_string_lossy()
        .to_ascii_lowercase();
    TEST_FILE_SUFFIXES.iter().any(|suf| file_name.ends_with(suf))
        || file_name.starts_with("test_")
}
132
/// Returns `true` for import specifiers that should not become graph edges:
/// single characters, URLs, `npm:`/`node:` schemes, asset files, and
/// version-like strings such as `"1.2.3"`.
fn is_noise_import(name: &str) -> bool {
    let trimmed = name.trim();

    // Empty or one-character names carry no useful module identity.
    if trimmed.len() <= 1 {
        return true;
    }

    // URL imports and scheme-prefixed specifiers.
    const NOISE_PREFIXES: &[&str] = &["http://", "https://", "npm:", "node:"];
    if NOISE_PREFIXES.iter().any(|pre| trimmed.starts_with(pre)) {
        return true;
    }

    // Non-code asset imports (styles, data, images, docs).
    const ASSET_SUFFIXES: &[&str] = &[
        ".css", ".scss", ".json", ".svg", ".png", ".jpg", ".wasm", ".html", ".md", ".txt",
    ];
    let lower = trimmed.to_ascii_lowercase();
    if ASSET_SUFFIXES.iter().any(|suf| lower.ends_with(suf)) {
        return true;
    }

    // Version-ish strings: a leading digit plus a dot (e.g. "0.1.0").
    let leads_with_digit = trimmed
        .chars()
        .next()
        .is_some_and(|c| c.is_ascii_digit());
    if leads_with_digit && trimmed.contains('.') {
        return true;
    }

    // Strings made up entirely of digits and dots.
    trimmed.chars().all(|c| c.is_ascii_digit() || c == '.')
}
188
/// Normalizes an import specifier to a bare package/module name:
/// `npm:chalk@5` → `chalk`, `npm:@types/node` → `@types/node`,
/// `node:fs` → `fs`. Anything else is returned trimmed but unchanged.
fn normalize_import(name: &str) -> String {
    let trimmed = name.trim();

    // npm specifiers: drop the "npm:" scheme and any trailing "@version".
    if let Some(spec) = trimmed.strip_prefix("npm:") {
        let bare = if spec.starts_with('@') {
            // Scoped package: the leading '@' belongs to the name, so only an
            // '@' appearing after it separates the version.
            match spec[1..].find('@') {
                Some(pos) => &spec[..pos + 1],
                None => spec,
            }
        } else {
            spec.split('@').next().unwrap_or(spec)
        };
        return bare.to_string();
    }

    // Node builtins: drop the "node:" scheme.
    if let Some(builtin) = trimmed.strip_prefix("node:") {
        return builtin.to_string();
    }

    trimmed.to_string()
}
220
221fn collect_edges(
233 source_pkg: &str,
234 imports: &[String],
235 file_path_str: &str,
236 all_nodes: &mut HashSet<String>,
237 all_edges: &mut Vec<DependencyEdge>,
238) {
239 let source_dir = Path::new(file_path_str)
240 .parent()
241 .unwrap_or_else(|| Path::new(""));
242
243 for imp in imports {
244 if is_noise_import(imp) {
245 continue;
246 }
247 let imp = normalize_import(imp);
248 if imp.is_empty() {
249 continue;
250 }
251
252 let target = if imp.starts_with("./") || imp.starts_with("../") {
254 let resolved = source_dir.join(&imp);
256 let resolved_str = resolved.to_string_lossy().replace('\\', "/");
258 let mut parts: Vec<&str> = Vec::new();
259 for part in resolved_str.split('/') {
260 match part {
261 ".." if !parts.is_empty() => {
262 parts.pop();
263 }
264 "." | "" => {}
265 _ => parts.push(part),
266 }
267 }
268 if parts.is_empty() {
269 continue;
270 }
271 let joined = parts.join("/");
273 parser::extract_package_name(Path::new(&joined))
274 } else if imp.contains('/') {
275 parser::extract_package_name(Path::new(&imp))
277 } else {
278 imp
280 };
281
282 if target.is_empty() {
283 continue;
284 }
285
286 if target == source_pkg {
288 continue;
289 }
290
291 all_nodes.insert(target.clone());
292 all_edges.push(DependencyEdge {
293 from_module: source_pkg.to_string(),
294 to_module: target,
295 file_path: file_path_str.to_string(),
296 line: 0,
297 weight: 1,
298 });
299 }
300}
301
/// Summary of one `run_scan` invocation.
pub struct ScanResult {
    /// Number of commits walked in this scan (0 when nothing new was found).
    pub commits_scanned: usize,
    /// Number of graph snapshots persisted (commits with duplicate trees or
    /// empty graphs are skipped, so this can be less than `commits_scanned`).
    pub graphs_created: usize,
    /// Number of drift scores computed; incremented once per stored snapshot.
    pub drifts_calculated: usize,
}
309
/// Walks repository history, builds a dependency-graph snapshot per unique
/// commit tree, scores drift against the previous snapshot, and persists
/// everything to `db` inside one transaction.
///
/// The scan is incremental: when the database already holds a latest scanned
/// commit, only commits after it are processed and the previous snapshot is
/// reloaded so drift continuity is preserved; otherwise all snapshots are
/// cleared and history is rebuilt from scratch (up to `max_commits`).
///
/// # Errors
/// Fails if the repository cannot be opened, or on database/commit-listing
/// errors. Per-commit decode/tree/parse failures are logged and skipped.
pub fn run_scan(path: &Path, db: &Database, max_commits: usize) -> Result<ScanResult> {
    let repo = gix::discover(path)
        .with_context(|| format!("Failed to open repo for graph building: {}", path.display()))?;

    let last_commit = db.get_latest_scanned_commit()?;
    let existing_count = db.graph_snapshot_count()?;

    // Choose incremental vs. full scan based on whether we have scanned before.
    let mut commits = if let Some(ref last_hash) = last_commit {
        let new_commits = git_scanner::get_commits_since(&repo, last_hash, max_commits)?;
        if new_commits.is_empty() {
            // `[..7]` shows a short hash; assumes a full-length hex hash.
            info!("No new commits since last scan ({})", &last_hash[..7]);
            return Ok(ScanResult {
                commits_scanned: 0,
                graphs_created: 0,
                drifts_calculated: 0,
            });
        }
        info!(
            "Incremental scan: {} existing snapshots, {} new commits since {}",
            existing_count,
            new_commits.len(),
            &last_hash[..7]
        );
        new_commits
    } else {
        // Fresh scan: drop any stale snapshots before rebuilding.
        db.clear_all_graph_snapshots()?;
        info!("Building dependency graphs...");
        git_scanner::get_commits_in_order(&repo, max_commits)?
    };

    // Reverse so iteration runs oldest-first; drift needs each commit's
    // predecessor graph (`prev_graph`) to already be built.
    commits.reverse();

    // Seed `prev_graph` from the stored snapshot of the last scanned commit so
    // the first new drift score is computed against real history.
    let mut prev_graph: Option<DiGraph<String, ()>> = None;
    if let Some(ref last_hash) = last_commit {
        if let Some(snapshot) = db.get_graph_snapshot(last_hash)? {
            let nodes: HashSet<String> = snapshot.nodes.into_iter().collect();
            prev_graph = Some(graph_builder::build_graph(&nodes, &snapshot.edges));
            debug!(
                "Loaded previous graph for drift continuity ({} nodes)",
                nodes.len()
            );
        }
    }

    // All inserts below happen in a single transaction, committed at the end.
    db.begin_transaction()?;

    let mut graphs_created: usize = 0;
    let mut drifts_calculated: usize = 0;

    // Commits sharing an identical tree produce identical graphs; dedupe them.
    let mut seen_trees: HashSet<String> = HashSet::new();

    // Caches shared across commits: subtree walks, and parsed import lists
    // keyed by blob OID (20 raw bytes) so unchanged files are never re-parsed.
    let mut subtree_cache = git_scanner::SubtreeCache::new();
    let mut blob_import_cache: LruCache<[u8; 20], Vec<String>> =
        LruCache::new(NonZeroUsize::new(50_000).unwrap());
    let total_commits = commits.len();
    let scan_start = std::time::Instant::now();

    for (ci, commit) in commits.iter().enumerate() {
        let commit_hash = commit.id().to_string();

        let decoded = match commit.decode() {
            Ok(d) => d,
            Err(e) => {
                debug!(hash = %commit_hash, error = %e, "Failed to decode commit, skipping");
                continue;
            }
        };

        // Author metadata is best-effort; fall back to "unknown" placeholders.
        let (author_name, author_email, timestamp) = match decoded.author() {
            Ok(sig) => (sig.name.to_string(), sig.email.to_string(), sig.seconds()),
            Err(_) => ("unknown".to_string(), "unknown".to_string(), 0),
        };
        let commit_info = CommitInfo {
            hash: commit_hash.clone(),
            author_name,
            author_email,
            message: decoded.message.to_string(),
            timestamp,
            tree_id: decoded.tree().to_string(),
        };

        // Commit metadata is stored even when graph building is later skipped.
        db.insert_commit(&commit_info)?;

        let tree_oid = match git_scanner::get_tree_for_commit(&repo, &commit_hash) {
            Ok(oid) => oid,
            Err(e) => {
                debug!(hash = %commit_hash, error = %e, "Failed to get commit tree, skipping");
                continue;
            }
        };

        // Same tree => same graph; skip the duplicate work (and snapshot).
        let tree_hex = tree_oid.to_string();
        if !seen_trees.insert(tree_hex) {
            debug!(hash = %commit_hash, "Same tree already processed, skipping");
            continue;
        }

        let tree = match repo.find_tree(tree_oid) {
            Ok(t) => t,
            Err(e) => {
                debug!(hash = %commit_hash, error = %e, "Tree not found");
                continue;
            }
        };

        let entries = match git_scanner::walk_tree_entries_cached(&repo, &tree, &mut subtree_cache)
        {
            Ok(e) => e,
            Err(e) => {
                debug!(hash = %commit_hash, error = %e, "Tree walk failed");
                continue;
            }
        };

        if entries.is_empty() {
            continue;
        }

        let mut all_nodes: HashSet<String> = HashSet::with_capacity(entries.len() / 4);
        let mut all_edges: Vec<DependencyEdge> = Vec::with_capacity(entries.len());
        let mut cache_hits: usize = 0;

        // Work item for the parallel parse phase: a blob that was not in the
        // import cache and needs its imports extracted.
        struct ParseJob {
            source_pkg: String,
            oid_key: [u8; 20],
            content: String,
            file_path: PathBuf,
        }

        // Split entries into cache hits (imports already known) and parse jobs.
        let mut cached_imports: Vec<(String, Vec<String>, String)> = Vec::new();
        let mut parse_jobs: Vec<ParseJob> = Vec::new();

        for (file_path, blob_oid) in &entries {
            if is_test_path(file_path.as_path()) {
                continue;
            }

            let source_pkg = parser::extract_package_name(file_path.as_path());
            all_nodes.insert(source_pkg.clone());

            // Raw 20-byte OID as the cache key; a length mismatch (unexpected)
            // degrades to the all-zero key rather than panicking.
            let oid_key: [u8; 20] = blob_oid.as_bytes().try_into().unwrap_or([0u8; 20]);

            if let Some(cached) = blob_import_cache.get(&oid_key) {
                cache_hits += 1;
                if !cached.is_empty() {
                    let file_path_str = file_path.to_string_lossy().replace('\\', "/");
                    cached_imports.push((source_pkg, cached.clone(), file_path_str));
                }
            } else {
                // Unreadable, non-UTF-8, or unsupported-language blobs are
                // negatively cached (empty import list) so they are only
                // inspected once.
                let blob = match repo.find_object(*blob_oid) {
                    Ok(b) => b,
                    Err(_) => {
                        blob_import_cache.put(oid_key, Vec::new());
                        continue;
                    }
                };
                let content = match std::str::from_utf8(&blob.data) {
                    Ok(s) => s.to_string(),
                    Err(_) => {
                        blob_import_cache.put(oid_key, Vec::new());
                        continue;
                    }
                };
                let file_path_str = file_path.to_string_lossy();
                if parser::detect_language(file_path_str.as_ref()).is_none() {
                    blob_import_cache.put(oid_key, Vec::new());
                    continue;
                }
                parse_jobs.push(ParseJob {
                    source_pkg,
                    oid_key,
                    content,
                    file_path: file_path.clone(),
                });
            }
        }

        // Packages defined by files in this tree ("internal" nodes); captured
        // before import targets get added to `all_nodes`.
        let source_pkgs: HashSet<String> = all_nodes.clone();

        // Parse imports in parallel (rayon); language detection is repeated
        // here because the closure only sees the job data.
        let parsed_results: Vec<(String, [u8; 20], Vec<String>, String)> = parse_jobs
            .into_par_iter()
            .filter_map(|job| {
                let path_str = job.file_path.to_string_lossy();
                let lang = parser::detect_language(path_str.as_ref())?;
                let imports = parser::parse_imports(&job.content, lang, job.file_path.as_path());
                let file_path_str = path_str.replace('\\', "/");
                Some((job.source_pkg, job.oid_key, imports, file_path_str))
            })
            .collect();

        // Fold freshly parsed imports into the cache and the edge set.
        for (source_pkg, oid_key, imports, file_path_str) in parsed_results {
            blob_import_cache.put(oid_key, imports.clone());
            if !imports.is_empty() {
                collect_edges(
                    &source_pkg,
                    &imports,
                    &file_path_str,
                    &mut all_nodes,
                    &mut all_edges,
                );
            }
        }

        // Fold cached imports into the edge set.
        for (source_pkg, imports, file_path_str) in cached_imports {
            collect_edges(
                &source_pkg,
                &imports,
                &file_path_str,
                &mut all_nodes,
                &mut all_edges,
            );
        }

        if all_nodes.is_empty() {
            continue;
        }

        // Collapse duplicate (from, to) edges into one edge with a summed
        // weight.
        let mut edge_weight_map: HashMap<(String, String), DependencyEdge> =
            HashMap::with_capacity(all_edges.len() / 2);
        for edge in all_edges {
            let key = (edge.from_module.clone(), edge.to_module.clone());
            edge_weight_map
                .entry(key)
                .and_modify(|existing| existing.weight += 1)
                .or_insert(edge);
        }
        let all_edges: Vec<DependencyEdge> = edge_weight_map.into_values().collect();

        // External (non-repo) modules are kept only when imported by at least
        // this many distinct internal packages — prunes one-off dependencies.
        const MIN_EXT_IMPORTERS: usize = 3;

        // Count distinct importers per external module.
        let mut ext_importer_count: HashMap<String, HashSet<String>> = HashMap::new();
        for edge in &all_edges {
            if !source_pkgs.contains(&edge.to_module) {
                ext_importer_count
                    .entry(edge.to_module.clone())
                    .or_default()
                    .insert(edge.from_module.clone());
            }
        }

        // Keep all internal packages plus sufficiently-popular externals.
        let kept_nodes: HashSet<String> = all_nodes
            .iter()
            .filter(|n| {
                source_pkgs.contains(*n)
                    || ext_importer_count
                        .get(*n)
                        .is_some_and(|importers| importers.len() >= MIN_EXT_IMPORTERS)
            })
            .cloned()
            .collect();

        // Drop edges touching pruned nodes.
        let filtered_edges: Vec<DependencyEdge> = all_edges
            .into_iter()
            .filter(|e| kept_nodes.contains(&e.from_module) && kept_nodes.contains(&e.to_module))
            .collect();

        let graph = graph_builder::build_graph(&kept_nodes, &filtered_edges);

        // Score drift relative to the previous commit's graph (None on the
        // very first snapshot of a fresh scan).
        let nodes_vec: Vec<String> = kept_nodes.iter().cloned().collect();
        let edges_pairs = scoring::edges_to_pairs(&filtered_edges);
        let drift = scoring::calculate_drift(
            &graph,
            prev_graph.as_ref(),
            &nodes_vec,
            &edges_pairs,
            commit_info.timestamp,
        );
        drifts_calculated += 1;

        let snapshot = GraphSnapshot {
            commit_hash: commit_hash.clone(),
            nodes: nodes_vec,
            edges: filtered_edges,
            node_count: graph.node_count(),
            edge_count: graph.edge_count(),
            timestamp: commit_info.timestamp,
            drift: Some(drift),
        };

        db.insert_graph_snapshot(&snapshot)?;
        graphs_created += 1;

        // This commit's graph becomes the baseline for the next drift score.
        prev_graph = Some(graph);

        // Progress log every 25 commits and on the final one. Note
        // `cache_hits` is per-commit (reset each iteration), not cumulative.
        if (ci + 1) % 25 == 0 || ci + 1 == total_commits {
            let elapsed = scan_start.elapsed().as_secs_f64();
            let pct = ((ci + 1) as f64 / total_commits as f64 * 100.0) as u32;
            info!(
                "[{}/{}] {}% — {} graphs, {} cached blobs ({} hits), {:.1}s",
                ci + 1,
                total_commits,
                pct,
                graphs_created,
                blob_import_cache.len(),
                cache_hits,
                elapsed,
            );
        }
    }

    db.commit_transaction()?;

    info!(
        total = graphs_created,
        drifts = drifts_calculated,
        "Dependency graph + drift creation complete"
    );

    Ok(ScanResult {
        commits_scanned: commits.len(),
        graphs_created,
        drifts_calculated,
    })
}
692
#[cfg(test)]
/// Test helper: converts a file path to a `::`-separated module path,
/// dropping the file extension (`"src/main.rs"` → `"src::main"`).
///
/// The extension is stripped only when the final dot lies inside the last
/// path segment; the previous `rsplit_once('.')` cut at the last dot anywhere,
/// so a dot in a directory name (e.g. `"pkg/v1.2"` with an extensionless
/// file) truncated the whole module path.
fn path_to_module(path: &str) -> String {
    let path = path.replace('\\', "/");
    let without_ext = match path.rfind('.') {
        // Dot belongs to the file name only if no '/' follows it.
        Some(pos) if !path[pos..].contains('/') => &path[..pos],
        _ => path.as_str(),
    };
    without_ext.replace('/', "::")
}
713
714#[cfg(test)]
718mod tests {
719 use super::*;
720
721 #[test]
722 fn test_path_to_module() {
723 assert_eq!(path_to_module("src/main.rs"), "src::main");
724 assert_eq!(
725 path_to_module("packages/ui/index.ts"),
726 "packages::ui::index"
727 );
728 assert_eq!(path_to_module("cmd/server/main.go"), "cmd::server::main");
729 assert_eq!(path_to_module("lib.rs"), "lib");
730 assert_eq!(path_to_module("src\\win\\main.rs"), "src::win::main");
731 }
732
733 #[test]
734 fn test_is_test_path() {
735 assert!(is_test_path(Path::new("cli/tests/testdata/001_hello.ts")));
737 assert!(is_test_path(Path::new("src/__tests__/app.test.tsx")));
738 assert!(is_test_path(Path::new("tests/integration/run.rs")));
739 assert!(is_test_path(Path::new("examples/hello/main.rs")));
740 assert!(is_test_path(Path::new("benchmarks/perf.go")));
741 assert!(is_test_path(Path::new("src/utils_test.go")));
742 assert!(is_test_path(Path::new("lib/parser.test.ts")));
743 assert!(is_test_path(Path::new("test_helper.py")));
744 assert!(is_test_path(Path::new("fixtures/data.ts")));
745
746 assert!(!is_test_path(Path::new("src/main.rs")));
748 assert!(!is_test_path(Path::new("cli/tools/run.ts")));
749 assert!(!is_test_path(Path::new("packages/core/index.ts")));
750 assert!(!is_test_path(Path::new("runtime/ops/fs.rs")));
751 }
752
753 #[test]
754 fn test_is_noise_import() {
755 assert!(is_noise_import("https://deno.land/std/testing/asserts.ts"));
757 assert!(is_noise_import("http://example.com/mod.ts"));
758 assert!(is_noise_import("npm:chalk@5"));
759 assert!(is_noise_import("node:fs"));
760 assert!(is_noise_import("./styles.css"));
761 assert!(is_noise_import("../data.json"));
762 assert!(is_noise_import("logo.svg"));
763 assert!(is_noise_import("0.1.0"));
764 assert!(is_noise_import("1.2.3"));
765 assert!(is_noise_import("x")); assert!(!is_noise_import("react"));
769 assert!(!is_noise_import("serde"));
770 assert!(!is_noise_import("std"));
771 assert!(!is_noise_import("@scope/package"));
772 assert!(!is_noise_import("tokio"));
773 }
774
775 #[test]
776 fn test_normalize_import() {
777 assert_eq!(normalize_import("npm:chalk@5"), "chalk");
778 assert_eq!(normalize_import("npm:@types/node"), "@types/node");
779 assert_eq!(normalize_import("node:fs"), "fs");
780 assert_eq!(normalize_import("node:path"), "path");
781 assert_eq!(normalize_import("react"), "react");
782 }
783}