1use std::collections::{HashMap, HashSet};
2use std::path::Path;
3
4use anyhow::{Context, Result};
5
6use crate::extract;
7use crate::lang::LanguageRegistry;
8
9use super::{ChangeKind, FlatSym, SymbolChange, SymbolDiff};
10
11pub fn semantic_diff(
16 project_root: &Path,
17 old_ref: &str,
18 new_ref: &str,
19 registry: &LanguageRegistry,
20) -> Result<SymbolDiff> {
21 let changed = compute_changed_files(project_root, old_ref, new_ref);
22
23 let (old_filter, new_filter) = match &changed {
24 Some(cf) => (Some(&cf.old_ref_files), Some(&cf.new_ref_files)),
25 None => (None, None),
26 };
27
28 let old_symbols = extract_ref_symbols(project_root, old_ref, registry, old_filter)
29 .with_context(|| format!("failed to extract symbols for ref '{}'", old_ref))?;
30 let new_symbols = extract_ref_symbols(project_root, new_ref, registry, new_filter)
31 .with_context(|| format!("failed to extract symbols for ref '{}'", new_ref))?;
32
33 Ok(diff_symbol_maps(old_ref, new_ref, old_symbols, new_symbols))
34}
35
/// Paths reported as changed between two refs, split by the side on which the
/// file needs to be inspected.
struct ChangedFiles {
    /// Paths to inspect in the old ref (deleted files and files changed in place).
    old_ref_files: HashSet<String>,
    /// Paths to inspect in the new ref (added files and files changed in place).
    new_ref_files: HashSet<String>,
}
40
41fn compute_changed_files(
42 project_root: &Path,
43 old_ref: &str,
44 new_ref: &str,
45) -> Option<ChangedFiles> {
46 let output = std::process::Command::new("git")
47 .args(["diff", "--name-status", "--no-renames", old_ref, new_ref])
48 .current_dir(project_root)
49 .output()
50 .ok()?;
51
52 if !output.status.success() {
53 eprintln!(
54 "infigraph: git diff --name-status failed for {}..{}, falling back to full extraction",
55 old_ref, new_ref
56 );
57 return None;
58 }
59
60 let text = String::from_utf8_lossy(&output.stdout);
61 let mut old_ref_files = HashSet::new();
62 let mut new_ref_files = HashSet::new();
63
64 for line in text.lines() {
65 let line = line.trim();
66 if line.is_empty() {
67 continue;
68 }
69 let mut parts = line.splitn(2, '\t');
70 let status = parts.next().unwrap_or("").trim();
71 let path = match parts.next() {
72 Some(p) => p.trim().to_string(),
73 None => continue,
74 };
75
76 match status {
77 "A" => {
78 new_ref_files.insert(path);
79 }
80 "D" => {
81 old_ref_files.insert(path);
82 }
83 _ => {
84 old_ref_files.insert(path.clone());
85 new_ref_files.insert(path);
86 }
87 }
88 }
89
90 Some(ChangedFiles {
91 old_ref_files,
92 new_ref_files,
93 })
94}
95
/// Largest number of paths that will be passed to a filtered `git archive`
/// invocation; when the filter holds more entries, the whole ref is archived
/// instead.
const MAX_ARCHIVE_ARGS: usize = 500;
103
104fn extract_ref_symbols(
105 project_root: &Path,
106 git_ref: &str,
107 registry: &LanguageRegistry,
108 file_filter: Option<&HashSet<String>>,
109) -> Result<HashMap<String, FlatSym>> {
110 if let Some(filter) = file_filter {
111 if filter.is_empty() {
112 return Ok(HashMap::new());
113 }
114 }
115
116 let is_working_tree = git_ref == "HEAD" || git_ref == "WORKING";
117
118 if is_working_tree {
119 return extract_dir_symbols(project_root, project_root, registry, file_filter);
120 }
121
122 let tmp = tempfile::tempdir().context("failed to create temp dir")?;
123
124 let use_filtered_archive = file_filter
125 .map(|f| f.len() <= MAX_ARCHIVE_ARGS)
126 .unwrap_or(false);
127
128 let archive_output = if use_filtered_archive {
129 let filter = file_filter.unwrap();
130 let mut args: Vec<&str> = vec!["archive", "--format=tar", git_ref, "--"];
131 args.extend(filter.iter().map(|s| s.as_str()));
132 std::process::Command::new("git")
133 .args(&args)
134 .current_dir(project_root)
135 .output()
136 .context("git archive (filtered) failed")?
137 } else {
138 std::process::Command::new("git")
139 .args(["archive", "--format=tar", git_ref])
140 .current_dir(project_root)
141 .output()
142 .context("git archive failed")?
143 };
144
145 if !archive_output.status.success() {
146 let err = String::from_utf8_lossy(&archive_output.stderr);
147 if use_filtered_archive {
148 eprintln!(
149 "infigraph: filtered git archive for {} failed, falling back to full archive: {}",
150 git_ref,
151 err.trim()
152 );
153 let full_output = std::process::Command::new("git")
154 .args(["archive", "--format=tar", git_ref])
155 .current_dir(project_root)
156 .output()
157 .context("git archive (full fallback) failed")?;
158 if !full_output.status.success() {
159 let err2 = String::from_utf8_lossy(&full_output.stderr);
160 anyhow::bail!("git archive {} failed: {}", git_ref, err2.trim());
161 }
162 return untar_and_extract(tmp.path(), &full_output.stdout, registry, file_filter);
163 }
164 anyhow::bail!("git archive {} failed: {}", git_ref, err.trim());
165 }
166
167 untar_and_extract(tmp.path(), &archive_output.stdout, registry, file_filter)
168}
169
170fn untar_and_extract(
171 tmp_dir: &Path,
172 tar_data: &[u8],
173 registry: &LanguageRegistry,
174 file_filter: Option<&HashSet<String>>,
175) -> Result<HashMap<String, FlatSym>> {
176 let mut tar = std::process::Command::new("tar")
177 .args(["-x", "-C", tmp_dir.to_str().unwrap_or(".")])
178 .stdin(std::process::Stdio::piped())
179 .spawn()
180 .context("failed to spawn tar")?;
181
182 if let Some(stdin) = tar.stdin.take() {
183 use std::io::Write;
184 let mut w = stdin;
185 w.write_all(tar_data)?;
186 }
187 tar.wait().context("tar wait failed")?;
188
189 extract_dir_symbols(tmp_dir, tmp_dir, registry, file_filter)
190}
191
192fn extract_dir_symbols(
193 root: &Path,
194 dir: &Path,
195 registry: &LanguageRegistry,
196 file_filter: Option<&HashSet<String>>,
197) -> Result<HashMap<String, FlatSym>> {
198 let mut map = HashMap::new();
199 collect_symbols(root, dir, registry, file_filter, &mut map)?;
200 Ok(map)
201}
202
/// Directory names that are never descended into while collecting symbols.
static SKIP_DIRS: &[&str] = &[
    // Version control and dependency/virtual-environment directories.
    ".git",
    "node_modules",
    ".venv",
    "venv",
    // Build outputs and caches.
    "target",
    "build",
    "dist",
    "__pycache__",
    ".tox",
    // Tool-specific directory.
    ".infigraph",
];
215
216fn collect_symbols(
217 root: &Path,
218 dir: &Path,
219 registry: &LanguageRegistry,
220 file_filter: Option<&HashSet<String>>,
221 map: &mut HashMap<String, FlatSym>,
222) -> Result<()> {
223 for entry in std::fs::read_dir(dir)? {
224 let entry = entry?;
225 let path = entry.path();
226 let name = entry.file_name();
227 let name_str = name.to_string_lossy();
228
229 if path.is_dir() {
230 if !SKIP_DIRS.contains(&name_str.as_ref()) && !name_str.starts_with('.') {
231 collect_symbols(root, &path, registry, file_filter, map)?;
232 }
233 } else if path.is_file() {
234 let rel = path
235 .strip_prefix(root)
236 .unwrap_or(&path)
237 .to_string_lossy()
238 .replace('\\', "/");
239 if let Some(filter) = file_filter {
240 if !filter.contains(&rel) {
241 continue;
242 }
243 }
244 let Ok(source) = std::fs::read(&path) else {
245 continue;
246 };
247 let Some(pack) = registry.for_file_with_content(&rel, &source) else {
248 continue;
249 };
250 let Ok(extraction) = extract::extract_file(&rel, &source, pack) else {
251 continue;
252 };
253 let file = extraction.file.clone();
254 for sym in &extraction.symbols {
255 let kind_str = sym.kind.as_str().to_string();
256 let key = format!("{}::{}::{}", file, sym.name, kind_str);
258 map.insert(
259 key,
260 FlatSym {
261 file: file.clone(),
262 name: sym.name.clone(),
263 kind: kind_str,
264 sig_hash: sym.signature_hash.clone(),
265 params: sym.parameters.clone().unwrap_or_default(),
266 return_type: sym.return_type.clone().unwrap_or_default(),
267 },
268 );
269 }
270 }
271 }
272 Ok(())
273}
274
275pub(crate) fn sig_matches(a: &FlatSym, b: &FlatSym) -> bool {
280 a.params == b.params && a.return_type == b.return_type
281}
282
283pub(crate) fn diff_symbol_maps(
284 old_ref: &str,
285 new_ref: &str,
286 old: HashMap<String, FlatSym>,
287 new: HashMap<String, FlatSym>,
288) -> SymbolDiff {
289 let mut changes = Vec::new();
290
291 let old_by_name: HashMap<String, &FlatSym> = old
293 .values()
294 .map(|s| (format!("{}::{}", s.name, s.kind), s))
295 .collect();
296
297 for (key, new_sym) in &new {
299 if let Some(old_sym) = old.get(key) {
300 if old_sym.sig_hash == new_sym.sig_hash
302 || old_sym.sig_hash.is_empty()
303 || new_sym.sig_hash.is_empty()
304 {
305 continue;
306 }
307 if !sig_matches(old_sym, new_sym) {
308 changes.push(SymbolChange {
309 name: new_sym.name.clone(),
310 kind: new_sym.kind.clone(),
311 file: new_sym.file.clone(),
312 change: ChangeKind::SignatureChanged,
313 caller_count: 0,
314 });
315 } else {
316 changes.push(SymbolChange {
317 name: new_sym.name.clone(),
318 kind: new_sym.kind.clone(),
319 file: new_sym.file.clone(),
320 change: ChangeKind::BodyChanged,
321 caller_count: 0,
322 });
323 }
324 } else {
325 let name_key = format!("{}::{}", new_sym.name, new_sym.kind);
327 if let Some(old_sym) = old_by_name.get(&name_key) {
328 if old_sym.file != new_sym.file {
329 changes.push(SymbolChange {
330 name: new_sym.name.clone(),
331 kind: new_sym.kind.clone(),
332 file: new_sym.file.clone(),
333 change: ChangeKind::Moved {
334 from_file: old_sym.file.clone(),
335 },
336 caller_count: 0,
337 });
338 continue;
339 }
340 }
341 changes.push(SymbolChange {
343 name: new_sym.name.clone(),
344 kind: new_sym.kind.clone(),
345 file: new_sym.file.clone(),
346 change: ChangeKind::Added,
347 caller_count: 0,
348 });
349 }
350 }
351
352 let moved_names: std::collections::HashSet<String> = changes
354 .iter()
355 .filter_map(|c| {
356 if matches!(c.change, ChangeKind::Moved { .. }) {
357 Some(format!("{}::{}", c.name, c.kind))
358 } else {
359 None
360 }
361 })
362 .collect();
363
364 for (key, old_sym) in &old {
365 if !new.contains_key(key) {
366 let name_key = format!("{}::{}", old_sym.name, old_sym.kind);
367 if !moved_names.contains(&name_key) {
368 changes.push(SymbolChange {
369 name: old_sym.name.clone(),
370 kind: old_sym.kind.clone(),
371 file: old_sym.file.clone(),
372 change: ChangeKind::Removed,
373 caller_count: 0,
374 });
375 }
376 }
377 }
378
379 let added: Vec<usize> = changes
381 .iter()
382 .enumerate()
383 .filter(|(_, c)| c.change == ChangeKind::Added)
384 .map(|(i, _)| i)
385 .collect();
386 let removed: Vec<usize> = changes
387 .iter()
388 .enumerate()
389 .filter(|(_, c)| c.change == ChangeKind::Removed)
390 .map(|(i, _)| i)
391 .collect();
392
393 let mut rename_pairs: Vec<(usize, usize, String)> = Vec::new();
394 let mut used_removed: HashSet<usize> = HashSet::new();
395
396 for &ai in &added {
397 let a = &changes[ai];
398 for &ri in &removed {
399 if used_removed.contains(&ri) {
400 continue;
401 }
402 let r = &changes[ri];
403 if a.file != r.file || a.kind != r.kind {
404 continue;
405 }
406 let a_key = format!("{}::{}::{}", a.file, a.name, a.kind);
408 let r_key = format!("{}::{}::{}", r.file, r.name, r.kind);
409 if let (Some(a_sym), Some(r_sym)) = (new.get(&a_key), old.get(&r_key)) {
410 if a_sym.sig_hash == r_sym.sig_hash && !a_sym.sig_hash.is_empty() {
411 rename_pairs.push((ai, ri, r.name.clone()));
412 used_removed.insert(ri);
413 break;
414 }
415 }
416 }
417 }
418
419 let mut remove_indices: HashSet<usize> = HashSet::new();
420 for (ai, ri, old_name) in &rename_pairs {
421 changes[*ai].change = ChangeKind::Renamed {
422 old_name: old_name.clone(),
423 };
424 remove_indices.insert(*ri);
425 }
426
427 if !remove_indices.is_empty() {
428 let mut idx = 0;
429 changes.retain(|_| {
430 let keep = !remove_indices.contains(&idx);
431 idx += 1;
432 keep
433 });
434 }
435
436 changes.sort_by_key(|c| match &c.change {
438 ChangeKind::Removed => 0,
439 ChangeKind::SignatureChanged => 1,
440 ChangeKind::BodyChanged => 2,
441 ChangeKind::Moved { .. } => 3,
442 ChangeKind::Renamed { .. } => 4,
443 ChangeKind::Added => 5,
444 });
445
446 SymbolDiff {
447 old_ref: old_ref.to_string(),
448 new_ref: new_ref.to_string(),
449 changes,
450 }
451}
452
#[cfg(test)]
mod tests {
    use super::*;

    type SymMap = HashMap<String, FlatSym>;

    /// Inserts a symbol into `map` under its `file::name::kind` key.
    fn put(
        map: &mut SymMap,
        file: &str,
        name: &str,
        kind: &str,
        sig_hash: &str,
        params: &str,
        ret: &str,
    ) {
        map.insert(
            format!("{}::{}::{}", file, name, kind),
            FlatSym {
                file: file.to_string(),
                name: name.to_string(),
                kind: kind.to_string(),
                sig_hash: sig_hash.to_string(),
                params: params.to_string(),
                return_type: ret.to_string(),
            },
        );
    }

    /// Builds a map holding exactly one function symbol.
    fn one_fn(file: &str, name: &str, sig_hash: &str, params: &str, ret: &str) -> SymMap {
        let mut map = SymMap::new();
        put(&mut map, file, name, "Function", sig_hash, params, ret);
        map
    }

    /// Renders every change as `name: change` for assertion messages.
    fn describe(diff: &SymbolDiff) -> Vec<String> {
        diff.changes
            .iter()
            .map(|c| format!("{}: {}", c.name, c.change))
            .collect()
    }

    /// Returns all changes classified as a rename.
    fn renames(diff: &SymbolDiff) -> Vec<&SymbolChange> {
        diff.changes
            .iter()
            .filter(|c| matches!(&c.change, ChangeKind::Renamed { .. }))
            .collect()
    }

    /// True when at least one change has exactly the given kind.
    fn has_kind(diff: &SymbolDiff, kind: ChangeKind) -> bool {
        diff.changes.iter().any(|c| c.change == kind)
    }

    #[test]
    fn test_body_change_classified_as_body_changed() {
        let old = one_fn("app.py", "validate_email", "hash_v1", "(addr: str)", "bool");
        let new = one_fn("app.py", "validate_email", "hash_v2", "(addr: str)", "bool");

        let diff = diff_symbol_maps("old", "new", old, new);
        assert_eq!(diff.changes.len(), 1);
        assert_eq!(diff.changes[0].change, ChangeKind::BodyChanged);
        assert_eq!(diff.changes[0].name, "validate_email");
    }

    #[test]
    fn test_signature_change_params_differ() {
        let old = one_fn("app.py", "process", "hash_v1", "(x: int)", "None");
        let new = one_fn("app.py", "process", "hash_v2", "(x: int, y: int)", "None");

        let diff = diff_symbol_maps("old", "new", old, new);
        assert_eq!(diff.changes.len(), 1);
        assert_eq!(diff.changes[0].change, ChangeKind::SignatureChanged);
    }

    #[test]
    fn test_signature_change_return_type_differs() {
        let old = one_fn("app.py", "get_value", "hash_v1", "()", "int");
        let new = one_fn("app.py", "get_value", "hash_v2", "()", "str");

        let diff = diff_symbol_maps("old", "new", old, new);
        assert_eq!(diff.changes.len(), 1);
        assert_eq!(diff.changes[0].change, ChangeKind::SignatureChanged);
    }

    #[test]
    fn test_rename_same_file_identical_body() {
        let old = one_fn(
            "calculator.py",
            "calculate_order_total",
            "body_hash_abc",
            "(items: list[Item])",
            "Decimal",
        );
        let new = one_fn(
            "calculator.py",
            "compute_order_sum",
            "body_hash_abc",
            "(items: list[Item])",
            "Decimal",
        );

        let diff = diff_symbol_maps("old", "new", old, new);
        let renamed = renames(&diff);
        assert_eq!(
            renamed.len(),
            1,
            "Expected 1 rename, got: {:?}",
            describe(&diff)
        );
        assert_eq!(renamed[0].name, "compute_order_sum");
        if let ChangeKind::Renamed { old_name } = &renamed[0].change {
            assert_eq!(old_name, "calculate_order_total");
        }
        let removed_count = diff
            .changes
            .iter()
            .filter(|c| c.change == ChangeKind::Removed)
            .count();
        assert_eq!(removed_count, 0, "Old name should not appear as Removed");
    }

    #[test]
    fn test_rename_not_detected_different_body() {
        let old = one_fn("app.py", "old_func", "hash_A", "()", "");
        let new = one_fn("app.py", "new_func", "hash_B", "()", "");

        let diff = diff_symbol_maps("old", "new", old, new);
        assert_eq!(renames(&diff).len(), 0);
        assert!(has_kind(&diff, ChangeKind::Added));
        assert!(has_kind(&diff, ChangeKind::Removed));
    }

    #[test]
    fn test_move_across_files() {
        let old = one_fn("old_file.py", "helper", "hash_1", "()", "");
        let new = one_fn("new_file.py", "helper", "hash_1", "()", "");

        let diff = diff_symbol_maps("old", "new", old, new);
        let moved: Vec<_> = diff
            .changes
            .iter()
            .filter(|c| matches!(&c.change, ChangeKind::Moved { .. }))
            .collect();
        assert_eq!(moved.len(), 1);
        if let ChangeKind::Moved { from_file } = &moved[0].change {
            assert_eq!(from_file, "old_file.py");
        }
    }

    #[test]
    fn test_added_and_removed() {
        let old = one_fn("app.py", "removed_fn", "hash_r", "()", "");
        let new = one_fn("app.py", "added_fn", "hash_a", "()", "");

        let diff = diff_symbol_maps("old", "new", old, new);
        assert!(diff
            .changes
            .iter()
            .any(|c| c.change == ChangeKind::Added && c.name == "added_fn"));
        assert!(diff
            .changes
            .iter()
            .any(|c| c.change == ChangeKind::Removed && c.name == "removed_fn"));
    }

    #[test]
    fn test_no_change_same_hash() {
        let old = one_fn("app.py", "stable_fn", "same_hash", "()", "int");
        let new = one_fn("app.py", "stable_fn", "same_hash", "()", "int");

        let diff = diff_symbol_maps("old", "new", old, new);
        assert_eq!(diff.changes.len(), 0);
    }

    #[test]
    fn test_modified_helper_returns_all_change_types() {
        let mut old = SymMap::new();
        let mut new = SymMap::new();

        // Body changed: same signature, different hash.
        put(&mut old, "a.py", "fn_body", "Function", "h1", "()", "");
        put(&mut new, "a.py", "fn_body", "Function", "h2", "()", "");

        // Signature changed: parameters differ.
        put(&mut old, "a.py", "fn_sig", "Function", "h3", "(x: int)", "");
        put(&mut new, "a.py", "fn_sig", "Function", "h4", "(x: str)", "");

        // Moved: same name, different file.
        put(&mut old, "old.py", "fn_moved", "Function", "h5", "()", "");
        put(&mut new, "new.py", "fn_moved", "Function", "h5", "()", "");

        // Renamed: same file and hash, different name.
        put(&mut old, "a.py", "old_name", "Function", "h6", "()", "");
        put(&mut new, "a.py", "new_name", "Function", "h6", "()", "");

        let diff = diff_symbol_maps("old", "new", old, new);
        let modified: Vec<_> = diff.modified().collect();
        assert_eq!(
            modified.len(),
            4,
            "modified() should include all 4 change types, got: {:?}",
            modified
                .iter()
                .map(|c| format!("{}: {}", c.name, c.change))
                .collect::<Vec<_>>()
        );
    }

    #[test]
    fn test_combined_rename_and_body_change() {
        let old = one_fn("app.py", "calc_total", "hash_old", "(items: list)", "float");
        let new = one_fn("app.py", "compute_sum", "hash_new", "(items: list)", "float");

        let diff = diff_symbol_maps("old", "new", old, new);
        assert_eq!(
            renames(&diff).len(),
            0,
            "Should NOT detect rename when body hash differs"
        );
        assert!(has_kind(&diff, ChangeKind::Added));
        assert!(has_kind(&diff, ChangeKind::Removed));
    }
}