Skip to main content

infigraph_core/diff/
mod.rs

1//! Semantic diff between two git refs at the symbol level.
2//!
//! Instead of a line diff, this compares the extracted symbol graphs of two
//! git tree-states and classifies each change as Added / Removed / Modified /
//! SignatureChanged / Moved.  The caller supplies a project root and two git
//! refs (e.g. "HEAD~1", "main"); the module exports each ref into a temp
//! directory with `git archive`, indexes it with the current language
//! registry, and returns a structured `SymbolDiff`.
9
10use std::collections::{HashMap, HashSet};
11use std::path::Path;
12
13use anyhow::{Context, Result};
14use serde::{Deserialize, Serialize};
15
16use crate::extract;
17use crate::lang::LanguageRegistry;
18
/// How a symbol changed between two refs.
///
/// The `Display` impl renders each variant as an upper-case tag
/// (`ADDED`, `REMOVED`, `SIGNATURE_CHANGED`, `MODIFIED`, `MOVED(from:<file>)`).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum ChangeKind {
    /// Symbol exists in new ref but not in old ref.
    Added,
    /// Symbol exists in old ref but not in new ref.
    Removed,
    /// Symbol exists in both; signature_hash changed (parameter / return type change).
    SignatureChanged,
    /// Symbol exists in both; body changed but signature is the same.
    ///
    /// NOTE(review): `diff_symbol_maps` in this file only compares signature
    /// hashes and never emits this variant — confirm whether another producer
    /// exists.
    Modified,
    /// Symbol moved to a different file; `from_file` is the file that held
    /// it in the old ref.
    Moved { from_file: String },
}
33
34impl std::fmt::Display for ChangeKind {
35    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
36        match self {
37            ChangeKind::Added => write!(f, "ADDED"),
38            ChangeKind::Removed => write!(f, "REMOVED"),
39            ChangeKind::SignatureChanged => write!(f, "SIGNATURE_CHANGED"),
40            ChangeKind::Modified => write!(f, "MODIFIED"),
41            ChangeKind::Moved { from_file } => write!(f, "MOVED(from:{})", from_file),
42        }
43    }
44}
45
/// A single symbol-level change.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SymbolChange {
    /// Symbol name as reported by the extractor.
    pub name: String,
    /// Symbol kind rendered as a string (taken from the extractor's `kind.as_str()`).
    pub kind: String,
    /// File the change is reported under: the new-ref file for added, moved and
    /// signature-changed symbols, the old-ref file for removed symbols.
    pub file: String,
    /// How the symbol changed between the two refs.
    pub change: ChangeKind,
    /// Callers in the current graph (populated by caller when graph is available).
    /// The diff itself always initializes this to 0.
    pub caller_count: usize,
}
56
/// Full semantic diff result.
#[derive(Debug, Default)]
pub struct SymbolDiff {
    /// The "before" git ref, exactly as supplied by the caller.
    pub old_ref: String,
    /// The "after" git ref, exactly as supplied by the caller.
    pub new_ref: String,
    /// All symbol-level changes found between the two refs.
    pub changes: Vec<SymbolChange>,
}
64
65impl SymbolDiff {
66    pub fn added(&self) -> impl Iterator<Item = &SymbolChange> {
67        self.changes
68            .iter()
69            .filter(|c| c.change == ChangeKind::Added)
70    }
71    pub fn removed(&self) -> impl Iterator<Item = &SymbolChange> {
72        self.changes
73            .iter()
74            .filter(|c| c.change == ChangeKind::Removed)
75    }
76    pub fn modified(&self) -> impl Iterator<Item = &SymbolChange> {
77        self.changes.iter().filter(|c| {
78            matches!(
79                c.change,
80                ChangeKind::Modified | ChangeKind::SignatureChanged | ChangeKind::Moved { .. }
81            )
82        })
83    }
84}
85
86// ---------------------------------------------------------------------------
87// Public API
88// ---------------------------------------------------------------------------
89
/// A flat symbol record used during diff (file + name + kind + sig_hash).
#[derive(Clone)]
struct FlatSym {
    /// File path the extractor reported for the symbol.
    file: String,
    /// Symbol name.
    name: String,
    /// Symbol kind as a string.
    kind: String,
    /// Signature hash copied from the extracted symbol. An empty hash on
    /// either side disables the signature comparison for that symbol.
    sig_hash: String,
}
98
99/// Compute a symbol-level diff between `old_ref` and `new_ref` in `project_root`.
100///
101/// Uses `git archive` to extract each ref into a temp directory so no
102/// working-tree modifications are needed.
103pub fn semantic_diff(
104    project_root: &Path,
105    old_ref: &str,
106    new_ref: &str,
107    registry: &LanguageRegistry,
108) -> Result<SymbolDiff> {
109    let changed = compute_changed_files(project_root, old_ref, new_ref);
110
111    let (old_filter, new_filter) = match &changed {
112        Some(cf) => (Some(&cf.old_ref_files), Some(&cf.new_ref_files)),
113        None => (None, None),
114    };
115
116    let old_symbols = extract_ref_symbols(project_root, old_ref, registry, old_filter)
117        .with_context(|| format!("failed to extract symbols for ref '{}'", old_ref))?;
118    let new_symbols = extract_ref_symbols(project_root, new_ref, registry, new_filter)
119        .with_context(|| format!("failed to extract symbols for ref '{}'", new_ref))?;
120
121    Ok(diff_symbol_maps(old_ref, new_ref, old_symbols, new_symbols))
122}
123
/// Repo-relative paths that differ between two refs, split by the side they
/// must be read from.
struct ChangedFiles {
    /// Paths to index in the old ref (deleted or modified files).
    old_ref_files: HashSet<String>,
    /// Paths to index in the new ref (added or modified files).
    new_ref_files: HashSet<String>,
}

/// Asks git which files differ between `old_ref` and `new_ref`.
///
/// Returns `None` when the list cannot be obtained (git could not be run,
/// exited with a failure status, or a ref looks like a command-line option);
/// callers treat `None` as "index everything".
fn compute_changed_files(
    project_root: &Path,
    old_ref: &str,
    new_ref: &str,
) -> Option<ChangedFiles> {
    // A ref starting with '-' would be parsed by git as an option rather than
    // a revision; never hand such a value to the command line.
    if old_ref.starts_with('-') || new_ref.starts_with('-') {
        return None;
    }

    // `-z` makes git emit paths verbatim and NUL-delimited. Without it, paths
    // containing non-ASCII bytes, tabs, quotes, etc. are C-quoted and would
    // never match the real file names. The trailing `--` marks both
    // positional arguments as revisions even if a same-named file exists.
    let output = std::process::Command::new("git")
        .args([
            "diff",
            "--name-status",
            "--no-renames",
            "-z",
            old_ref,
            new_ref,
            "--",
        ])
        .current_dir(project_root)
        .output()
        .ok()?;

    if !output.status.success() {
        eprintln!(
            "infigraph: git diff --name-status failed for {}..{}, falling back to full extraction",
            old_ref, new_ref
        );
        return None;
    }

    Some(parse_name_status_z(&String::from_utf8_lossy(&output.stdout)))
}

/// Parses `git diff --name-status -z --no-renames` output, which is a flat
/// sequence of `status NUL path NUL` records.
fn parse_name_status_z(text: &str) -> ChangedFiles {
    let mut old_ref_files = HashSet::new();
    let mut new_ref_files = HashSet::new();

    let mut fields = text.split('\0');
    while let Some(status) = fields.next() {
        if status.is_empty() {
            // Empty remainder after the final NUL terminator.
            continue;
        }
        let Some(path) = fields.next() else {
            break;
        };
        if path.is_empty() {
            continue;
        }

        match status {
            // Added files only exist on the new side.
            "A" => {
                new_ref_files.insert(path.to_string());
            }
            // Deleted files only exist on the old side.
            "D" => {
                old_ref_files.insert(path.to_string());
            }
            // Modified / type-changed / anything else: present on both sides.
            _ => {
                old_ref_files.insert(path.to_string());
                new_ref_files.insert(path.to_string());
            }
        }
    }

    ChangedFiles {
        old_ref_files,
        new_ref_files,
    }
}
183
184// ---------------------------------------------------------------------------
185// Extract symbols for a git ref
186// ---------------------------------------------------------------------------
187
188/// Extract all symbols from a git ref by using `git archive | tar -x` into a
189/// temp directory, then walking files through the language registry.
190const MAX_ARCHIVE_ARGS: usize = 500;
191
192fn extract_ref_symbols(
193    project_root: &Path,
194    git_ref: &str,
195    registry: &LanguageRegistry,
196    file_filter: Option<&HashSet<String>>,
197) -> Result<HashMap<String, FlatSym>> {
198    if let Some(filter) = file_filter {
199        if filter.is_empty() {
200            return Ok(HashMap::new());
201        }
202    }
203
204    let is_working_tree = git_ref == "HEAD" || git_ref == "WORKING";
205
206    if is_working_tree {
207        return extract_dir_symbols(project_root, project_root, registry, file_filter);
208    }
209
210    let tmp = tempfile::tempdir().context("failed to create temp dir")?;
211
212    let use_filtered_archive = file_filter
213        .map(|f| f.len() <= MAX_ARCHIVE_ARGS)
214        .unwrap_or(false);
215
216    let archive_output = if use_filtered_archive {
217        let filter = file_filter.unwrap();
218        let mut args: Vec<&str> = vec!["archive", "--format=tar", git_ref, "--"];
219        args.extend(filter.iter().map(|s| s.as_str()));
220        std::process::Command::new("git")
221            .args(&args)
222            .current_dir(project_root)
223            .output()
224            .context("git archive (filtered) failed")?
225    } else {
226        std::process::Command::new("git")
227            .args(["archive", "--format=tar", git_ref])
228            .current_dir(project_root)
229            .output()
230            .context("git archive failed")?
231    };
232
233    if !archive_output.status.success() {
234        let err = String::from_utf8_lossy(&archive_output.stderr);
235        if use_filtered_archive {
236            eprintln!(
237                "infigraph: filtered git archive for {} failed, falling back to full archive: {}",
238                git_ref,
239                err.trim()
240            );
241            let full_output = std::process::Command::new("git")
242                .args(["archive", "--format=tar", git_ref])
243                .current_dir(project_root)
244                .output()
245                .context("git archive (full fallback) failed")?;
246            if !full_output.status.success() {
247                let err2 = String::from_utf8_lossy(&full_output.stderr);
248                anyhow::bail!("git archive {} failed: {}", git_ref, err2.trim());
249            }
250            return untar_and_extract(tmp.path(), &full_output.stdout, registry, file_filter);
251        }
252        anyhow::bail!("git archive {} failed: {}", git_ref, err.trim());
253    }
254
255    untar_and_extract(tmp.path(), &archive_output.stdout, registry, file_filter)
256}
257
258fn untar_and_extract(
259    tmp_dir: &Path,
260    tar_data: &[u8],
261    registry: &LanguageRegistry,
262    file_filter: Option<&HashSet<String>>,
263) -> Result<HashMap<String, FlatSym>> {
264    let mut tar = std::process::Command::new("tar")
265        .args(["-x", "-C", tmp_dir.to_str().unwrap_or(".")])
266        .stdin(std::process::Stdio::piped())
267        .spawn()
268        .context("failed to spawn tar")?;
269
270    if let Some(stdin) = tar.stdin.take() {
271        use std::io::Write;
272        let mut w = stdin;
273        w.write_all(tar_data)?;
274    }
275    tar.wait().context("tar wait failed")?;
276
277    extract_dir_symbols(tmp_dir, tmp_dir, registry, file_filter)
278}
279
280fn extract_dir_symbols(
281    root: &Path,
282    dir: &Path,
283    registry: &LanguageRegistry,
284    file_filter: Option<&HashSet<String>>,
285) -> Result<HashMap<String, FlatSym>> {
286    let mut map = HashMap::new();
287    collect_symbols(root, dir, registry, file_filter, &mut map)?;
288    Ok(map)
289}
290
/// Directory names the symbol walk never descends into (VCS metadata,
/// dependency folders, virtualenvs and build output). Directories whose name
/// starts with `.` are skipped by the walker independently of this list.
static SKIP_DIRS: &[&str] = &[
    ".git",
    "node_modules",
    ".venv",
    "venv",
    "target",
    "build",
    "dist",
    "__pycache__",
    ".tox",
    ".infigraph",
];
303
304fn collect_symbols(
305    root: &Path,
306    dir: &Path,
307    registry: &LanguageRegistry,
308    file_filter: Option<&HashSet<String>>,
309    map: &mut HashMap<String, FlatSym>,
310) -> Result<()> {
311    for entry in std::fs::read_dir(dir)? {
312        let entry = entry?;
313        let path = entry.path();
314        let name = entry.file_name();
315        let name_str = name.to_string_lossy();
316
317        if path.is_dir() {
318            if !SKIP_DIRS.contains(&name_str.as_ref()) && !name_str.starts_with('.') {
319                collect_symbols(root, &path, registry, file_filter, map)?;
320            }
321        } else if path.is_file() {
322            let rel = path
323                .strip_prefix(root)
324                .unwrap_or(&path)
325                .to_string_lossy()
326                .replace('\\', "/");
327            if let Some(filter) = file_filter {
328                if !filter.contains(&rel) {
329                    continue;
330                }
331            }
332            let Ok(source) = std::fs::read(&path) else {
333                continue;
334            };
335            let Some(pack) = registry.for_file_with_content(&rel, &source) else {
336                continue;
337            };
338            let Ok(extraction) = extract::extract_file(&rel, &source, pack) else {
339                continue;
340            };
341            let file = extraction.file.clone();
342            for sym in &extraction.symbols {
343                let kind_str = sym.kind.as_str().to_string();
344                // Key: "file::name::kind" — stable across refs
345                let key = format!("{}::{}::{}", file, sym.name, kind_str);
346                map.insert(
347                    key,
348                    FlatSym {
349                        file: file.clone(),
350                        name: sym.name.clone(),
351                        kind: kind_str,
352                        sig_hash: sym.signature_hash.clone(),
353                    },
354                );
355            }
356        }
357    }
358    Ok(())
359}
360
361// ---------------------------------------------------------------------------
362// Diff two symbol maps
363// ---------------------------------------------------------------------------
364
365fn diff_symbol_maps(
366    old_ref: &str,
367    new_ref: &str,
368    old: HashMap<String, FlatSym>,
369    new: HashMap<String, FlatSym>,
370) -> SymbolDiff {
371    let mut changes = Vec::new();
372
373    // Build name→sym map for old (for move detection)
374    let old_by_name: HashMap<String, &FlatSym> = old
375        .values()
376        .map(|s| (format!("{}::{}", s.name, s.kind), s))
377        .collect();
378
379    // Check new symbols against old
380    for (key, new_sym) in &new {
381        if let Some(old_sym) = old.get(key) {
382            // Same file+name+kind — check signature change
383            if old_sym.sig_hash != new_sym.sig_hash
384                && !old_sym.sig_hash.is_empty()
385                && !new_sym.sig_hash.is_empty()
386            {
387                changes.push(SymbolChange {
388                    name: new_sym.name.clone(),
389                    kind: new_sym.kind.clone(),
390                    file: new_sym.file.clone(),
391                    change: ChangeKind::SignatureChanged,
392                    caller_count: 0,
393                });
394            }
395        } else {
396            // Not in old by key. Check if name+kind existed in a different file (move).
397            let name_key = format!("{}::{}", new_sym.name, new_sym.kind);
398            if let Some(old_sym) = old_by_name.get(&name_key) {
399                if old_sym.file != new_sym.file {
400                    changes.push(SymbolChange {
401                        name: new_sym.name.clone(),
402                        kind: new_sym.kind.clone(),
403                        file: new_sym.file.clone(),
404                        change: ChangeKind::Moved {
405                            from_file: old_sym.file.clone(),
406                        },
407                        caller_count: 0,
408                    });
409                    continue;
410                }
411            }
412            // Truly new
413            changes.push(SymbolChange {
414                name: new_sym.name.clone(),
415                kind: new_sym.kind.clone(),
416                file: new_sym.file.clone(),
417                change: ChangeKind::Added,
418                caller_count: 0,
419            });
420        }
421    }
422
423    // Removed: in old but not in new (excluding moves already captured)
424    let moved_names: std::collections::HashSet<String> = changes
425        .iter()
426        .filter_map(|c| {
427            if matches!(c.change, ChangeKind::Moved { .. }) {
428                Some(format!("{}::{}", c.name, c.kind))
429            } else {
430                None
431            }
432        })
433        .collect();
434
435    for (key, old_sym) in &old {
436        if !new.contains_key(key) {
437            let name_key = format!("{}::{}", old_sym.name, old_sym.kind);
438            if !moved_names.contains(&name_key) {
439                changes.push(SymbolChange {
440                    name: old_sym.name.clone(),
441                    kind: old_sym.kind.clone(),
442                    file: old_sym.file.clone(),
443                    change: ChangeKind::Removed,
444                    caller_count: 0,
445                });
446            }
447        }
448    }
449
450    // Sort: Removed first, then Added, then modified kinds
451    changes.sort_by_key(|c| match &c.change {
452        ChangeKind::Removed => 0,
453        ChangeKind::SignatureChanged => 1,
454        ChangeKind::Modified => 2,
455        ChangeKind::Moved { .. } => 3,
456        ChangeKind::Added => 4,
457    });
458
459    SymbolDiff {
460        old_ref: old_ref.to_string(),
461        new_ref: new_ref.to_string(),
462        changes,
463    }
464}
465
466// ---------------------------------------------------------------------------
467// Formatting
468// ---------------------------------------------------------------------------
469
470pub fn format_diff(diff: &SymbolDiff) -> String {
471    if diff.changes.is_empty() {
472        return format!(
473            "No symbol-level changes between '{}' and '{}'.",
474            diff.old_ref, diff.new_ref
475        );
476    }
477
478    let added = diff.added().count();
479    let removed = diff.removed().count();
480    let modified = diff.modified().count();
481
482    let mut out = format!(
483        "Semantic diff {} → {}  [+{} added  -{} removed  ~{} modified]\n\n",
484        diff.old_ref, diff.new_ref, added, removed, modified
485    );
486
487    let mut cur_file = String::new();
488    for c in &diff.changes {
489        if c.file != cur_file {
490            out.push_str(&format!("  {}\n", c.file));
491            cur_file = c.file.clone();
492        }
493        let callers = if c.caller_count > 0 {
494            format!("  [{} callers]", c.caller_count)
495        } else {
496            String::new()
497        };
498        out.push_str(&format!(
499            "    {:>20}  {:<10} {}{}\n",
500            c.change.to_string(),
501            c.kind,
502            c.name,
503            callers
504        ));
505    }
506
507    out
508}