grit_lib/
diff.rs

1//! Diff machinery — compare trees, index entries, and working tree files.
2//!
3//! # Overview
4//!
5//! This module provides the core diffing infrastructure shared by `diff`,
6//! `diff-index`, `status`, `log`, `show`, `commit`, and `merge`.
7//!
8//! ## Levels of comparison
9//!
10//! 1. **Tree-to-tree** — compare two tree objects (e.g. for `log`/`show`).
11//! 2. **Tree-to-index** — compare a tree (usually HEAD) against the index
12//!    (staged changes, used by `diff --cached` and `status`).
13//! 3. **Index-to-worktree** — compare index against the working directory
14//!    (unstaged changes, used by `diff` and `status`).
15//!
16//! ## Content diff
17//!
18//! Line-level diffing uses the Myers algorithm via the `similar` crate.
19//! Output formats: unified patch, raw (`:old-mode new-mode ...`), stat,
20//! numstat.
21
22use std::fs;
23use std::os::unix::fs::MetadataExt;
24use std::path::Path;
25
26use crate::error::{Error, Result};
27use crate::index::{Index, IndexEntry};
28use crate::objects::{parse_tree, ObjectId, ObjectKind, TreeEntry};
29use crate::odb::Odb;
30
31/// The kind of change between two sides of a diff.
32#[derive(Debug, Clone, Copy, PartialEq, Eq)]
33pub enum DiffStatus {
34    /// File was added.
35    Added,
36    /// File was deleted.
37    Deleted,
38    /// File was modified (content or mode change).
39    Modified,
40    /// File was renamed (with optional content change).
41    Renamed,
42    /// File was copied.
43    Copied,
44    /// File type changed (e.g. regular → symlink).
45    TypeChanged,
46    /// Unmerged (conflict).
47    Unmerged,
48}
49
50impl DiffStatus {
51    /// Single-character status letter used in raw diff output.
52    #[must_use]
53    pub fn letter(&self) -> char {
54        match self {
55            Self::Added => 'A',
56            Self::Deleted => 'D',
57            Self::Modified => 'M',
58            Self::Renamed => 'R',
59            Self::Copied => 'C',
60            Self::TypeChanged => 'T',
61            Self::Unmerged => 'U',
62        }
63    }
64}
65
66/// A single diff entry representing one changed path.
67#[derive(Debug, Clone)]
68pub struct DiffEntry {
69    /// The status of this change.
70    pub status: DiffStatus,
71    /// Path in the "old" side (None for Added).
72    pub old_path: Option<String>,
73    /// Path in the "new" side (None for Deleted).
74    pub new_path: Option<String>,
75    /// Old file mode (as octal string, e.g. "100644").
76    pub old_mode: String,
77    /// New file mode.
78    pub new_mode: String,
79    /// Old object ID (zero OID for Added).
80    pub old_oid: ObjectId,
81    /// New object ID (zero OID for Deleted).
82    pub new_oid: ObjectId,
83    /// Similarity score (0–100) for renames/copies.
84    pub score: Option<u32>,
85}
86
87impl DiffEntry {
88    /// The primary path for display (new_path for adds, old_path for deletes).
89    #[must_use]
90    pub fn path(&self) -> &str {
91        self.new_path
92            .as_deref()
93            .or(self.old_path.as_deref())
94            .unwrap_or("")
95    }
96}
97
98/// The zero (null) object ID used for "no object" in diff output.
99pub const ZERO_OID: &str = "0000000000000000000000000000000000000000";
100
101/// Return the zero ObjectId.
102#[must_use]
103pub fn zero_oid() -> ObjectId {
104    ObjectId::from_bytes(&[0u8; 20]).unwrap_or_else(|_| {
105        // This should never fail since we pass exactly 20 bytes
106        panic!("internal error: failed to create zero OID");
107    })
108}
109
110// ── Tree-to-tree diff ───────────────────────────────────────────────
111
112/// Compare two trees and return the list of changed entries.
113///
114/// # Parameters
115///
116/// - `odb` — object database to read tree objects from.
117/// - `old_tree_oid` — OID of the old tree (or `None` for comparison against empty).
118/// - `new_tree_oid` — OID of the new tree (or `None` for comparison against empty).
119/// - `prefix` — path prefix for nested tree recursion (empty string for root).
120///
121/// # Errors
122///
123/// Returns errors from object database reads.
124pub fn diff_trees(
125    odb: &Odb,
126    old_tree_oid: Option<&ObjectId>,
127    new_tree_oid: Option<&ObjectId>,
128    prefix: &str,
129) -> Result<Vec<DiffEntry>> {
130    let old_entries = match old_tree_oid {
131        Some(oid) => read_tree(odb, oid)?,
132        None => Vec::new(),
133    };
134    let new_entries = match new_tree_oid {
135        Some(oid) => read_tree(odb, oid)?,
136        None => Vec::new(),
137    };
138
139    let mut result = Vec::new();
140    diff_tree_entries(odb, &old_entries, &new_entries, prefix, &mut result)?;
141    Ok(result)
142}
143
144/// Read and parse a tree object from the ODB.
145fn read_tree(odb: &Odb, oid: &ObjectId) -> Result<Vec<TreeEntry>> {
146    let obj = odb.read(oid)?;
147    if obj.kind != ObjectKind::Tree {
148        return Err(Error::CorruptObject(format!(
149            "expected tree, got {}",
150            obj.kind.as_str()
151        )));
152    }
153    parse_tree(&obj.data)
154}
155
156/// Compare two sorted lists of tree entries, recursing into subtrees.
157fn diff_tree_entries(
158    odb: &Odb,
159    old: &[TreeEntry],
160    new: &[TreeEntry],
161    prefix: &str,
162    result: &mut Vec<DiffEntry>,
163) -> Result<()> {
164    let mut oi = 0;
165    let mut ni = 0;
166
167    while oi < old.len() || ni < new.len() {
168        match (old.get(oi), new.get(ni)) {
169            (Some(o), Some(n)) => {
170                let cmp = crate::objects::tree_entry_cmp(
171                    &o.name,
172                    is_tree_mode(o.mode),
173                    &n.name,
174                    is_tree_mode(n.mode),
175                );
176                match cmp {
177                    std::cmp::Ordering::Less => {
178                        // Old entry not in new → deleted
179                        emit_deleted(odb, o, prefix, result)?;
180                        oi += 1;
181                    }
182                    std::cmp::Ordering::Greater => {
183                        // New entry not in old → added
184                        emit_added(odb, n, prefix, result)?;
185                        ni += 1;
186                    }
187                    std::cmp::Ordering::Equal => {
188                        // Both present — check for changes
189                        if o.oid != n.oid || o.mode != n.mode {
190                            let name_str = String::from_utf8_lossy(&o.name);
191                            let path = format_path(prefix, &name_str);
192                            if is_tree_mode(o.mode) && is_tree_mode(n.mode) {
193                                // Both are trees — recurse
194                                let nested = diff_trees(odb, Some(&o.oid), Some(&n.oid), &path)?;
195                                result.extend(nested);
196                            } else if is_tree_mode(o.mode) && !is_tree_mode(n.mode) {
197                                // Tree → blob: delete tree contents, add blob
198                                emit_deleted(odb, o, prefix, result)?;
199                                emit_added(odb, n, prefix, result)?;
200                            } else if !is_tree_mode(o.mode) && is_tree_mode(n.mode) {
201                                // Blob → tree: delete blob, add tree contents
202                                emit_deleted(odb, o, prefix, result)?;
203                                emit_added(odb, n, prefix, result)?;
204                            } else {
205                                // Both blobs — modified
206                                result.push(DiffEntry {
207                                    status: if o.mode != n.mode && o.oid == n.oid {
208                                        DiffStatus::TypeChanged
209                                    } else {
210                                        DiffStatus::Modified
211                                    },
212                                    old_path: Some(path.clone()),
213                                    new_path: Some(path),
214                                    old_mode: format_mode(o.mode),
215                                    new_mode: format_mode(n.mode),
216                                    old_oid: o.oid,
217                                    new_oid: n.oid,
218                    score: None,
219                                });
220                            }
221                        }
222                        oi += 1;
223                        ni += 1;
224                    }
225                }
226            }
227            (Some(o), None) => {
228                emit_deleted(odb, o, prefix, result)?;
229                oi += 1;
230            }
231            (None, Some(n)) => {
232                emit_added(odb, n, prefix, result)?;
233                ni += 1;
234            }
235            (None, None) => break,
236        }
237    }
238
239    Ok(())
240}
241
242fn emit_deleted(
243    odb: &Odb,
244    entry: &TreeEntry,
245    prefix: &str,
246    result: &mut Vec<DiffEntry>,
247) -> Result<()> {
248    let name_str = String::from_utf8_lossy(&entry.name);
249    let path = format_path(prefix, &name_str);
250    if is_tree_mode(entry.mode) {
251        // Recurse into deleted tree
252        let nested = diff_trees(odb, Some(&entry.oid), None, &path)?;
253        result.extend(nested);
254    } else {
255        result.push(DiffEntry {
256            status: DiffStatus::Deleted,
257            old_path: Some(path.clone()),
258            new_path: None,
259            old_mode: format_mode(entry.mode),
260            new_mode: "000000".to_owned(),
261            old_oid: entry.oid,
262            new_oid: zero_oid(),
263                    score: None,
264        });
265    }
266    Ok(())
267}
268
269fn emit_added(
270    odb: &Odb,
271    entry: &TreeEntry,
272    prefix: &str,
273    result: &mut Vec<DiffEntry>,
274) -> Result<()> {
275    let name_str = String::from_utf8_lossy(&entry.name);
276    let path = format_path(prefix, &name_str);
277    if is_tree_mode(entry.mode) {
278        // Recurse into added tree
279        let nested = diff_trees(odb, None, Some(&entry.oid), &path)?;
280        result.extend(nested);
281    } else {
282        result.push(DiffEntry {
283            status: DiffStatus::Added,
284            old_path: None,
285            new_path: Some(path),
286            old_mode: "000000".to_owned(),
287            new_mode: format_mode(entry.mode),
288            old_oid: zero_oid(),
289            new_oid: entry.oid,
290                    score: None,
291        });
292    }
293    Ok(())
294}
295
296// ── Index-to-tree diff (staged changes) ─────────────────────────────
297
298/// Compare the index against a tree (usually HEAD's tree).
299///
300/// This shows "staged" changes — what would be committed.
301///
302/// # Parameters
303///
304/// - `odb` — object database.
305/// - `index` — the current index.
306/// - `tree_oid` — the tree to compare against (e.g. HEAD's tree), or `None`
307///   for comparison against an empty tree (initial commit).
308///
309/// # Errors
310///
311/// Returns errors from ODB reads.
312pub fn diff_index_to_tree(
313    odb: &Odb,
314    index: &Index,
315    tree_oid: Option<&ObjectId>,
316) -> Result<Vec<DiffEntry>> {
317    // Flatten the tree into a sorted list of (path, mode, oid)
318    let tree_entries = match tree_oid {
319        Some(oid) => flatten_tree(odb, oid, "")?,
320        None => Vec::new(),
321    };
322
323    // Build maps keyed by path
324    let mut tree_map: std::collections::BTreeMap<&str, &FlatEntry> =
325        std::collections::BTreeMap::new();
326    for entry in &tree_entries {
327        tree_map.insert(&entry.path, entry);
328    }
329
330    let mut result = Vec::new();
331
332    // Check index entries against tree
333    for ie in &index.entries {
334        // Only look at stage 0 (merged) entries
335        if ie.stage() != 0 {
336            continue;
337        }
338        let path = String::from_utf8_lossy(&ie.path).to_string();
339        match tree_map.remove(path.as_str()) {
340            Some(te) => {
341                // Present in both — check for differences
342                if te.oid != ie.oid || te.mode != ie.mode {
343                    result.push(DiffEntry {
344                        status: DiffStatus::Modified,
345                        old_path: Some(path.clone()),
346                        new_path: Some(path),
347                        old_mode: format_mode(te.mode),
348                        new_mode: format_mode(ie.mode),
349                        old_oid: te.oid,
350                        new_oid: ie.oid,
351                    score: None,
352                    });
353                }
354            }
355            None => {
356                // In index but not tree → added
357                result.push(DiffEntry {
358                    status: DiffStatus::Added,
359                    old_path: None,
360                    new_path: Some(path),
361                    old_mode: "000000".to_owned(),
362                    new_mode: format_mode(ie.mode),
363                    old_oid: zero_oid(),
364                    new_oid: ie.oid,
365                    score: None,
366                });
367            }
368        }
369    }
370
371    // Remaining tree entries not in index → deleted
372    for (path, te) in tree_map {
373        result.push(DiffEntry {
374            status: DiffStatus::Deleted,
375            old_path: Some(path.to_owned()),
376            new_path: None,
377            old_mode: format_mode(te.mode),
378            new_mode: "000000".to_owned(),
379            old_oid: te.oid,
380            new_oid: zero_oid(),
381                    score: None,
382        });
383    }
384
385    result.sort_by(|a, b| a.path().cmp(b.path()));
386    Ok(result)
387}
388
389// ── Index-to-worktree diff (unstaged changes) ───────────────────────
390
391/// Compare the index against the working tree.
392///
393/// This shows "unstaged" changes — modifications not yet staged.
394///
395/// # Parameters
396///
397/// - `odb` — object database (for hashing worktree files).
398/// - `index` — the current index.
399/// - `work_tree` — path to the working tree root.
400///
401/// # Errors
402///
403/// Returns errors from I/O or hashing.
404pub fn diff_index_to_worktree(
405    odb: &Odb,
406    index: &Index,
407    work_tree: &Path,
408) -> Result<Vec<DiffEntry>> {
409    let mut result = Vec::new();
410
411    for ie in &index.entries {
412        if ie.stage() != 0 {
413            continue;
414        }
415        // Use str slice directly to avoid allocation for path joining;
416        // only allocate String if we need it for DiffEntry output.
417        let path_str_ref = std::str::from_utf8(&ie.path).unwrap_or("");
418        let file_path = work_tree.join(path_str_ref);
419        match fs::symlink_metadata(&file_path) {
420            Ok(meta) => {
421                // Check if the file has changed using stat data first
422                if stat_matches(ie, &meta) {
423                    continue; // Fast path: stat data matches, assume unchanged
424                }
425
426                // Stat differs — hash the file to check actual content
427                let worktree_oid = hash_worktree_file(odb, &file_path, &meta)?;
428                let worktree_mode = mode_from_metadata(&meta);
429
430                if worktree_oid != ie.oid || worktree_mode != ie.mode {
431                    let path_owned = path_str_ref.to_owned();
432                    result.push(DiffEntry {
433                        status: DiffStatus::Modified,
434                        old_path: Some(path_owned.clone()),
435                        new_path: Some(path_owned),
436                        old_mode: format_mode(ie.mode),
437                        new_mode: format_mode(worktree_mode),
438                        old_oid: ie.oid,
439                        new_oid: worktree_oid,
440                    score: None,
441                    });
442                }
443            }
444            Err(e) if e.kind() == std::io::ErrorKind::NotFound
445                || e.raw_os_error() == Some(20) /* ENOTDIR */ => {
446                // File deleted from working tree (or parent replaced by a file)
447                result.push(DiffEntry {
448                    status: DiffStatus::Deleted,
449                    old_path: Some(path_str_ref.to_owned()),
450                    new_path: None,
451                    old_mode: format_mode(ie.mode),
452                    new_mode: "000000".to_owned(),
453                    old_oid: ie.oid,
454                    new_oid: zero_oid(),
455                    score: None,
456                });
457            }
458            Err(e) => return Err(Error::Io(e)),
459        }
460    }
461
462    Ok(result)
463}
464
465/// Quick stat check: does the index entry's cached stat data match the file?
466pub fn stat_matches(ie: &IndexEntry, meta: &fs::Metadata) -> bool {
467    // Compare size
468    if meta.len() as u32 != ie.size {
469        return false;
470    }
471    // Compare mtime (seconds + nanoseconds)
472    if meta.mtime() as u32 != ie.mtime_sec {
473        return false;
474    }
475    if meta.mtime_nsec() as u32 != ie.mtime_nsec {
476        return false;
477    }
478    // Compare ctime (seconds + nanoseconds)
479    if meta.ctime() as u32 != ie.ctime_sec {
480        return false;
481    }
482    if meta.ctime_nsec() as u32 != ie.ctime_nsec {
483        return false;
484    }
485    // Compare inode and device
486    if meta.ino() as u32 != ie.ino {
487        return false;
488    }
489    if meta.dev() as u32 != ie.dev {
490        return false;
491    }
492    true
493}
494
495/// Hash a working tree file as a blob to get its OID.
496fn hash_worktree_file(_odb: &Odb, path: &Path, meta: &fs::Metadata) -> Result<ObjectId> {
497    let data = if meta.file_type().is_symlink() {
498        // For symlinks, hash the target path
499        let target = fs::read_link(path)?;
500        target.to_string_lossy().into_owned().into_bytes()
501    } else {
502        fs::read(path)?
503    };
504
505    Ok(Odb::hash_object_data(ObjectKind::Blob, &data))
506}
507
508/// Derive a Git file mode from filesystem metadata.
509fn mode_from_metadata(meta: &fs::Metadata) -> u32 {
510    if meta.file_type().is_symlink() {
511        0o120000
512    } else if meta.mode() & 0o111 != 0 {
513        0o100755
514    } else {
515        0o100644
516    }
517}
518
519/// Compare a tree against the working tree.
520///
521/// Shows changes from `tree_oid` to the current working directory state.
522/// Files tracked in the index but not in the tree are shown as Added.
523/// Files in the tree but missing from the working tree are shown as Deleted.
524///
525/// # Parameters
526///
527/// - `odb` — object database.
528/// - `tree_oid` — the tree to compare against (`None` for empty tree).
529/// - `work_tree` — path to the working tree root.
530/// - `index` — current index (used to discover new tracked files not in tree).
531///
532/// # Errors
533///
534/// Returns errors from ODB reads or I/O.
535pub fn diff_tree_to_worktree(
536    odb: &Odb,
537    tree_oid: Option<&ObjectId>,
538    work_tree: &Path,
539    index: &Index,
540) -> Result<Vec<DiffEntry>> {
541    // Flatten the tree into a BTreeMap keyed by path
542    let tree_flat = match tree_oid {
543        Some(oid) => flatten_tree(odb, oid, "")?,
544        None => Vec::new(),
545    };
546    let tree_map: std::collections::BTreeMap<String, &FlatEntry> =
547        tree_flat.iter().map(|e| (e.path.clone(), e)).collect();
548
549    // Build index lookup: path → &IndexEntry (stage 0 only)
550    let mut index_entries: std::collections::BTreeMap<&[u8], &IndexEntry> =
551        std::collections::BTreeMap::new();
552    let mut index_paths: std::collections::BTreeSet<String> = std::collections::BTreeSet::new();
553    for ie in &index.entries {
554        if ie.stage() != 0 {
555            continue;
556        }
557        let path = String::from_utf8_lossy(&ie.path).to_string();
558        index_entries.insert(&ie.path, ie);
559        index_paths.insert(path);
560    }
561
562    // Union of tree paths + index paths
563    let mut all_paths: std::collections::BTreeSet<String> = std::collections::BTreeSet::new();
564    all_paths.extend(tree_map.keys().cloned());
565    all_paths.extend(index_paths.iter().cloned());
566
567    let mut result = Vec::new();
568
569    for path in &all_paths {
570        let tree_entry = tree_map.get(path.as_str());
571        let file_path = work_tree.join(path);
572
573        let wt_meta = match fs::symlink_metadata(&file_path) {
574            Ok(m) => Some(m),
575            Err(e) if e.kind() == std::io::ErrorKind::NotFound => None,
576            Err(e) => return Err(Error::Io(e)),
577        };
578
579        match (tree_entry, wt_meta) {
580            (Some(te), Some(ref meta)) => {
581                // Fast path: if the index entry matches the tree entry AND
582                // stat cache matches, the file is unchanged — skip hashing.
583                if let Some(ie) = index_entries.get(path.as_bytes()) {
584                    if ie.oid == te.oid && ie.mode == te.mode && stat_matches(ie, meta) {
585                        continue;
586                    }
587                }
588
589                // Stat or content differs — hash the file
590                let wt_oid = hash_worktree_file(odb, &file_path, meta)?;
591                let wt_mode = mode_from_metadata(meta);
592                if wt_oid != te.oid || wt_mode != te.mode {
593                    result.push(DiffEntry {
594                        status: DiffStatus::Modified,
595                        old_path: Some(path.clone()),
596                        new_path: Some(path.clone()),
597                        old_mode: format_mode(te.mode),
598                        new_mode: format_mode(wt_mode),
599                        old_oid: te.oid,
600                        new_oid: wt_oid,
601                    score: None,
602                    });
603                }
604            }
605            (Some(te), None) => {
606                // In tree but missing from worktree
607                result.push(DiffEntry {
608                    status: DiffStatus::Deleted,
609                    old_path: Some(path.clone()),
610                    new_path: None,
611                    old_mode: format_mode(te.mode),
612                    new_mode: "000000".to_owned(),
613                    old_oid: te.oid,
614                    new_oid: zero_oid(),
615                    score: None,
616                });
617            }
618            (None, Some(ref meta)) => {
619                // In index but not in tree, and exists in worktree
620                let wt_oid = hash_worktree_file(odb, &file_path, meta)?;
621                let wt_mode = mode_from_metadata(meta);
622                result.push(DiffEntry {
623                    status: DiffStatus::Added,
624                    old_path: None,
625                    new_path: Some(path.clone()),
626                    old_mode: "000000".to_owned(),
627                    new_mode: format_mode(wt_mode),
628                    old_oid: zero_oid(),
629                    new_oid: wt_oid,
630                    score: None,
631                });
632            }
633            (None, None) => {
634                // Tracked in index but neither in tree nor worktree — skip
635            }
636        }
637    }
638
639    result.sort_by(|a, b| a.path().cmp(b.path()));
640    Ok(result)
641}
642
643// ── Rename detection ────────────────────────────────────────────────
644
645/// Detect renames by pairing Deleted and Added entries with similar content.
646///
647/// `threshold` is the minimum similarity percentage (0–100) for a pair to
648/// be considered a rename (Git's default is 50%).  The function reads blob
649/// content from the ODB to compute a line-level similarity score.
650///
651/// Exact-OID matches are always 100% similar regardless of content.
652pub fn detect_renames(odb: &Odb, entries: Vec<DiffEntry>, threshold: u32) -> Vec<DiffEntry> {
653    // Split entries into deleted, added, and others.
654    let mut deleted: Vec<DiffEntry> = Vec::new();
655    let mut added: Vec<DiffEntry> = Vec::new();
656    let mut others: Vec<DiffEntry> = Vec::new();
657
658    for entry in entries {
659        match entry.status {
660            DiffStatus::Deleted => deleted.push(entry),
661            DiffStatus::Added => added.push(entry),
662            _ => others.push(entry),
663        }
664    }
665
666    if deleted.is_empty() || added.is_empty() {
667        // Nothing to pair — return original order.
668        let mut result = others;
669        result.extend(deleted);
670        result.extend(added);
671        result.sort_by(|a, b| a.path().cmp(b.path()));
672        return result;
673    }
674
675    // Read content for all deleted blobs.
676    let deleted_contents: Vec<Option<Vec<u8>>> = deleted
677        .iter()
678        .map(|d| odb.read(&d.old_oid).ok().map(|obj| obj.data))
679        .collect();
680
681    // Read content for all added blobs.
682    let added_contents: Vec<Option<Vec<u8>>> = added
683        .iter()
684        .map(|a| odb.read(&a.new_oid).ok().map(|obj| obj.data))
685        .collect();
686
687    // Build a matrix of similarity scores and find the best pairings.
688    // We use a greedy approach: pick the highest-scoring pair first.
689    let mut scores: Vec<(u32, usize, usize)> = Vec::new();
690
691    for (di, del) in deleted.iter().enumerate() {
692        for (ai, add) in added.iter().enumerate() {
693            // Exact OID match → 100%
694            if del.old_oid == add.new_oid {
695                scores.push((100, di, ai));
696                continue;
697            }
698
699            let score = match (&deleted_contents[di], &added_contents[ai]) {
700                (Some(old_data), Some(new_data)) => {
701                    compute_similarity(old_data, new_data)
702                }
703                _ => 0,
704            };
705
706            if score >= threshold {
707                scores.push((score, di, ai));
708            }
709        }
710    }
711
712    // Sort: prefer same-basename pairs first, then by score descending.
713    // This matches Git's behavior where basename matches are checked first.
714    scores.sort_by(|a, b| {
715        let a_same = same_basename(&deleted[a.1], &added[a.2]);
716        let b_same = same_basename(&deleted[b.1], &added[b.2]);
717        b_same.cmp(&a_same).then_with(|| b.0.cmp(&a.0))
718    });
719
720    let mut used_deleted = vec![false; deleted.len()];
721    let mut used_added = vec![false; added.len()];
722    let mut renames: Vec<DiffEntry> = Vec::new();
723
724    for (score, di, ai) in &scores {
725        if used_deleted[*di] || used_added[*ai] {
726            continue;
727        }
728        used_deleted[*di] = true;
729        used_added[*ai] = true;
730
731        let del = &deleted[*di];
732        let add = &added[*ai];
733
734        renames.push(DiffEntry {
735            status: DiffStatus::Renamed,
736            old_path: del.old_path.clone(),
737            new_path: add.new_path.clone(),
738            old_mode: del.old_mode.clone(),
739            new_mode: add.new_mode.clone(),
740            old_oid: del.old_oid,
741            new_oid: add.new_oid,
742            score: Some(*score),
743        });
744    }
745
746    // Collect unmatched entries.
747    let mut result = others;
748    result.extend(renames);
749    for (i, entry) in deleted.into_iter().enumerate() {
750        if !used_deleted[i] {
751            result.push(entry);
752        }
753    }
754    for (i, entry) in added.into_iter().enumerate() {
755        if !used_added[i] {
756            result.push(entry);
757        }
758    }
759
760    result.sort_by(|a, b| a.path().cmp(b.path()));
761    result
762}
763
764
765/// Detect copies among diff entries.
766///
767/// This first runs rename detection (pairing Deleted+Added), then for any
768/// remaining Added entries, looks for copy sources.
769///
770/// - `find_copies_harder` = false: only Modified entries are copy source candidates.
771/// - `find_copies_harder` = true: also examine unmodified files from `source_tree_entries`.
772///
773/// `source_tree_entries` should be a list of (path, mode, oid) from the source tree;
774/// used when `find_copies_harder` is true to consider unmodified files as copy sources.
775pub fn detect_copies(
776    odb: &Odb,
777    entries: Vec<DiffEntry>,
778    threshold: u32,
779    find_copies_harder: bool,
780    source_tree_entries: &[(String, String, ObjectId)],
781) -> Vec<DiffEntry> {
782    // First, run rename detection to pair Delete+Add.
783    let entries = detect_renames(odb, entries, threshold);
784
785    // Split into added (remaining) and others.
786    let mut added: Vec<DiffEntry> = Vec::new();
787    let mut others: Vec<DiffEntry> = Vec::new();
788
789    for entry in entries {
790        match entry.status {
791            DiffStatus::Added => added.push(entry),
792            _ => others.push(entry),
793        }
794    }
795
796    if added.is_empty() {
797        return others;
798    }
799
800    // Build copy source candidates.
801    let mut sources: Vec<(String, ObjectId)> = Vec::new();
802
803    // Modified files are always candidates for -C.
804    for entry in &others {
805        if entry.status == DiffStatus::Modified {
806            if let Some(ref old_path) = entry.old_path {
807                sources.push((old_path.clone(), entry.old_oid));
808            }
809        }
810    }
811
812    // With find_copies_harder, also add all source tree entries.
813    if find_copies_harder {
814        for (path, _mode, oid) in source_tree_entries {
815            if !sources.iter().any(|(p, _)| p == path) {
816                sources.push((path.clone(), *oid));
817            }
818        }
819    }
820
821    if sources.is_empty() {
822        let mut result = others;
823        result.extend(added);
824        result.sort_by(|a, b| a.path().cmp(b.path()));
825        return result;
826    }
827
828    // Read content for sources.
829    let source_contents: Vec<Option<Vec<u8>>> = sources
830        .iter()
831        .map(|(_, oid)| odb.read(oid).ok().map(|obj| obj.data))
832        .collect();
833
834    // Read content for added blobs.
835    let added_contents: Vec<Option<Vec<u8>>> = added
836        .iter()
837        .map(|a| odb.read(&a.new_oid).ok().map(|obj| obj.data))
838        .collect();
839
840    // Build score matrix.
841    let mut scores: Vec<(u32, usize, usize)> = Vec::new();
842    for (si, (_, src_oid)) in sources.iter().enumerate() {
843        for (ai, add) in added.iter().enumerate() {
844            if *src_oid == add.new_oid {
845                scores.push((100, si, ai));
846                continue;
847            }
848            let score = match (&source_contents[si], &added_contents[ai]) {
849                (Some(old_data), Some(new_data)) => compute_similarity(old_data, new_data),
850                _ => 0,
851            };
852            if score >= threshold {
853                scores.push((score, si, ai));
854            }
855        }
856    }
857
858    // Sort by score descending.
859    scores.sort_by(|a, b| b.0.cmp(&a.0));
860
861    let mut used_added = vec![false; added.len()];
862    let mut copies: Vec<DiffEntry> = Vec::new();
863
864    for (score, si, ai) in &scores {
865        if used_added[*ai] {
866            continue;
867        }
868        used_added[*ai] = true;
869
870        let (ref src_path, _) = sources[*si];
871        let add = &added[*ai];
872
873        let src_mode = source_tree_entries
874            .iter()
875            .find(|(p, _, _)| p == src_path)
876            .map(|(_, m, _)| m.clone())
877            .unwrap_or_else(|| add.old_mode.clone());
878
879        copies.push(DiffEntry {
880            status: DiffStatus::Copied,
881            old_path: Some(src_path.clone()),
882            new_path: add.new_path.clone(),
883            old_mode: src_mode,
884            new_mode: add.new_mode.clone(),
885            old_oid: sources[*si].1,
886            new_oid: add.new_oid,
887            score: Some(*score),
888        });
889    }
890
891    let mut result = others;
892    result.extend(copies);
893    for (i, entry) in added.into_iter().enumerate() {
894        if !used_added[i] {
895            result.push(entry);
896        }
897    }
898
899    result.sort_by(|a, b| a.path().cmp(b.path()));
900    result
901}
902
903/// Format a rename pair using Git's compact path format.
904///
905/// Examples:
906/// - `a/b/c` → `c/b/a` → `a/b/c => c/b/a`
907/// - `c/b/a` → `c/d/e` → `c/{b/a => d/e}`
908/// - `c/d/e` → `d/e` → `{c/d => d}/e`
909/// - `d/e` → `d/f/e` → `d/{ => f}/e`
910pub fn format_rename_path(old: &str, new: &str) -> String {
911    let ob = old.as_bytes();
912    let nb = new.as_bytes();
913
914    // Find common prefix length, snapped to '/' boundary.
915    let pfx = {
916        let mut last_sep = 0usize;
917        let min_len = ob.len().min(nb.len());
918        for i in 0..min_len {
919            if ob[i] != nb[i] {
920                break;
921            }
922            if ob[i] == b'/' {
923                last_sep = i + 1;
924            }
925        }
926        last_sep
927    };
928
929    // Find common suffix length, snapped to '/' boundary.
930    let mut sfx = {
931        let mut last_sep = 0usize;
932        let min_len = ob.len().min(nb.len());
933        for i in 0..min_len {
934            let oi = ob.len() - 1 - i;
935            let ni = nb.len() - 1 - i;
936            if ob[oi] != nb[ni] {
937                break;
938            }
939            if ob[oi] == b'/' {
940                last_sep = i + 1;
941            }
942        }
943        last_sep
944    };
945
946    // Suffix starts at this position in each string.
947    let mut sfx_at_old = ob.len() - sfx;
948    let mut sfx_at_new = nb.len() - sfx;
949
950    // If prefix and suffix overlap in both strings (both middles empty),
951    // reduce the suffix so that at least the longer string has a non-empty middle.
952    while pfx > sfx_at_old && pfx > sfx_at_new && sfx > 0 {
953        // Reduce suffix by snapping to the next smaller '/' boundary.
954        let suffix_bytes = &ob[sfx_at_old..];
955        let mut new_sfx = 0;
956        // Find the next '/' after sfx_at_old (i.e., reduce suffix).
957        for i in 1..suffix_bytes.len() {
958            if suffix_bytes[i] == b'/' {
959                new_sfx = sfx - i;
960                break;
961            }
962        }
963        if new_sfx == 0 || new_sfx >= sfx {
964            sfx = 0;
965            sfx_at_old = ob.len();
966            sfx_at_new = nb.len();
967            break;
968        }
969        sfx = new_sfx;
970        sfx_at_old = ob.len() - sfx;
971        sfx_at_new = nb.len() - sfx;
972    }
973
974    // When prefix and suffix overlap in the shorter string, they share
975    // the '/' boundary character. In the output format, the shared '/'
976    // appears in both positions (e.g. "d/{ => f}/e" for d/e → d/f/e).
977    // Compute the middle parts. When prefix and suffix overlap in a
978    // string, the middle for that string is empty. The shared '/' shows
979    // in both prefix (trailing) and suffix (leading) positions.
980    let prefix = &old[..pfx];
981    let suffix = &old[sfx_at_old..];
982    let old_mid = if pfx <= sfx_at_old {
983        &old[pfx..sfx_at_old]
984    } else {
985        ""
986    };
987    let new_mid = if pfx <= sfx_at_new {
988        &new[pfx..sfx_at_new]
989    } else {
990        ""
991    };
992
993    if prefix.is_empty() && suffix.is_empty() {
994        return format!("{old} => {new}");
995    }
996
997    format!("{prefix}{{{old_mid} => {new_mid}}}{suffix}")
998}
999
1000/// Check if two entries share the same filename (basename).
1001fn same_basename(del: &DiffEntry, add: &DiffEntry) -> bool {
1002    let old = del.old_path.as_deref().unwrap_or("");
1003    let new = add.new_path.as_deref().unwrap_or("");
1004    let old_base = old.rsplit('/').next().unwrap_or(old);
1005    let new_base = new.rsplit('/').next().unwrap_or(new);
1006    old_base == new_base && !old_base.is_empty()
1007}
1008
1009/// Compute a similarity percentage (0–100) between two byte slices.
1010///
1011/// Uses Git's approach: count the bytes that are "shared" (appear in
1012/// equal lines), then compute `score = shared_bytes * 2 * 100 / (src_size + dst_size)`.
1013fn compute_similarity(old: &[u8], new: &[u8]) -> u32 {
1014    let src_size = old.len();
1015    let dst_size = new.len();
1016
1017    if src_size == 0 && dst_size == 0 {
1018        return 100;
1019    }
1020    let total = src_size + dst_size;
1021    if total == 0 {
1022        return 100;
1023    }
1024
1025    // Use line-level diff to find shared content, then count bytes.
1026    use similar::{ChangeTag, TextDiff};
1027    let old_str = String::from_utf8_lossy(old);
1028    let new_str = String::from_utf8_lossy(new);
1029    let diff = TextDiff::from_lines(&old_str as &str, &new_str as &str);
1030
1031    let mut shared_bytes = 0usize;
1032    for change in diff.iter_all_changes() {
1033        if change.tag() == ChangeTag::Equal {
1034            // Count bytes in the matching line (including newline).
1035            shared_bytes += change.value().len();
1036        }
1037    }
1038
1039    // Git: score = copied * MAX_SCORE / max(src_size, dst_size)
1040    // We normalize to 0-100.
1041    let max_size = src_size.max(dst_size);
1042    let score = ((shared_bytes * 100) / max_size).min(100) as u32;
1043    score
1044}
1045
1046// ── Output formatting ───────────────────────────────────────────────
1047
1048/// Format a diff entry in Git's raw diff format.
1049///
1050/// Example: `:100644 100644 abc1234... def5678... M\tfile.txt`
1051pub fn format_raw(entry: &DiffEntry) -> String {
1052    let path = match entry.status {
1053        DiffStatus::Renamed | DiffStatus::Copied => {
1054            format!(
1055                "{}\t{}",
1056                entry.old_path.as_deref().unwrap_or(""),
1057                entry.new_path.as_deref().unwrap_or("")
1058            )
1059        }
1060        _ => entry.path().to_owned(),
1061    };
1062
1063    let status_str = match (entry.status, entry.score) {
1064        (DiffStatus::Renamed, Some(s)) => format!("R{:03}", s),
1065        (DiffStatus::Copied, Some(s)) => format!("C{:03}", s),
1066        _ => entry.status.letter().to_string(),
1067    };
1068
1069    format!(
1070        ":{} {} {} {} {}\t{}",
1071        entry.old_mode,
1072        entry.new_mode,
1073        entry.old_oid,
1074        entry.new_oid,
1075        status_str,
1076        path
1077    )
1078}
1079
1080/// Format a diff entry with abbreviated OIDs.
1081pub fn format_raw_abbrev(entry: &DiffEntry, abbrev_len: usize) -> String {
1082    let old_hex = format!("{}", entry.old_oid);
1083    let new_hex = format!("{}", entry.new_oid);
1084    let old_abbrev = &old_hex[..abbrev_len.min(old_hex.len())];
1085    let new_abbrev = &new_hex[..abbrev_len.min(new_hex.len())];
1086
1087    let path = entry.path();
1088
1089    format!(
1090        ":{} {} {}... {}... {}\t{}",
1091        entry.old_mode,
1092        entry.new_mode,
1093        old_abbrev,
1094        new_abbrev,
1095        entry.status.letter(),
1096        path
1097    )
1098}
1099
1100/// Generate a unified diff patch for two blobs.
1101///
1102/// # Parameters
1103///
1104/// - `old_content` — the old file content (empty for added files).
1105/// - `new_content` — the new file content (empty for deleted files).
1106/// - `old_path` — display path for the old side.
1107/// - `new_path` — display path for the new side.
1108/// - `context_lines` — number of context lines around changes (default: 3).
1109///
1110/// # Returns
1111///
1112/// The unified diff as a string.
1113pub fn unified_diff(
1114    old_content: &str,
1115    new_content: &str,
1116    old_path: &str,
1117    new_path: &str,
1118    context_lines: usize,
1119) -> String {
1120    use similar::TextDiff;
1121
1122    let diff = TextDiff::from_lines(old_content, new_content);
1123
1124    let mut output = String::new();
1125    if old_path == "/dev/null" {
1126        output.push_str("--- /dev/null\n");
1127    } else {
1128        output.push_str(&format!("--- a/{old_path}\n"));
1129    }
1130    if new_path == "/dev/null" {
1131        output.push_str("+++ /dev/null\n");
1132    } else {
1133        output.push_str(&format!("+++ b/{new_path}\n"));
1134    }
1135
1136    let old_lines: Vec<&str> = old_content.lines().collect();
1137
1138    for hunk in diff
1139        .unified_diff()
1140        .context_radius(context_lines)
1141        .iter_hunks()
1142    {
1143        let hunk_str = format!("{hunk}");
1144        // The similar crate outputs @@ -a,b +c,d @@\n but Git adds
1145        // function context after the closing @@. Extract the hunk header
1146        // and add function context.
1147        if let Some(first_newline) = hunk_str.find('\n') {
1148            let header_line = &hunk_str[..first_newline];
1149            let rest = &hunk_str[first_newline..];
1150
1151            // Parse the old start line from the @@ header
1152            if let Some(func_ctx) = extract_function_context(header_line, &old_lines) {
1153                output.push_str(header_line);
1154                output.push(' ');
1155                output.push_str(&func_ctx);
1156                output.push_str(rest);
1157            } else {
1158                output.push_str(&hunk_str);
1159            }
1160        } else {
1161            output.push_str(&hunk_str);
1162        }
1163    }
1164
1165    output
1166}
1167
1168/// Extract function context for a hunk header.
1169///
1170/// Given a hunk header like `@@ -8,7 +8,7 @@`, find the last line
1171/// before line 8 in the old content that looks like a function header
1172/// (starts with a non-whitespace character, like Git's default).
1173fn extract_function_context(header: &str, old_lines: &[&str]) -> Option<String> {
1174    // Parse the old start line number from "@@ -<start>,<count> ..."
1175    let at_pos = header.find("-")?;
1176    let rest = &header[at_pos + 1..];
1177    let comma_or_space = rest.find(|c: char| c == ',' || c == ' ')?;
1178    let start_str = &rest[..comma_or_space];
1179    let start_line: usize = start_str.parse().ok()?;
1180
1181    if start_line <= 1 {
1182        return None;
1183    }
1184
1185    // Look backwards from the line before the hunk start for a line that
1186    // starts with a non-whitespace character (Git's default funcname pattern).
1187    // start_line is 1-indexed, so the hunk starts at old_lines[start_line-1].
1188    // We want to look at lines before that: old_lines[0..start_line-1].
1189    let search_end = (start_line - 1).min(old_lines.len());
1190    for i in (0..search_end).rev() {
1191        let line = old_lines[i];
1192        if !line.is_empty() {
1193            let first = line.as_bytes()[0];
1194            // Git's default: line must start with a letter, digit, '_', '$',
1195            // or certain other non-whitespace chars. We use a simpler heuristic:
1196            // any line that doesn't start with whitespace.
1197            if first != b' ' && first != b'\t' {
1198                // Truncate to 40 chars like Git does.
1199                let truncated = if line.len() > 40 {
1200                    &line[..40]
1201                } else {
1202                    line
1203                };
1204                return Some(truncated.to_owned());
1205            }
1206        }
1207    }
1208    None
1209}
1210
1211/// Generate diff stat output (file name + insertions/deletions).
1212///
1213/// Returns a single line like: ` file.txt | 5 ++---`
1214pub fn format_stat_line(
1215    path: &str,
1216    insertions: usize,
1217    deletions: usize,
1218    max_path_len: usize,
1219) -> String {
1220    let total = insertions + deletions;
1221    let plus = "+".repeat(insertions.min(50));
1222    let minus = "-".repeat(deletions.min(50));
1223    format!(
1224        " {:<width$} | {:>4} {}{}",
1225        path,
1226        total,
1227        plus,
1228        minus,
1229        width = max_path_len
1230    )
1231}
1232
1233/// Count insertions and deletions between two strings.
1234///
1235/// Returns `(insertions, deletions)`.
1236pub fn count_changes(old_content: &str, new_content: &str) -> (usize, usize) {
1237    use similar::{ChangeTag, TextDiff};
1238
1239    let diff = TextDiff::from_lines(old_content, new_content);
1240    let mut ins = 0;
1241    let mut del = 0;
1242
1243    for change in diff.iter_all_changes() {
1244        match change.tag() {
1245            ChangeTag::Insert => ins += 1,
1246            ChangeTag::Delete => del += 1,
1247            ChangeTag::Equal => {}
1248        }
1249    }
1250
1251    (ins, del)
1252}
1253
1254// ── Helpers ─────────────────────────────────────────────────────────
1255
1256/// Flatten a tree object recursively into a sorted list of (path, mode, oid).
1257struct FlatEntry {
1258    path: String,
1259    mode: u32,
1260    oid: ObjectId,
1261}
1262
1263fn flatten_tree(odb: &Odb, tree_oid: &ObjectId, prefix: &str) -> Result<Vec<FlatEntry>> {
1264    let entries = read_tree(odb, tree_oid)?;
1265    let mut result = Vec::new();
1266
1267    for entry in entries {
1268        let name_str = String::from_utf8_lossy(&entry.name);
1269        let path = format_path(prefix, &name_str);
1270        if is_tree_mode(entry.mode) {
1271            let nested = flatten_tree(odb, &entry.oid, &path)?;
1272            result.extend(nested);
1273        } else {
1274            result.push(FlatEntry {
1275                path,
1276                mode: entry.mode,
1277                oid: entry.oid,
1278            });
1279        }
1280    }
1281
1282    Ok(result)
1283}
1284
1285/// Whether a mode represents a tree (directory).
1286fn is_tree_mode(mode: u32) -> bool {
1287    mode == 0o040000
1288}
1289
1290/// Build a path with an optional prefix.
1291fn format_path(prefix: &str, name: &str) -> String {
1292    if prefix.is_empty() {
1293        name.to_owned()
1294    } else {
1295        format!("{prefix}/{name}")
1296    }
1297}
1298
1299/// Format a numeric mode as a zero-padded octal string.
1300fn format_mode(mode: u32) -> String {
1301    format!("{mode:06o}")
1302}
grit_lib/diff.rs

grit_lib/
diff.rs