Skip to main content

grit_lib/
diff.rs

1//! Diff machinery — compare trees, index entries, and working tree files.
2//!
3//! # Overview
4//!
5//! This module provides the core diffing infrastructure shared by `diff`,
6//! `diff-index`, `status`, `log`, `show`, `commit`, and `merge`.
7//!
8//! ## Levels of comparison
9//!
10//! 1. **Tree-to-tree** — compare two tree objects (e.g. for `log`/`show`).
11//! 2. **Tree-to-index** — compare a tree (usually HEAD) against the index
12//!    (staged changes, used by `diff --cached` and `status`).
13//! 3. **Index-to-worktree** — compare index against the working directory
14//!    (unstaged changes, used by `diff` and `status`).
15//!
16//! ## Content diff
17//!
18//! Line-level diffing uses the Myers algorithm via the `similar` crate.
19//! Output formats: unified patch, raw (`:old-mode new-mode ...`), stat,
20//! numstat.
21
22use std::fs;
23use std::os::unix::fs::MetadataExt;
24use std::path::{Path, PathBuf};
25
26use crate::error::{Error, Result};
27use crate::index::{Index, IndexEntry};
28use crate::objects::{parse_commit, parse_tree, ObjectId, ObjectKind, TreeEntry};
29use crate::odb::Odb;
30use crate::userdiff::FuncnameMatcher;
31
32/// The kind of change between two sides of a diff.
33#[derive(Debug, Clone, Copy, PartialEq, Eq)]
34pub enum DiffStatus {
35    /// File was added.
36    Added,
37    /// File was deleted.
38    Deleted,
39    /// File was modified (content or mode change).
40    Modified,
41    /// File was renamed (with optional content change).
42    Renamed,
43    /// File was copied.
44    Copied,
45    /// File type changed (e.g. regular → symlink).
46    TypeChanged,
47    /// Unmerged (conflict).
48    Unmerged,
49}
50
51impl DiffStatus {
52    /// Single-character status letter used in raw diff output.
53    #[must_use]
54    pub fn letter(&self) -> char {
55        match self {
56            Self::Added => 'A',
57            Self::Deleted => 'D',
58            Self::Modified => 'M',
59            Self::Renamed => 'R',
60            Self::Copied => 'C',
61            Self::TypeChanged => 'T',
62            Self::Unmerged => 'U',
63        }
64    }
65}
66
67/// A single diff entry representing one changed path.
68#[derive(Debug, Clone, PartialEq, Eq)]
69pub struct DiffEntry {
70    /// The status of this change.
71    pub status: DiffStatus,
72    /// Path in the "old" side (None for Added).
73    pub old_path: Option<String>,
74    /// Path in the "new" side (None for Deleted).
75    pub new_path: Option<String>,
76    /// Old file mode (as octal string, e.g. "100644").
77    pub old_mode: String,
78    /// New file mode.
79    pub new_mode: String,
80    /// Old object ID (zero OID for Added).
81    pub old_oid: ObjectId,
82    /// New object ID (zero OID for Deleted).
83    pub new_oid: ObjectId,
84    /// Similarity score (0–100) for renames/copies.
85    pub score: Option<u32>,
86}
87
88impl DiffEntry {
89    /// The primary path for display (new_path for adds, old_path for deletes).
90    #[must_use]
91    pub fn path(&self) -> &str {
92        self.new_path
93            .as_deref()
94            .or(self.old_path.as_deref())
95            .unwrap_or("")
96    }
97
98    /// Return a human-oriented path display for this entry.
99    ///
100    /// For renames and copies this returns `old -> new`; for all other entry
101    /// kinds this returns the primary path.
102    #[must_use]
103    pub fn display_path(&self) -> String {
104        match self.status {
105            DiffStatus::Renamed | DiffStatus::Copied => {
106                let old = self.old_path.as_deref().unwrap_or("");
107                let new = self.new_path.as_deref().unwrap_or("");
108                if old.is_empty() || new.is_empty() {
109                    self.path().to_owned()
110                } else {
111                    format!("{old} -> {new}")
112                }
113            }
114            _ => self.path().to_owned(),
115        }
116    }
117}
118
119/// The zero (null) object ID used for "no object" in diff output.
120pub const ZERO_OID: &str = "0000000000000000000000000000000000000000";
121
122/// Return the zero ObjectId.
123#[must_use]
124pub fn zero_oid() -> ObjectId {
125    ObjectId::from_bytes(&[0u8; 20]).unwrap_or_else(|_| {
126        // This should never fail since we pass exactly 20 bytes
127        panic!("internal error: failed to create zero OID");
128    })
129}
130
131/// Return the ObjectId for the empty blob object.
132#[must_use]
133pub fn empty_blob_oid() -> ObjectId {
134    ObjectId::from_hex("e69de29bb2d1d6434b8b29ae775ad8c2e48c5391").unwrap_or_else(|_| {
135        // This should never fail since the object ID literal is valid.
136        panic!("internal error: failed to create empty blob OID");
137    })
138}
139
140// ── Tree-to-tree diff ───────────────────────────────────────────────
141
142/// Compare two trees and return the list of changed entries.
143///
144/// # Parameters
145///
146/// - `odb` — object database to read tree objects from.
147/// - `old_tree_oid` — OID of the old tree (or `None` for comparison against empty).
148/// - `new_tree_oid` — OID of the new tree (or `None` for comparison against empty).
149/// - `prefix` — path prefix for nested tree recursion (empty string for root).
150///
151/// # Errors
152///
153/// Returns errors from object database reads.
154pub fn diff_trees(
155    odb: &Odb,
156    old_tree_oid: Option<&ObjectId>,
157    new_tree_oid: Option<&ObjectId>,
158    prefix: &str,
159) -> Result<Vec<DiffEntry>> {
160    diff_trees_opts(odb, old_tree_oid, new_tree_oid, prefix, false)
161}
162
163/// Like `diff_trees` but with `show_trees` flag: when true, emit entries for
164/// tree objects themselves in addition to their recursive contents (the `-t`
165/// flag of `diff-tree`).
166pub fn diff_trees_show_tree_entries(
167    odb: &Odb,
168    old_tree_oid: Option<&ObjectId>,
169    new_tree_oid: Option<&ObjectId>,
170    prefix: &str,
171) -> Result<Vec<DiffEntry>> {
172    diff_trees_opts(odb, old_tree_oid, new_tree_oid, prefix, true)
173}
174
175fn diff_trees_opts(
176    odb: &Odb,
177    old_tree_oid: Option<&ObjectId>,
178    new_tree_oid: Option<&ObjectId>,
179    prefix: &str,
180    show_trees: bool,
181) -> Result<Vec<DiffEntry>> {
182    let old_entries = match old_tree_oid {
183        Some(oid) => read_tree(odb, oid)?,
184        None => Vec::new(),
185    };
186    let new_entries = match new_tree_oid {
187        Some(oid) => read_tree(odb, oid)?,
188        None => Vec::new(),
189    };
190
191    let mut result = Vec::new();
192    diff_tree_entries_opts(
193        odb,
194        &old_entries,
195        &new_entries,
196        prefix,
197        show_trees,
198        &mut result,
199    )?;
200    Ok(result)
201}
202
203/// Read and parse a tree object from the ODB.
204fn read_tree(odb: &Odb, oid: &ObjectId) -> Result<Vec<TreeEntry>> {
205    let obj = odb.read(oid)?;
206    if obj.kind != ObjectKind::Tree {
207        return Err(Error::CorruptObject(format!(
208            "expected tree, got {}",
209            obj.kind.as_str()
210        )));
211    }
212    parse_tree(&obj.data)
213}
214
215/// Compare two sorted lists of tree entries, recursing into subtrees.
216fn diff_tree_entries_opts(
217    odb: &Odb,
218    old: &[TreeEntry],
219    new: &[TreeEntry],
220    prefix: &str,
221    show_trees: bool,
222    result: &mut Vec<DiffEntry>,
223) -> Result<()> {
224    let mut oi = 0;
225    let mut ni = 0;
226
227    while oi < old.len() || ni < new.len() {
228        match (old.get(oi), new.get(ni)) {
229            (Some(o), Some(n)) => {
230                let cmp = crate::objects::tree_entry_cmp(
231                    &o.name,
232                    is_tree_mode(o.mode),
233                    &n.name,
234                    is_tree_mode(n.mode),
235                );
236                match cmp {
237                    std::cmp::Ordering::Less => {
238                        // Old entry not in new → deleted
239                        emit_deleted_opts(odb, o, prefix, show_trees, result)?;
240                        oi += 1;
241                    }
242                    std::cmp::Ordering::Greater => {
243                        // New entry not in old → added
244                        emit_added_opts(odb, n, prefix, show_trees, result)?;
245                        ni += 1;
246                    }
247                    std::cmp::Ordering::Equal => {
248                        // Both present — check for changes
249                        if o.oid != n.oid || o.mode != n.mode {
250                            let name_str = String::from_utf8_lossy(&o.name);
251                            let path = format_path(prefix, &name_str);
252                            if is_tree_mode(o.mode) && is_tree_mode(n.mode) {
253                                // Both are trees
254                                if show_trees {
255                                    result.push(DiffEntry {
256                                        status: DiffStatus::Modified,
257                                        old_path: Some(path.clone()),
258                                        new_path: Some(path.clone()),
259                                        old_mode: format_mode(o.mode),
260                                        new_mode: format_mode(n.mode),
261                                        old_oid: o.oid,
262                                        new_oid: n.oid,
263                                        score: None,
264                                    });
265                                }
266                                // Recurse
267                                let nested = diff_trees_opts(
268                                    odb,
269                                    Some(&o.oid),
270                                    Some(&n.oid),
271                                    &path,
272                                    show_trees,
273                                )?;
274                                result.extend(nested);
275                            } else if is_tree_mode(o.mode) && !is_tree_mode(n.mode) {
276                                // Tree → blob: delete tree contents, add blob
277                                emit_deleted_opts(odb, o, prefix, show_trees, result)?;
278                                emit_added_opts(odb, n, prefix, show_trees, result)?;
279                            } else if !is_tree_mode(o.mode) && is_tree_mode(n.mode) {
280                                // Blob → tree: delete blob, add tree contents
281                                emit_deleted_opts(odb, o, prefix, show_trees, result)?;
282                                emit_added_opts(odb, n, prefix, show_trees, result)?;
283                            } else {
284                                // Both blobs — modified.
285                                // A mode-only change (e.g. chmod) is Modified.
286                                // TypeChanged is only for actual type changes (blob ↔ symlink).
287                                let old_type = o.mode & 0o170000;
288                                let new_type = n.mode & 0o170000;
289                                result.push(DiffEntry {
290                                    status: if old_type != new_type {
291                                        DiffStatus::TypeChanged
292                                    } else {
293                                        DiffStatus::Modified
294                                    },
295                                    old_path: Some(path.clone()),
296                                    new_path: Some(path),
297                                    old_mode: format_mode(o.mode),
298                                    new_mode: format_mode(n.mode),
299                                    old_oid: o.oid,
300                                    new_oid: n.oid,
301                                    score: None,
302                                });
303                            }
304                        }
305                        oi += 1;
306                        ni += 1;
307                    }
308                }
309            }
310            (Some(o), None) => {
311                emit_deleted_opts(odb, o, prefix, show_trees, result)?;
312                oi += 1;
313            }
314            (None, Some(n)) => {
315                emit_added_opts(odb, n, prefix, show_trees, result)?;
316                ni += 1;
317            }
318            (None, None) => break,
319        }
320    }
321
322    Ok(())
323}
324
325fn emit_deleted_opts(
326    odb: &Odb,
327    entry: &TreeEntry,
328    prefix: &str,
329    show_trees: bool,
330    result: &mut Vec<DiffEntry>,
331) -> Result<()> {
332    let name_str = String::from_utf8_lossy(&entry.name);
333    let path = format_path(prefix, &name_str);
334    if is_tree_mode(entry.mode) {
335        if show_trees {
336            result.push(DiffEntry {
337                status: DiffStatus::Deleted,
338                old_path: Some(path.clone()),
339                new_path: None,
340                old_mode: format_mode(entry.mode),
341                new_mode: "000000".to_owned(),
342                old_oid: entry.oid,
343                new_oid: zero_oid(),
344                score: None,
345            });
346        }
347        // Recurse into deleted tree
348        let nested = diff_trees_opts(odb, Some(&entry.oid), None, &path, show_trees)?;
349        result.extend(nested);
350    } else {
351        result.push(DiffEntry {
352            status: DiffStatus::Deleted,
353            old_path: Some(path.clone()),
354            new_path: None,
355            old_mode: format_mode(entry.mode),
356            new_mode: "000000".to_owned(),
357            old_oid: entry.oid,
358            new_oid: zero_oid(),
359            score: None,
360        });
361    }
362    Ok(())
363}
364
365fn emit_added_opts(
366    odb: &Odb,
367    entry: &TreeEntry,
368    prefix: &str,
369    show_trees: bool,
370    result: &mut Vec<DiffEntry>,
371) -> Result<()> {
372    let name_str = String::from_utf8_lossy(&entry.name);
373    let path = format_path(prefix, &name_str);
374    if is_tree_mode(entry.mode) {
375        if show_trees {
376            result.push(DiffEntry {
377                status: DiffStatus::Added,
378                old_path: None,
379                new_path: Some(path.clone()),
380                old_mode: "000000".to_owned(),
381                new_mode: format_mode(entry.mode),
382                old_oid: zero_oid(),
383                new_oid: entry.oid,
384                score: None,
385            });
386        }
387        // Recurse into added tree
388        let nested = diff_trees_opts(odb, None, Some(&entry.oid), &path, show_trees)?;
389        result.extend(nested);
390    } else {
391        result.push(DiffEntry {
392            status: DiffStatus::Added,
393            old_path: None,
394            new_path: Some(path),
395            old_mode: "000000".to_owned(),
396            new_mode: format_mode(entry.mode),
397            old_oid: zero_oid(),
398            new_oid: entry.oid,
399            score: None,
400        });
401    }
402    Ok(())
403}
404
405// ── Index-to-tree diff (staged changes) ─────────────────────────────
406
407/// Compare the index against a tree (usually HEAD's tree).
408///
409/// This shows "staged" changes — what would be committed.
410///
411/// # Parameters
412///
413/// - `odb` — object database.
414/// - `index` — the current index.
415/// - `tree_oid` — the tree to compare against (e.g. HEAD's tree), or `None`
416///   for comparison against an empty tree (initial commit).
417///
418/// # Errors
419///
420/// Returns errors from ODB reads.
421pub fn diff_index_to_tree(
422    odb: &Odb,
423    index: &Index,
424    tree_oid: Option<&ObjectId>,
425) -> Result<Vec<DiffEntry>> {
426    // Flatten the tree into a sorted list of (path, mode, oid)
427    let tree_entries = match tree_oid {
428        Some(oid) => flatten_tree(odb, oid, "")?,
429        None => Vec::new(),
430    };
431
432    // Build maps keyed by path
433    let mut tree_map: std::collections::BTreeMap<&str, &FlatEntry> =
434        std::collections::BTreeMap::new();
435    for entry in &tree_entries {
436        tree_map.insert(&entry.path, entry);
437    }
438
439    let mut result = Vec::new();
440    let mut stage0_paths = std::collections::BTreeSet::new();
441    let mut unmerged_modes: std::collections::BTreeMap<String, (u8, u32)> =
442        std::collections::BTreeMap::new();
443
444    // Check index entries against tree
445    for ie in &index.entries {
446        let path = String::from_utf8_lossy(&ie.path).to_string();
447        if ie.stage() == 0 && ie.intent_to_add() {
448            // Intent-to-add entries are not "staged" for diff-index / status
449            // (matches Git: `git diff --cached` is empty for `-N` paths).
450            continue;
451        }
452        if ie.stage() != 0 {
453            let rank = match ie.stage() {
454                2 => 0u8,
455                3 => 1u8,
456                1 => 2u8,
457                _ => 3u8,
458            };
459            match unmerged_modes.get(&path) {
460                Some((existing_rank, _)) if *existing_rank <= rank => {}
461                _ => {
462                    unmerged_modes.insert(path, (rank, ie.mode));
463                }
464            }
465            continue;
466        }
467        stage0_paths.insert(path.clone());
468        match tree_map.remove(path.as_str()) {
469            Some(te) => {
470                // Present in both — check for differences
471                if te.oid != ie.oid || te.mode != ie.mode {
472                    result.push(DiffEntry {
473                        status: DiffStatus::Modified,
474                        old_path: Some(path.clone()),
475                        new_path: Some(path),
476                        old_mode: format_mode(te.mode),
477                        new_mode: format_mode(ie.mode),
478                        old_oid: te.oid,
479                        new_oid: ie.oid,
480                        score: None,
481                    });
482                }
483            }
484            None => {
485                // In index but not tree → added
486                result.push(DiffEntry {
487                    status: DiffStatus::Added,
488                    old_path: None,
489                    new_path: Some(path),
490                    old_mode: "000000".to_owned(),
491                    new_mode: format_mode(ie.mode),
492                    old_oid: zero_oid(),
493                    new_oid: ie.oid,
494                    score: None,
495                });
496            }
497        }
498    }
499
500    for (path, (_, mode)) in &unmerged_modes {
501        if stage0_paths.contains(path) {
502            continue;
503        }
504        tree_map.remove(path.as_str());
505        result.push(DiffEntry {
506            status: DiffStatus::Unmerged,
507            old_path: Some(path.clone()),
508            new_path: Some(path.clone()),
509            old_mode: "000000".to_owned(),
510            new_mode: format_mode(*mode),
511            old_oid: zero_oid(),
512            new_oid: zero_oid(),
513            score: None,
514        });
515    }
516
517    // Remaining tree entries not in index → deleted
518    for (path, te) in tree_map {
519        result.push(DiffEntry {
520            status: DiffStatus::Deleted,
521            old_path: Some(path.to_owned()),
522            new_path: None,
523            old_mode: format_mode(te.mode),
524            new_mode: "000000".to_owned(),
525            old_oid: te.oid,
526            new_oid: zero_oid(),
527            score: None,
528        });
529    }
530
531    result.sort_by(|a, b| a.path().cmp(b.path()));
532    Ok(result)
533}
534
535// ── Index-to-worktree diff (unstaged changes) ───────────────────────
536
537/// Compare the index against the working tree.
538///
539/// This shows "unstaged" changes — modifications not yet staged.
540///
541/// Entries with [`IndexEntry::assume_unchanged`] or [`IndexEntry::skip_worktree`] are treated as
542/// matching the work tree without examining the filesystem (Git `CE_VALID` / skip-worktree).
543///
544/// # Parameters
545///
546/// - `odb` — object database (for hashing worktree files).
547/// - `index` — the current index.
548/// - `work_tree` — path to the working tree root.
549///
550/// # Errors
551///
552/// Returns errors from I/O or hashing.
553pub fn diff_index_to_worktree(
554    odb: &Odb,
555    index: &Index,
556    work_tree: &Path,
557) -> Result<Vec<DiffEntry>> {
558    use crate::config::ConfigSet;
559    use crate::crlf;
560
561    let git_dir = work_tree.join(".git");
562    let config = ConfigSet::load(Some(&git_dir), true).unwrap_or_else(|_| ConfigSet::new());
563    let conv = crlf::ConversionConfig::from_config(&config);
564    let attrs = crlf::load_gitattributes(work_tree);
565
566    let mut result = Vec::new();
567    let mut unmerged_base: std::collections::BTreeMap<String, (u8, &IndexEntry)> =
568        std::collections::BTreeMap::new();
569
570    for ie in &index.entries {
571        if ie.stage() != 0 {
572            let path = String::from_utf8_lossy(&ie.path).to_string();
573            let rank = match ie.stage() {
574                2 => 0u8,
575                3 => 1u8,
576                1 => 2u8,
577                _ => 3u8,
578            };
579            match unmerged_base.get(&path) {
580                Some((existing_rank, _)) if *existing_rank <= rank => {}
581                _ => {
582                    unmerged_base.insert(path, (rank, ie));
583                }
584            }
585            continue;
586        }
587        if ie.skip_worktree() {
588            // Sparse checkout: paths outside the cone are not expected to exist on disk.
589            // Git's status omits them from the index↔worktree diff (wt-status.c).
590            continue;
591        }
592        // Use str slice directly to avoid allocation for path joining;
593        // only allocate String if we need it for DiffEntry output.
594        let path_str_ref = std::str::from_utf8(&ie.path).unwrap_or("");
595        let is_intent_to_add = ie.intent_to_add();
596
597        if ie.assume_unchanged() || ie.skip_worktree() {
598            continue;
599        }
600
601        // Gitlink entries (submodules): compare checked-out HEAD to the recorded commit.
602        // An uninitialized path (no `.git` in the directory) is not dirty — same as Git.
603        if ie.mode == 0o160000 {
604            let sub_dir = work_tree.join(path_str_ref);
605            let sub_head_oid = read_submodule_head_oid(&sub_dir);
606            let matches_index = match sub_head_oid {
607                Some(oid) => oid == ie.oid,
608                None => submodule_worktree_is_unpopulated_placeholder(&sub_dir),
609            };
610            if !matches_index {
611                let path_owned = path_str_ref.to_owned();
612                let new_oid = sub_head_oid.unwrap_or_else(zero_oid);
613                result.push(DiffEntry {
614                    status: DiffStatus::Modified,
615                    old_path: Some(path_owned.clone()),
616                    new_path: Some(path_owned),
617                    old_mode: format_mode(ie.mode),
618                    new_mode: format_mode(ie.mode),
619                    old_oid: ie.oid,
620                    new_oid,
621                    score: None,
622                });
623            }
624            continue;
625        }
626
627        let file_path = work_tree.join(path_str_ref);
628
629        if is_intent_to_add {
630            match fs::symlink_metadata(&file_path) {
631                Ok(meta) => {
632                    let file_attrs = crlf::get_file_attrs(&attrs, path_str_ref, false, &config);
633                    let worktree_oid = hash_worktree_file(
634                        odb,
635                        &file_path,
636                        &meta,
637                        &conv,
638                        &file_attrs,
639                        path_str_ref,
640                        None,
641                    )?;
642                    let worktree_mode = mode_from_metadata(&meta);
643                    result.push(DiffEntry {
644                        status: DiffStatus::Added,
645                        old_path: None,
646                        new_path: Some(path_str_ref.to_owned()),
647                        old_mode: "000000".to_owned(),
648                        new_mode: format_mode(worktree_mode),
649                        old_oid: zero_oid(),
650                        new_oid: worktree_oid,
651                        score: None,
652                    });
653                }
654                Err(e)
655                    if e.kind() == std::io::ErrorKind::NotFound
656                        || e.raw_os_error() == Some(20) /* ENOTDIR */ =>
657                {
658                    result.push(DiffEntry {
659                        status: DiffStatus::Deleted,
660                        old_path: Some(path_str_ref.to_owned()),
661                        new_path: None,
662                        old_mode: format_mode(ie.mode),
663                        new_mode: "000000".to_owned(),
664                        old_oid: empty_blob_oid(),
665                        new_oid: zero_oid(),
666                        score: None,
667                    });
668                }
669                Err(e) => return Err(Error::Io(e)),
670            }
671            continue;
672        }
673
674        // If any parent component of the path is a symlink, the file is effectively
675        // deleted from the working tree (a symlink replaced a directory).
676        if has_symlink_in_path(work_tree, path_str_ref) {
677            result.push(DiffEntry {
678                status: DiffStatus::Deleted,
679                old_path: Some(path_str_ref.to_owned()),
680                new_path: None,
681                old_mode: format_mode(ie.mode),
682                new_mode: "000000".to_owned(),
683                old_oid: ie.oid,
684                new_oid: zero_oid(),
685                score: None,
686            });
687            continue;
688        }
689
690        match fs::symlink_metadata(&file_path) {
691            Ok(meta) if meta.is_dir() => {
692                // A directory exists where the index expects a file.
693                // Treat as a type change: the indexed file is effectively deleted.
694                result.push(DiffEntry {
695                    status: DiffStatus::Deleted,
696                    old_path: Some(path_str_ref.to_owned()),
697                    new_path: None,
698                    old_mode: format_mode(ie.mode),
699                    new_mode: String::new(),
700                    old_oid: ie.oid,
701                    new_oid: zero_oid(),
702                    score: None,
703                });
704            }
705            Ok(meta) => {
706                let worktree_mode = mode_from_metadata(&meta);
707                // Mode-only change: stat still matches the index entry but executable bit differs.
708                if stat_matches(ie, &meta) && worktree_mode != ie.mode {
709                    let path_owned = path_str_ref.to_owned();
710                    result.push(DiffEntry {
711                        status: DiffStatus::Modified,
712                        old_path: Some(path_owned.clone()),
713                        new_path: Some(path_owned),
714                        old_mode: format_mode(ie.mode),
715                        new_mode: format_mode(worktree_mode),
716                        old_oid: ie.oid,
717                        new_oid: ie.oid,
718                        score: None,
719                    });
720                    continue;
721                }
722
723                // Hash the worktree blob. We cannot skip hashing when stat matches: two different
724                // contents can share the same size (e.g. `foo` vs `bar`), and rapid edits can
725                // leave timestamps looking "unchanged" relative to the index (t4015-diff-whitespace).
726                let file_attrs = crlf::get_file_attrs(&attrs, path_str_ref, false, &config);
727                let worktree_oid = hash_worktree_file(
728                    odb,
729                    &file_path,
730                    &meta,
731                    &conv,
732                    &file_attrs,
733                    path_str_ref,
734                    Some(ie),
735                )?;
736
737                // If clean conversion disagrees with the index but raw bytes match the
738                // blob (e.g. mixed line endings committed with autocrlf off), Git reports
739                // no diff (t0020: touch + git diff --exit-code).
740                let mut eff_oid = worktree_oid;
741                if eff_oid != ie.oid {
742                    if let Ok(raw) = fs::read(&file_path) {
743                        let raw_oid = Odb::hash_object_data(ObjectKind::Blob, &raw);
744                        if raw_oid == ie.oid {
745                            eff_oid = ie.oid;
746                        }
747                    }
748                }
749
750                if eff_oid != ie.oid || worktree_mode != ie.mode {
751                    let path_owned = path_str_ref.to_owned();
752                    result.push(DiffEntry {
753                        status: DiffStatus::Modified,
754                        old_path: Some(path_owned.clone()),
755                        new_path: Some(path_owned),
756                        old_mode: format_mode(ie.mode),
757                        new_mode: format_mode(worktree_mode),
758                        old_oid: ie.oid,
759                        new_oid: eff_oid,
760                    score: None,
761                    });
762                }
763            }
764            Err(e) if e.kind() == std::io::ErrorKind::NotFound
765                || e.raw_os_error() == Some(20) /* ENOTDIR */ => {
766                // File deleted from working tree (or parent replaced by a file)
767                result.push(DiffEntry {
768                    status: DiffStatus::Deleted,
769                    old_path: Some(path_str_ref.to_owned()),
770                    new_path: None,
771                    old_mode: format_mode(ie.mode),
772                    new_mode: "000000".to_owned(),
773                    old_oid: ie.oid,
774                    new_oid: zero_oid(),
775                    score: None,
776                });
777            }
778            Err(e) => return Err(Error::Io(e)),
779        }
780    }
781
782    for (path, (_, base_entry)) in unmerged_base {
783        let file_path = work_tree.join(&path);
784        let wt_meta = match fs::symlink_metadata(&file_path) {
785            Ok(meta) => Some(meta),
786            Err(e)
787                if e.kind() == std::io::ErrorKind::NotFound
788                    || e.raw_os_error() == Some(20) /* ENOTDIR */ =>
789            {
790                None
791            }
792            Err(e) => return Err(Error::Io(e)),
793        };
794
795        let new_mode = wt_meta.as_ref().map_or_else(
796            || "000000".to_owned(),
797            |meta| format_mode(mode_from_metadata(meta)),
798        );
799        result.push(DiffEntry {
800            status: DiffStatus::Unmerged,
801            old_path: Some(path.clone()),
802            new_path: Some(path.clone()),
803            old_mode: "000000".to_owned(),
804            new_mode,
805            old_oid: zero_oid(),
806            new_oid: zero_oid(),
807            score: None,
808        });
809
810        if let Some(meta) = wt_meta {
811            let file_attrs = crlf::get_file_attrs(&attrs, &path, false, &config);
812            let wt_oid = hash_worktree_file(
813                odb,
814                &file_path,
815                &meta,
816                &conv,
817                &file_attrs,
818                &path,
819                Some(base_entry),
820            )?;
821            let wt_mode = mode_from_metadata(&meta);
822            if wt_oid != base_entry.oid || wt_mode != base_entry.mode {
823                result.push(DiffEntry {
824                    status: DiffStatus::Modified,
825                    old_path: Some(path.clone()),
826                    new_path: Some(path),
827                    old_mode: format_mode(base_entry.mode),
828                    new_mode: format_mode(wt_mode),
829                    old_oid: base_entry.oid,
830                    new_oid: wt_oid,
831                    score: None,
832                });
833            }
834        }
835    }
836
837    Ok(result)
838}
839
840/// Quick stat check: does the index entry's cached stat data match the file?
841/// Returns true when the file at `ie`'s path differs from the index entry (mode or blob).
842///
843/// Used by commands such as `git mv` to detect "dirty" paths under sparse checkout.
844/// Symlinks and submodules are compared in a Git-compatible way.
845pub fn worktree_differs_from_index_entry(
846    odb: &Odb,
847    work_tree: &Path,
848    ie: &IndexEntry,
849) -> Result<bool> {
850    use crate::config::ConfigSet;
851    use crate::crlf;
852
853    let path_str_ref = std::str::from_utf8(&ie.path).unwrap_or("");
854    let file_path = work_tree.join(path_str_ref);
855
856    if ie.mode == 0o160000 {
857        let sub_head_oid = read_submodule_head(&file_path);
858        return Ok(sub_head_oid.as_ref() != Some(&ie.oid));
859    }
860
861    let meta = match fs::symlink_metadata(&file_path) {
862        Ok(m) => m,
863        Err(e)
864            if e.kind() == std::io::ErrorKind::NotFound
865                || e.raw_os_error() == Some(20) /* ENOTDIR */ =>
866        {
867            return Ok(true);
868        }
869        Err(e) => return Err(Error::Io(e)),
870    };
871
872    if meta.is_dir() {
873        return Ok(true);
874    }
875
876    let worktree_mode = mode_from_metadata(&meta);
877    if worktree_mode != ie.mode {
878        return Ok(true);
879    }
880
881    let git_dir = work_tree.join(".git");
882    let config = ConfigSet::load(Some(&git_dir), true).unwrap_or_else(|_| ConfigSet::new());
883    let conv = crlf::ConversionConfig::from_config(&config);
884    let attrs = crlf::load_gitattributes(work_tree);
885    let file_attrs = crlf::get_file_attrs(&attrs, path_str_ref, false, &config);
886    let worktree_oid = hash_worktree_file(
887        odb,
888        &file_path,
889        &meta,
890        &conv,
891        &file_attrs,
892        path_str_ref,
893        Some(ie),
894    )?;
895
896    let mut eff_oid = worktree_oid;
897    if eff_oid != ie.oid {
898        if let Ok(raw) = fs::read(&file_path) {
899            let raw_oid = Odb::hash_object_data(ObjectKind::Blob, &raw);
900            if raw_oid == ie.oid {
901                eff_oid = ie.oid;
902            }
903        }
904    }
905
906    Ok(eff_oid != ie.oid)
907}
908
909pub fn stat_matches(ie: &IndexEntry, meta: &fs::Metadata) -> bool {
910    // Compare size
911    if meta.len() as u32 != ie.size {
912        return false;
913    }
914    // Compare mtime (seconds + nanoseconds)
915    if meta.mtime() as u32 != ie.mtime_sec {
916        return false;
917    }
918    if meta.mtime_nsec() as u32 != ie.mtime_nsec {
919        return false;
920    }
921    // Compare ctime (seconds + nanoseconds)
922    if meta.ctime() as u32 != ie.ctime_sec {
923        return false;
924    }
925    if meta.ctime_nsec() as u32 != ie.ctime_nsec {
926        return false;
927    }
928    // Compare inode and device
929    if meta.ino() as u32 != ie.ino {
930        return false;
931    }
932    if meta.dev() as u32 != ie.dev {
933        return false;
934    }
935    true
936}
937
938/// Hash a working tree file as a blob to get its OID.
939/// Check if any parent component of `rel_path` (relative to `work_tree`) is a symlink.
940fn has_symlink_in_path(work_tree: &Path, rel_path: &str) -> bool {
941    let mut check = work_tree.to_path_buf();
942    let components: Vec<&str> = rel_path.split('/').collect();
943    // Check all components except the last one (which is the file itself)
944    for component in &components[..components.len().saturating_sub(1)] {
945        check.push(component);
946        match fs::symlink_metadata(&check) {
947            Ok(meta) if meta.file_type().is_symlink() => return true,
948            _ => {}
949        }
950    }
951    false
952}
953
954pub fn hash_worktree_file(
955    odb: &Odb,
956    path: &Path,
957    meta: &fs::Metadata,
958    conv: &crate::crlf::ConversionConfig,
959    file_attrs: &crate::crlf::FileAttrs,
960    rel_path: &str,
961    index_entry: Option<&IndexEntry>,
962) -> Result<ObjectId> {
963    let prior_blob: Option<Vec<u8>> = index_entry
964        .filter(|e| e.oid != zero_oid())
965        .and_then(|e| odb.read(&e.oid).ok().map(|o| o.data));
966    let data = if meta.file_type().is_symlink() {
967        // For symlinks, hash the target path
968        let target = fs::read_link(path)?;
969        target.to_string_lossy().into_owned().into_bytes()
970    } else {
971        let raw = fs::read(path)?;
972        // Apply clean conversion (CRLF→LF) so hash matches index blob.
973        // Do not run safecrlf here: diff/commit use this for hashing and must not print warnings.
974        let opts = crate::crlf::ConvertToGitOpts {
975            index_blob: prior_blob.as_deref(),
976            renormalize: false,
977            check_safecrlf: false,
978        };
979        crate::crlf::convert_to_git_with_opts(&raw, rel_path, conv, file_attrs, opts).unwrap_or(raw)
980    };
981
982    Ok(Odb::hash_object_data(ObjectKind::Blob, &data))
983}
984
985/// Derive a Git file mode from filesystem metadata.
986pub fn mode_from_metadata(meta: &fs::Metadata) -> u32 {
987    if meta.file_type().is_symlink() {
988        0o120000
989    } else if meta.mode() & 0o111 != 0 {
990        0o100755
991    } else {
992        0o100644
993    }
994}
995
996/// Compare a tree against the working tree.
997///
998/// Shows changes from `tree_oid` to the current working directory state.
999/// Files tracked in the index but not in the tree are shown as Added.
1000/// Files in the tree but missing from the working tree are shown as Deleted.
1001///
1002/// # Parameters
1003///
1004/// - `odb` — object database.
1005/// - `tree_oid` — the tree to compare against (`None` for empty tree).
1006/// - `work_tree` — path to the working tree root.
1007/// - `index` — current index (used to discover new tracked files not in tree).
1008///
1009/// # Errors
1010///
1011/// Returns errors from ODB reads or I/O.
1012pub fn diff_tree_to_worktree(
1013    odb: &Odb,
1014    tree_oid: Option<&ObjectId>,
1015    work_tree: &Path,
1016    index: &Index,
1017) -> Result<Vec<DiffEntry>> {
1018    use crate::config::ConfigSet;
1019    use crate::crlf;
1020
1021    let git_dir = work_tree.join(".git");
1022    let config = ConfigSet::load(Some(&git_dir), true).unwrap_or_else(|_| ConfigSet::new());
1023    let conv = crlf::ConversionConfig::from_config(&config);
1024    let attrs = crlf::load_gitattributes(work_tree);
1025
1026    // Flatten the tree into a BTreeMap keyed by path
1027    let tree_flat = match tree_oid {
1028        Some(oid) => flatten_tree(odb, oid, "")?,
1029        None => Vec::new(),
1030    };
1031    let tree_map: std::collections::BTreeMap<String, &FlatEntry> =
1032        tree_flat.iter().map(|e| (e.path.clone(), e)).collect();
1033
1034    // Build index lookup: path → &IndexEntry (stage 0 only)
1035    let mut index_entries: std::collections::BTreeMap<&[u8], &IndexEntry> =
1036        std::collections::BTreeMap::new();
1037    let mut index_paths: std::collections::BTreeSet<String> = std::collections::BTreeSet::new();
1038    for ie in &index.entries {
1039        if ie.stage() != 0 {
1040            continue;
1041        }
1042        let path = String::from_utf8_lossy(&ie.path).to_string();
1043        index_entries.insert(&ie.path, ie);
1044        index_paths.insert(path);
1045    }
1046
1047    // Union of tree paths + index paths
1048    let mut all_paths: std::collections::BTreeSet<String> = std::collections::BTreeSet::new();
1049    all_paths.extend(tree_map.keys().cloned());
1050    all_paths.extend(index_paths.iter().cloned());
1051
1052    let mut result = Vec::new();
1053
1054    for path in &all_paths {
1055        let tree_entry = tree_map.get(path.as_str());
1056
1057        // Gitlink entries (submodules) — compare HEAD commit, not file content.
1058        let is_gitlink = tree_entry.is_some_and(|te| te.mode == 0o160000)
1059            || index_entries
1060                .get(path.as_bytes())
1061                .is_some_and(|ie| ie.mode == 0o160000);
1062        if is_gitlink {
1063            if let Some(te) = tree_entry {
1064                let sub_dir = work_tree.join(path);
1065                let sub_head = read_submodule_head_oid(&sub_dir);
1066                if sub_head.as_ref() != Some(&te.oid) {
1067                    let new_oid = sub_head.unwrap_or_else(zero_oid);
1068                    result.push(DiffEntry {
1069                        status: DiffStatus::Modified,
1070                        old_path: Some(path.clone()),
1071                        new_path: Some(path.clone()),
1072                        old_mode: format_mode(te.mode),
1073                        new_mode: format_mode(te.mode),
1074                        old_oid: te.oid,
1075                        new_oid,
1076                        score: None,
1077                    });
1078                }
1079            }
1080            continue;
1081        }
1082
1083        let file_path = work_tree.join(path);
1084
1085        let wt_meta = match fs::symlink_metadata(&file_path) {
1086            Ok(m) => Some(m),
1087            Err(e) if e.kind() == std::io::ErrorKind::NotFound => None,
1088            Err(e) => return Err(Error::Io(e)),
1089        };
1090
1091        match (tree_entry, wt_meta) {
1092            (Some(te), Some(ref meta)) => {
1093                let wt_mode = mode_from_metadata(meta);
1094                let Some(ie) = index_entries.get(path.as_bytes()) else {
1095                    continue;
1096                };
1097
1098                let index_matches_tree = ie.oid == te.oid && ie.mode == te.mode;
1099
1100                // Fully clean: index matches `HEAD`, worktree matches index, stat cache fresh.
1101                if index_matches_tree && wt_mode == te.mode && stat_matches(ie, meta) {
1102                    continue;
1103                }
1104
1105                let file_attrs = crlf::get_file_attrs(&attrs, path, false, &config);
1106                let idx_ent = index_entries.get(path.as_bytes()).copied();
1107
1108                // Staged mode (same blob as `HEAD`, different mode recorded in the index).
1109                if ie.oid == te.oid && ie.mode != te.mode {
1110                    result.push(DiffEntry {
1111                        status: DiffStatus::Modified,
1112                        old_path: Some(path.clone()),
1113                        new_path: Some(path.clone()),
1114                        old_mode: format_mode(te.mode),
1115                        new_mode: format_mode(ie.mode),
1116                        old_oid: te.oid,
1117                        new_oid: te.oid,
1118                        score: None,
1119                    });
1120                    continue;
1121                }
1122
1123                // Index still matches `HEAD`: only unstaged worktree drift (content and/or
1124                // worktree-only exec bit when `update-index` was not run — t4049 harness).
1125                if index_matches_tree {
1126                    let wt_oid = hash_worktree_file(
1127                        odb,
1128                        &file_path,
1129                        meta,
1130                        &conv,
1131                        &file_attrs,
1132                        path,
1133                        idx_ent,
1134                    )?;
1135                    let mut eff_oid = wt_oid;
1136                    if eff_oid != te.oid {
1137                        if let Ok(raw) = fs::read(&file_path) {
1138                            let raw_oid = Odb::hash_object_data(ObjectKind::Blob, &raw);
1139                            if raw_oid == te.oid {
1140                                eff_oid = te.oid;
1141                            }
1142                        }
1143                    }
1144                    if eff_oid != te.oid {
1145                        result.push(DiffEntry {
1146                            status: DiffStatus::Modified,
1147                            old_path: Some(path.clone()),
1148                            new_path: Some(path.clone()),
1149                            old_mode: format_mode(te.mode),
1150                            new_mode: format_mode(wt_mode),
1151                            old_oid: te.oid,
1152                            new_oid: eff_oid,
1153                            score: None,
1154                        });
1155                    } else if wt_mode != te.mode {
1156                        result.push(DiffEntry {
1157                            status: DiffStatus::Modified,
1158                            old_path: Some(path.clone()),
1159                            new_path: Some(path.clone()),
1160                            old_mode: format_mode(te.mode),
1161                            new_mode: format_mode(wt_mode),
1162                            old_oid: te.oid,
1163                            new_oid: te.oid,
1164                            score: None,
1165                        });
1166                    }
1167                    continue;
1168                }
1169
1170                // Staged content (and possibly mode): `git diff <rev>` is tree vs working tree.
1171                let wt_oid =
1172                    hash_worktree_file(odb, &file_path, meta, &conv, &file_attrs, path, idx_ent)?;
1173                let mut eff_oid = wt_oid;
1174                if eff_oid != te.oid {
1175                    if let Ok(raw) = fs::read(&file_path) {
1176                        let raw_oid = Odb::hash_object_data(ObjectKind::Blob, &raw);
1177                        if raw_oid == te.oid {
1178                            eff_oid = te.oid;
1179                        }
1180                    }
1181                }
1182                if eff_oid != te.oid || wt_mode != te.mode {
1183                    result.push(DiffEntry {
1184                        status: DiffStatus::Modified,
1185                        old_path: Some(path.clone()),
1186                        new_path: Some(path.clone()),
1187                        old_mode: format_mode(te.mode),
1188                        new_mode: format_mode(wt_mode),
1189                        old_oid: te.oid,
1190                        new_oid: eff_oid,
1191                        score: None,
1192                    });
1193                }
1194            }
1195            (Some(te), None) => {
1196                // In tree but missing from worktree
1197                result.push(DiffEntry {
1198                    status: DiffStatus::Deleted,
1199                    old_path: Some(path.clone()),
1200                    new_path: None,
1201                    old_mode: format_mode(te.mode),
1202                    new_mode: "000000".to_owned(),
1203                    old_oid: te.oid,
1204                    new_oid: zero_oid(),
1205                    score: None,
1206                });
1207            }
1208            (None, Some(ref meta)) => {
1209                // In index but not in tree, and exists in worktree
1210                let file_attrs = crlf::get_file_attrs(&attrs, path, false, &config);
1211                let wt_oid = hash_worktree_file(
1212                    odb,
1213                    &file_path,
1214                    meta,
1215                    &conv,
1216                    &file_attrs,
1217                    path,
1218                    index_entries.get(path.as_bytes()).copied(),
1219                )?;
1220                let wt_mode = mode_from_metadata(meta);
1221                result.push(DiffEntry {
1222                    status: DiffStatus::Added,
1223                    old_path: None,
1224                    new_path: Some(path.clone()),
1225                    old_mode: "000000".to_owned(),
1226                    new_mode: format_mode(wt_mode),
1227                    old_oid: zero_oid(),
1228                    new_oid: wt_oid,
1229                    score: None,
1230                });
1231            }
1232            (None, None) => {
1233                // Tracked in index but neither in tree nor worktree — skip
1234            }
1235        }
1236    }
1237
1238    result.sort_by(|a, b| a.path().cmp(b.path()));
1239    Ok(result)
1240}
1241
1242// ── Rename detection ────────────────────────────────────────────────
1243
1244/// Detect renames by pairing Deleted and Added entries with similar content.
1245///
1246/// `threshold` is the minimum similarity percentage (0–100) for a pair to
1247/// be considered a rename (Git's default is 50%).  The function reads blob
1248/// content from the ODB to compute a line-level similarity score.
1249///
1250/// Exact-OID matches are always 100% similar regardless of content.
1251pub fn detect_renames(odb: &Odb, entries: Vec<DiffEntry>, threshold: u32) -> Vec<DiffEntry> {
1252    // Split entries into deleted, added, and others.
1253    let mut deleted: Vec<DiffEntry> = Vec::new();
1254    let mut added: Vec<DiffEntry> = Vec::new();
1255    let mut others: Vec<DiffEntry> = Vec::new();
1256
1257    for entry in entries {
1258        match entry.status {
1259            DiffStatus::Deleted => deleted.push(entry),
1260            DiffStatus::Added => added.push(entry),
1261            _ => others.push(entry),
1262        }
1263    }
1264
1265    if deleted.is_empty() || added.is_empty() {
1266        // Nothing to pair — return original order.
1267        let mut result = others;
1268        result.extend(deleted);
1269        result.extend(added);
1270        result.sort_by(|a, b| a.path().cmp(b.path()));
1271        return result;
1272    }
1273
1274    // Read content for all deleted blobs.
1275    let deleted_contents: Vec<Option<Vec<u8>>> = deleted
1276        .iter()
1277        .map(|d| odb.read(&d.old_oid).ok().map(|obj| obj.data))
1278        .collect();
1279
1280    // Read content for all added blobs.
1281    let added_contents: Vec<Option<Vec<u8>>> = added
1282        .iter()
1283        .map(|a| odb.read(&a.new_oid).ok().map(|obj| obj.data))
1284        .collect();
1285
1286    // Build a matrix of similarity scores and find the best pairings.
1287    // We use a greedy approach: pick the highest-scoring pair first.
1288    let mut scores: Vec<(u32, usize, usize)> = Vec::new();
1289
1290    for (di, del) in deleted.iter().enumerate() {
1291        for (ai, add) in added.iter().enumerate() {
1292            // Exact OID match → 100%
1293            if del.old_oid == add.new_oid {
1294                scores.push((100, di, ai));
1295                continue;
1296            }
1297
1298            let score = match (&deleted_contents[di], &added_contents[ai]) {
1299                (Some(old_data), Some(new_data)) => compute_similarity(old_data, new_data),
1300                _ => 0,
1301            };
1302
1303            if score >= threshold {
1304                scores.push((score, di, ai));
1305            }
1306        }
1307    }
1308
1309    // Sort: prefer same-basename pairs first, then by score descending.
1310    // This matches Git's behavior where basename matches are checked first.
1311    scores.sort_by(|a, b| {
1312        let a_same = same_basename(&deleted[a.1], &added[a.2]);
1313        let b_same = same_basename(&deleted[b.1], &added[b.2]);
1314        b_same.cmp(&a_same).then_with(|| b.0.cmp(&a.0))
1315    });
1316
1317    let mut used_deleted = vec![false; deleted.len()];
1318    let mut used_added = vec![false; added.len()];
1319    let mut renames: Vec<DiffEntry> = Vec::new();
1320
1321    for (score, di, ai) in &scores {
1322        if used_deleted[*di] || used_added[*ai] {
1323            continue;
1324        }
1325        used_deleted[*di] = true;
1326        used_added[*ai] = true;
1327
1328        let del = &deleted[*di];
1329        let add = &added[*ai];
1330
1331        renames.push(DiffEntry {
1332            status: DiffStatus::Renamed,
1333            old_path: del.old_path.clone(),
1334            new_path: add.new_path.clone(),
1335            old_mode: del.old_mode.clone(),
1336            new_mode: add.new_mode.clone(),
1337            old_oid: del.old_oid,
1338            new_oid: add.new_oid,
1339            score: Some(*score),
1340        });
1341    }
1342
1343    // Collect unmatched entries.
1344    let mut result = others;
1345    result.extend(renames);
1346    for (i, entry) in deleted.into_iter().enumerate() {
1347        if !used_deleted[i] {
1348            result.push(entry);
1349        }
1350    }
1351    for (i, entry) in added.into_iter().enumerate() {
1352        if !used_added[i] {
1353            result.push(entry);
1354        }
1355    }
1356
1357    result.sort_by(|a, b| a.path().cmp(b.path()));
1358    result
1359}
1360
1361/// Detect copies among diff entries.
1362///
1363/// This first runs rename detection (pairing Deleted+Added), then for any
1364/// remaining Added entries, looks for copy sources.
1365///
1366/// - `find_copies_harder` = false: only Modified entries are copy source candidates.
1367/// - `find_copies_harder` = true: also examine unmodified files from `source_tree_entries`.
1368///
1369/// `source_tree_entries` should be a list of (path, mode, oid) from the source tree;
1370/// used when `find_copies_harder` is true to consider unmodified files as copy sources.
1371pub fn detect_copies(
1372    odb: &Odb,
1373    entries: Vec<DiffEntry>,
1374    threshold: u32,
1375    find_copies_harder: bool,
1376    source_tree_entries: &[(String, String, ObjectId)],
1377) -> Vec<DiffEntry> {
1378    use std::collections::{HashMap, HashSet};
1379
1380    // Separate entries by status.
1381    let mut deleted: Vec<DiffEntry> = Vec::new();
1382    let mut added: Vec<DiffEntry> = Vec::new();
1383    let mut others: Vec<DiffEntry> = Vec::new();
1384
1385    for entry in entries {
1386        match entry.status {
1387            DiffStatus::Deleted => deleted.push(entry),
1388            DiffStatus::Added => added.push(entry),
1389            _ => others.push(entry),
1390        }
1391    }
1392
1393    if added.is_empty() {
1394        let mut result = others;
1395        result.extend(deleted);
1396        result.sort_by(|a, b| a.path().cmp(b.path()));
1397        return result;
1398    }
1399
1400    // Build source candidates: deleted files, modified files, and optionally tree entries.
1401    // Track which sources are from deleted files (can become renames).
1402    let mut sources: Vec<(String, ObjectId, bool)> = Vec::new(); // (path, oid, is_deleted)
1403    let mut deleted_source_idx: HashMap<String, usize> = HashMap::new();
1404
1405    for entry in &deleted {
1406        if let Some(ref path) = entry.old_path {
1407            deleted_source_idx.insert(path.clone(), sources.len());
1408            sources.push((path.clone(), entry.old_oid, true));
1409        }
1410    }
1411
1412    // Modified files are always candidates for -C.
1413    for entry in &others {
1414        if entry.status == DiffStatus::Modified {
1415            if let Some(ref old_path) = entry.old_path {
1416                if !sources.iter().any(|(p, _, _)| p == old_path) {
1417                    sources.push((old_path.clone(), entry.old_oid, false));
1418                }
1419            }
1420        }
1421    }
1422
1423    // With find_copies_harder, add all source tree entries.
1424    if find_copies_harder {
1425        for (path, _mode, oid) in source_tree_entries {
1426            if !sources.iter().any(|(p, _, _)| p == path) {
1427                sources.push((path.clone(), *oid, false));
1428            }
1429        }
1430    }
1431
1432    if sources.is_empty() {
1433        let mut result = others;
1434        result.extend(deleted);
1435        result.extend(added);
1436        result.sort_by(|a, b| a.path().cmp(b.path()));
1437        return result;
1438    }
1439
1440    // Read content for sources.
1441    let source_contents: Vec<Option<Vec<u8>>> = sources
1442        .iter()
1443        .map(|(_, oid, _)| odb.read(oid).ok().map(|obj| obj.data))
1444        .collect();
1445
1446    // Read content for added blobs.
1447    let added_contents: Vec<Option<Vec<u8>>> = added
1448        .iter()
1449        .map(|a| odb.read(&a.new_oid).ok().map(|obj| obj.data))
1450        .collect();
1451
1452    // Build score matrix: (score, source_idx, added_idx)
1453    let mut scores: Vec<(u32, usize, usize)> = Vec::new();
1454    for (si, (src_path, src_oid, _)) in sources.iter().enumerate() {
1455        for (ai, add) in added.iter().enumerate() {
1456            // Never pair a path with itself as copy source (matches Git; avoids
1457            // arbitrary tie-breaking when several sources share the same blob).
1458            if add.new_path.as_deref() == Some(src_path.as_str()) {
1459                continue;
1460            }
1461            if *src_oid == add.new_oid {
1462                scores.push((100, si, ai));
1463                continue;
1464            }
1465            let score = match (&source_contents[si], &added_contents[ai]) {
1466                (Some(old_data), Some(new_data)) => compute_similarity(old_data, new_data),
1467                _ => 0,
1468            };
1469            if score >= threshold {
1470                scores.push((score, si, ai));
1471            }
1472        }
1473    }
1474
1475    // Sort by score descending.
1476    scores.sort_by(|a, b| b.0.cmp(&a.0));
1477
1478    // Build source->added mappings, each added file assigned to best source.
1479    let mut used_added = vec![false; added.len()];
1480    let mut source_to_added: HashMap<usize, Vec<(usize, u32)>> = HashMap::new();
1481    for &(score, si, ai) in &scores {
1482        if used_added[ai] {
1483            continue;
1484        }
1485        used_added[ai] = true;
1486        source_to_added.entry(si).or_default().push((ai, score));
1487    }
1488
1489    // Determine which become Rename vs Copy.
1490    // For each deleted source, at most one assignment becomes a Rename;
1491    // the rest become Copies. Pick the last alphabetically as rename target.
1492    let mut used_added2 = vec![false; added.len()];
1493    let mut result_entries: Vec<DiffEntry> = Vec::new();
1494
1495    // Track which deleted sources got a rename.
1496    let mut renamed_deleted: HashSet<usize> = HashSet::new();
1497
1498    // For each deleted source, pick one assignment as Rename, rest as Copy.
1499    for (&si, assignments_for_src) in &source_to_added {
1500        let (_, _, is_deleted) = &sources[si];
1501        if *is_deleted && !assignments_for_src.is_empty() {
1502            // Pick the last one (by path) as the rename target.
1503            // Git tends to pick the rename as the last alphabetically.
1504            let rename_ai = assignments_for_src
1505                .iter()
1506                .max_by_key(|(ai, _score)| added[*ai].path().to_string())
1507                .map(|(ai, _)| *ai);
1508
1509            for &(ai, score) in assignments_for_src {
1510                let (ref src_path, _, _) = sources[si];
1511                let add = &added[ai];
1512                let src_mode = source_tree_entries
1513                    .iter()
1514                    .find(|(p, _, _)| p == src_path)
1515                    .map(|(_, m, _)| m.clone())
1516                    .unwrap_or_else(|| add.old_mode.clone());
1517
1518                let is_rename = Some(ai) == rename_ai;
1519                result_entries.push(DiffEntry {
1520                    status: if is_rename {
1521                        DiffStatus::Renamed
1522                    } else {
1523                        DiffStatus::Copied
1524                    },
1525                    old_path: Some(src_path.clone()),
1526                    new_path: add.new_path.clone(),
1527                    old_mode: src_mode,
1528                    new_mode: add.new_mode.clone(),
1529                    old_oid: sources[si].1,
1530                    new_oid: add.new_oid,
1531                    score: Some(score),
1532                });
1533                used_added2[ai] = true;
1534            }
1535            renamed_deleted.insert(si);
1536        } else {
1537            // Non-deleted source: all assignments are copies.
1538            for &(ai, score) in assignments_for_src {
1539                let (ref src_path, _, _) = sources[si];
1540                let add = &added[ai];
1541                let src_mode = source_tree_entries
1542                    .iter()
1543                    .find(|(p, _, _)| p == src_path)
1544                    .map(|(_, m, _)| m.clone())
1545                    .unwrap_or_else(|| add.old_mode.clone());
1546
1547                result_entries.push(DiffEntry {
1548                    status: DiffStatus::Copied,
1549                    old_path: Some(src_path.clone()),
1550                    new_path: add.new_path.clone(),
1551                    old_mode: src_mode,
1552                    new_mode: add.new_mode.clone(),
1553                    old_oid: sources[si].1,
1554                    new_oid: add.new_oid,
1555                    score: Some(score),
1556                });
1557                used_added2[ai] = true;
1558            }
1559        }
1560    }
1561
1562    // Keep deleted entries that weren't consumed by a rename.
1563    for entry in deleted.into_iter() {
1564        if let Some(ref path) = entry.old_path {
1565            if let Some(&si) = deleted_source_idx.get(path) {
1566                if renamed_deleted.contains(&si) {
1567                    // This deletion was consumed by a rename; skip it.
1568                    continue;
1569                }
1570            }
1571        }
1572        result_entries.push(entry);
1573    }
1574
1575    let mut result = others;
1576    result.extend(result_entries);
1577    // Keep unmatched added entries.
1578    for (i, entry) in added.into_iter().enumerate() {
1579        if !used_added2[i] {
1580            result.push(entry);
1581        }
1582    }
1583
1584    result.sort_by(|a, b| a.path().cmp(b.path()));
1585    result
1586}
1587
1588/// Apply Git-style rename and optional copy detection for index↔worktree diffs.
1589///
1590/// When `copies` is true (Git `diff.renames` / `status.renames` set to `copy`/`copies`),
1591/// runs [`detect_copies`] after rename detection so added files can match unchanged
1592/// paths from `HEAD` (e.g. intent-to-add copies).
1593///
1594/// # Errors
1595///
1596/// Propagates errors from reading the `head_tree` object from `odb`.
1597pub fn status_apply_rename_copy_detection(
1598    odb: &Odb,
1599    unstaged_raw: Vec<DiffEntry>,
1600    threshold: u32,
1601    copies: bool,
1602    head_tree: Option<&ObjectId>,
1603) -> Result<Vec<DiffEntry>> {
1604    let after_renames = detect_renames(odb, unstaged_raw, threshold);
1605    if !copies {
1606        return Ok(after_renames);
1607    }
1608    let source_tree_entries: Vec<(String, String, ObjectId)> = match head_tree {
1609        Some(oid) => flatten_tree(odb, oid, "")?
1610            .into_iter()
1611            .map(|e| (e.path, format_mode(e.mode), e.oid))
1612            .collect(),
1613        None => Vec::new(),
1614    };
1615    Ok(detect_copies(
1616        odb,
1617        after_renames,
1618        threshold,
1619        false,
1620        &source_tree_entries,
1621    ))
1622}
1623
1624/// Format a rename pair using Git's compact path format.
1625///
1626/// Examples:
1627/// - `a/b/c` → `c/b/a` → `a/b/c => c/b/a`
1628/// - `c/b/a` → `c/d/e` → `c/{b/a => d/e}`
1629/// - `c/d/e` → `d/e` → `{c/d => d}/e`
1630/// - `d/e` → `d/f/e` → `d/{ => f}/e`
1631pub fn format_rename_path(old: &str, new: &str) -> String {
1632    let ob = old.as_bytes();
1633    let nb = new.as_bytes();
1634
1635    // Find common prefix length, snapped to '/' boundary.
1636    let pfx = {
1637        let mut last_sep = 0usize;
1638        let min_len = ob.len().min(nb.len());
1639        for i in 0..min_len {
1640            if ob[i] != nb[i] {
1641                break;
1642            }
1643            if ob[i] == b'/' {
1644                last_sep = i + 1;
1645            }
1646        }
1647        last_sep
1648    };
1649
1650    // Find common suffix length, snapped to '/' boundary.
1651    let mut sfx = {
1652        let mut last_sep = 0usize;
1653        let min_len = ob.len().min(nb.len());
1654        for i in 0..min_len {
1655            let oi = ob.len() - 1 - i;
1656            let ni = nb.len() - 1 - i;
1657            if ob[oi] != nb[ni] {
1658                break;
1659            }
1660            if ob[oi] == b'/' {
1661                last_sep = i + 1;
1662            }
1663        }
1664        last_sep
1665    };
1666
1667    // Suffix starts at this position in each string.
1668    let mut sfx_at_old = ob.len() - sfx;
1669    let mut sfx_at_new = nb.len() - sfx;
1670
1671    // If prefix and suffix overlap in both strings (both middles empty),
1672    // reduce the suffix so that at least the longer string has a non-empty middle.
1673    while pfx > sfx_at_old && pfx > sfx_at_new && sfx > 0 {
1674        // Reduce suffix by snapping to the next smaller '/' boundary.
1675        let suffix_bytes = &ob[sfx_at_old..];
1676        let mut new_sfx = 0;
1677        // Find the next '/' after sfx_at_old (i.e., reduce suffix).
1678        for i in 1..suffix_bytes.len() {
1679            if suffix_bytes[i] == b'/' {
1680                new_sfx = sfx - i;
1681                break;
1682            }
1683        }
1684        if new_sfx == 0 || new_sfx >= sfx {
1685            sfx_at_old = ob.len();
1686            sfx_at_new = nb.len();
1687            break;
1688        }
1689        sfx = new_sfx;
1690        sfx_at_old = ob.len() - sfx;
1691        sfx_at_new = nb.len() - sfx;
1692    }
1693
1694    // When prefix and suffix overlap in the shorter string, they share
1695    // the '/' boundary character. In the output format, the shared '/'
1696    // appears in both positions (e.g. "d/{ => f}/e" for d/e → d/f/e).
1697    // Compute the middle parts. When prefix and suffix overlap in a
1698    // string, the middle for that string is empty. The shared '/' shows
1699    // in both prefix (trailing) and suffix (leading) positions.
1700    let prefix = &old[..pfx];
1701    let suffix = &old[sfx_at_old..];
1702    let old_mid = if pfx <= sfx_at_old {
1703        &old[pfx..sfx_at_old]
1704    } else {
1705        ""
1706    };
1707    let new_mid = if pfx <= sfx_at_new {
1708        &new[pfx..sfx_at_new]
1709    } else {
1710        ""
1711    };
1712
1713    if prefix.is_empty() && suffix.is_empty() {
1714        return format!("{old} => {new}");
1715    }
1716
1717    format!("{prefix}{{{old_mid} => {new_mid}}}{suffix}")
1718}
1719
1720/// Check if two entries share the same filename (basename).
1721fn same_basename(del: &DiffEntry, add: &DiffEntry) -> bool {
1722    let old = del.old_path.as_deref().unwrap_or("");
1723    let new = add.new_path.as_deref().unwrap_or("");
1724    let old_base = old.rsplit('/').next().unwrap_or(old);
1725    let new_base = new.rsplit('/').next().unwrap_or(new);
1726    old_base == new_base && !old_base.is_empty()
1727}
1728
1729/// Compute a similarity percentage (0–100) between two byte slices.
1730///
1731/// Uses Git's approach: count the bytes that are "shared" (appear in
1732/// equal lines), then compute `score = shared_bytes * 2 * 100 / (src_size + dst_size)`.
1733fn compute_similarity(old: &[u8], new: &[u8]) -> u32 {
1734    // Normalize CRLF → LF before comparing so that files differing
1735    // only in line endings are detected as renames.
1736    let old_norm = crate::crlf::crlf_to_lf(old);
1737    let new_norm = crate::crlf::crlf_to_lf(new);
1738
1739    let src_size = old_norm.len();
1740    let dst_size = new_norm.len();
1741
1742    if src_size == 0 && dst_size == 0 {
1743        return 100;
1744    }
1745    let total = src_size + dst_size;
1746    if total == 0 {
1747        return 100;
1748    }
1749
1750    // Use line-level diff to find shared content, then count bytes.
1751    use similar::{ChangeTag, TextDiff};
1752    let old_str = String::from_utf8_lossy(&old_norm);
1753    let new_str = String::from_utf8_lossy(&new_norm);
1754    let diff = TextDiff::from_lines(&old_str as &str, &new_str as &str);
1755
1756    let mut shared_bytes = 0usize;
1757    for change in diff.iter_all_changes() {
1758        if change.tag() == ChangeTag::Equal {
1759            // Count bytes in the matching line (including newline).
1760            shared_bytes += change.value().len();
1761        }
1762    }
1763
1764    // Git: score = copied * MAX_SCORE / max(src_size, dst_size)
1765    // We normalize to 0-100.
1766    let max_size = src_size.max(dst_size);
1767
1768    ((shared_bytes * 100) / max_size).min(100) as u32
1769}
1770
1771/// Compute rename/copy similarity percentage (0–100) between two byte slices.
1772///
1773/// This uses the same scoring logic as internal rename detection.
1774#[must_use]
1775pub fn rename_similarity_score(old: &[u8], new: &[u8]) -> u32 {
1776    compute_similarity(old, new)
1777}
1778
1779// ── Output formatting ───────────────────────────────────────────────
1780
1781/// Format a diff entry in Git's raw diff format.
1782///
1783/// Example: `:100644 100644 abc1234... def5678... M\tfile.txt`
1784pub fn format_raw(entry: &DiffEntry) -> String {
1785    let path = match entry.status {
1786        DiffStatus::Renamed | DiffStatus::Copied => {
1787            format!(
1788                "{}\t{}",
1789                entry.old_path.as_deref().unwrap_or(""),
1790                entry.new_path.as_deref().unwrap_or("")
1791            )
1792        }
1793        _ => entry.path().to_owned(),
1794    };
1795
1796    let status_str = match (entry.status, entry.score) {
1797        (DiffStatus::Renamed, Some(s)) => format!("R{:03}", s),
1798        (DiffStatus::Copied, Some(s)) => format!("C{:03}", s),
1799        _ => entry.status.letter().to_string(),
1800    };
1801
1802    format!(
1803        ":{} {} {} {} {}\t{}",
1804        entry.old_mode, entry.new_mode, entry.old_oid, entry.new_oid, status_str, path
1805    )
1806}
1807
1808/// Format a diff entry with abbreviated OIDs.
1809pub fn format_raw_abbrev(entry: &DiffEntry, abbrev_len: usize) -> String {
1810    let ellipsis = if std::env::var("GIT_PRINT_SHA1_ELLIPSIS").ok().as_deref() == Some("yes") {
1811        "..."
1812    } else {
1813        ""
1814    };
1815    let old_hex = format!("{}", entry.old_oid);
1816    let new_hex = format!("{}", entry.new_oid);
1817    let old_abbrev = &old_hex[..abbrev_len.min(old_hex.len())];
1818    let new_abbrev = &new_hex[..abbrev_len.min(new_hex.len())];
1819
1820    let path = entry.path();
1821
1822    format!(
1823        ":{} {} {}{} {}{} {}\t{}",
1824        entry.old_mode,
1825        entry.new_mode,
1826        old_abbrev,
1827        ellipsis,
1828        new_abbrev,
1829        ellipsis,
1830        entry.status.letter(),
1831        path
1832    )
1833}
1834
1835/// Generate a unified diff patch for two blobs.
1836///
1837/// # Parameters
1838///
1839/// - `old_content` — the old file content (empty for added files).
1840/// - `new_content` — the new file content (empty for deleted files).
1841/// - `old_path` — display path for the old side.
1842/// - `new_path` — display path for the new side.
1843/// - `context_lines` — number of context lines around changes (default: 3).
1844/// - Inter-hunk context defaults to `0` (see [`unified_diff_with_prefix`]).
1845///
1846/// # Returns
1847///
1848/// The unified diff as a string.
1849pub fn unified_diff(
1850    old_content: &str,
1851    new_content: &str,
1852    old_path: &str,
1853    new_path: &str,
1854    context_lines: usize,
1855) -> String {
1856    unified_diff_with_prefix(
1857        old_content,
1858        new_content,
1859        old_path,
1860        new_path,
1861        context_lines,
1862        0,
1863        "a/",
1864        "b/",
1865    )
1866}
1867
1868/// Same as `unified_diff` but with configurable source/destination prefixes.
1869///
1870/// `inter_hunk_context` is Git's `--inter-hunk-context`: adjacent hunks merge when
1871/// the unchanged gap between them is at most `2 * context_lines + inter_hunk_context` lines.
1872pub fn unified_diff_with_prefix(
1873    old_content: &str,
1874    new_content: &str,
1875    old_path: &str,
1876    new_path: &str,
1877    context_lines: usize,
1878    inter_hunk_context: usize,
1879    src_prefix: &str,
1880    dst_prefix: &str,
1881) -> String {
1882    unified_diff_with_prefix_and_funcname(
1883        old_content,
1884        new_content,
1885        old_path,
1886        new_path,
1887        context_lines,
1888        inter_hunk_context,
1889        src_prefix,
1890        dst_prefix,
1891        None,
1892    )
1893}
1894
1895/// Same as [`unified_diff_with_prefix`] with optional custom hunk-header
1896/// function-name matching.
1897pub fn unified_diff_with_prefix_and_funcname(
1898    old_content: &str,
1899    new_content: &str,
1900    old_path: &str,
1901    new_path: &str,
1902    context_lines: usize,
1903    inter_hunk_context: usize,
1904    src_prefix: &str,
1905    dst_prefix: &str,
1906    funcname_matcher: Option<&FuncnameMatcher>,
1907) -> String {
1908    unified_diff_with_prefix_and_funcname_and_algorithm(
1909        old_content,
1910        new_content,
1911        old_path,
1912        new_path,
1913        context_lines,
1914        inter_hunk_context,
1915        src_prefix,
1916        dst_prefix,
1917        funcname_matcher,
1918        similar::Algorithm::Myers,
1919    )
1920}
1921
1922/// Same as [`unified_diff_with_prefix_and_funcname`] but allows callers to
1923/// choose the line diff algorithm used for hunk generation.
1924pub fn unified_diff_with_prefix_and_funcname_and_algorithm(
1925    old_content: &str,
1926    new_content: &str,
1927    old_path: &str,
1928    new_path: &str,
1929    context_lines: usize,
1930    inter_hunk_context: usize,
1931    src_prefix: &str,
1932    dst_prefix: &str,
1933    funcname_matcher: Option<&FuncnameMatcher>,
1934    algorithm: similar::Algorithm,
1935) -> String {
1936    use similar::{group_diff_ops, udiff::UnifiedDiffHunk, TextDiff};
1937
1938    let diff = TextDiff::configure()
1939        .algorithm(algorithm)
1940        .diff_lines(old_content, new_content);
1941
1942    let mut output = String::new();
1943    if old_path == "/dev/null" {
1944        output.push_str("--- /dev/null\n");
1945    } else {
1946        output.push_str(&format!("--- {src_prefix}{old_path}\n"));
1947    }
1948    if new_path == "/dev/null" {
1949        output.push_str("+++ /dev/null\n");
1950    } else {
1951        output.push_str(&format!("+++ {dst_prefix}{new_path}\n"));
1952    }
1953
1954    let old_lines: Vec<&str> = old_content.lines().collect();
1955
1956    let group_radius = context_lines
1957        .saturating_mul(2)
1958        .saturating_add(inter_hunk_context);
1959    let op_groups = group_diff_ops(diff.ops().to_vec(), group_radius);
1960
1961    for ops in op_groups {
1962        if ops.is_empty() {
1963            continue;
1964        }
1965        let hunk = UnifiedDiffHunk::new(ops, &diff, true);
1966        let hunk_str = format!("{hunk}");
1967        // The similar crate outputs @@ -a,b +c,d @@\n but Git adds
1968        // function context after the closing @@. Extract the hunk header
1969        // and add function context.
1970        if let Some(first_newline) = hunk_str.find('\n') {
1971            let header_line = &hunk_str[..first_newline];
1972            let rest = &hunk_str[first_newline..];
1973
1974            // Parse the old start line from the @@ header
1975            if let Some(func_ctx) =
1976                extract_function_context(header_line, &old_lines, funcname_matcher)
1977            {
1978                output.push_str(header_line);
1979                output.push(' ');
1980                output.push_str(&func_ctx);
1981                output.push_str(rest);
1982            } else {
1983                output.push_str(&hunk_str);
1984            }
1985        } else {
1986            output.push_str(&hunk_str);
1987        }
1988    }
1989
1990    output
1991}
1992
1993/// Compute a unified diff with anchored lines.
1994///
1995/// Anchored lines that appear exactly once in both old and new content are
1996/// forced to match, splitting the diff into segments around those anchor points.
1997/// This produces diffs where the anchored text stays as context and surrounding
1998/// lines are shown as additions/removals.
1999///
2000/// Segment diffs use `algorithm` (Git maps `--histogram` to patience-style line diff in our stack).
2001pub fn anchored_unified_diff(
2002    old_content: &str,
2003    new_content: &str,
2004    old_path: &str,
2005    new_path: &str,
2006    context_lines: usize,
2007    anchors: &[String],
2008    algorithm: similar::Algorithm,
2009) -> String {
2010    use similar::TextDiff;
2011
2012    let old_lines: Vec<&str> = old_content.lines().collect();
2013    let new_lines: Vec<&str> = new_content.lines().collect();
2014
2015    // Find anchored lines that appear exactly once in both old and new
2016    let mut anchor_pairs: Vec<(usize, usize)> = Vec::new(); // (old_idx, new_idx)
2017
2018    for anchor in anchors {
2019        let anchor_str = anchor.as_str();
2020
2021        // Count occurrences in old
2022        let old_positions: Vec<usize> = old_lines
2023            .iter()
2024            .enumerate()
2025            .filter(|(_, l)| l.trim_end() == anchor_str)
2026            .map(|(i, _)| i)
2027            .collect();
2028
2029        // Count occurrences in new
2030        let new_positions: Vec<usize> = new_lines
2031            .iter()
2032            .enumerate()
2033            .filter(|(_, l)| l.trim_end() == anchor_str)
2034            .map(|(i, _)| i)
2035            .collect();
2036
2037        // Only anchor if unique in both
2038        if old_positions.len() == 1 && new_positions.len() == 1 {
2039            anchor_pairs.push((old_positions[0], new_positions[0]));
2040        }
2041    }
2042
2043    // If no valid anchors, fall back to normal diff
2044    if anchor_pairs.is_empty() {
2045        return unified_diff_with_prefix_and_funcname_and_algorithm(
2046            old_content,
2047            new_content,
2048            old_path,
2049            new_path,
2050            context_lines,
2051            0,
2052            "a/",
2053            "b/",
2054            None,
2055            algorithm,
2056        );
2057    }
2058
2059    // Sort anchor pairs by their position in the old file
2060    anchor_pairs.sort_by_key(|&(old_idx, _)| old_idx);
2061
2062    // Filter to only keep pairs where new positions are also increasing
2063    // (longest increasing subsequence of new positions)
2064    let mut filtered: Vec<(usize, usize)> = Vec::new();
2065    for &pair in &anchor_pairs {
2066        if filtered.is_empty() || pair.1 > filtered.last().unwrap().1 {
2067            filtered.push(pair);
2068        }
2069    }
2070    let anchor_pairs = filtered;
2071
2072    // Build a modified version of old/new where we diff segments between anchors.
2073    // We'll construct the diff by processing segments:
2074    // - Before first anchor
2075    // - Between consecutive anchors
2076    // - After last anchor
2077    // Each anchor line itself is a fixed context match.
2078
2079    // Collect all diff operations
2080    struct DiffOp {
2081        tag: char, // ' ', '+', '-'
2082        line: String,
2083    }
2084
2085    let mut ops: Vec<DiffOp> = Vec::new();
2086    let mut old_pos = 0usize;
2087    let mut new_pos = 0usize;
2088
2089    for &(old_anchor, new_anchor) in &anchor_pairs {
2090        // Diff the segment before this anchor
2091        let old_segment: Vec<&str> = old_lines[old_pos..old_anchor].to_vec();
2092        let new_segment: Vec<&str> = new_lines[new_pos..new_anchor].to_vec();
2093
2094        let old_seg_text = old_segment.join("\n");
2095        let new_seg_text = new_segment.join("\n");
2096
2097        if !old_seg_text.is_empty() || !new_seg_text.is_empty() {
2098            let old_seg_input = if old_seg_text.is_empty() {
2099                String::new()
2100            } else {
2101                format!("{}\n", old_seg_text)
2102            };
2103            let new_seg_input = if new_seg_text.is_empty() {
2104                String::new()
2105            } else {
2106                format!("{}\n", new_seg_text)
2107            };
2108            let seg_diff = TextDiff::configure()
2109                .algorithm(algorithm)
2110                .diff_lines(&old_seg_input, &new_seg_input);
2111            for change in seg_diff.iter_all_changes() {
2112                let tag = match change.tag() {
2113                    similar::ChangeTag::Equal => ' ',
2114                    similar::ChangeTag::Delete => '-',
2115                    similar::ChangeTag::Insert => '+',
2116                };
2117                ops.push(DiffOp {
2118                    tag,
2119                    line: change.value().trim_end_matches('\n').to_string(),
2120                });
2121            }
2122        }
2123
2124        // The anchor line itself is always context
2125        ops.push(DiffOp {
2126            tag: ' ',
2127            line: old_lines[old_anchor].to_string(),
2128        });
2129
2130        old_pos = old_anchor + 1;
2131        new_pos = new_anchor + 1;
2132    }
2133
2134    // Diff the remaining segment after the last anchor
2135    let old_segment: Vec<&str> = old_lines[old_pos..].to_vec();
2136    let new_segment: Vec<&str> = new_lines[new_pos..].to_vec();
2137    let old_seg_text = old_segment.join("\n");
2138    let new_seg_text = new_segment.join("\n");
2139
2140    if !old_seg_text.is_empty() || !new_seg_text.is_empty() {
2141        let old_seg_input = if old_seg_text.is_empty() {
2142            String::new()
2143        } else {
2144            format!("{}\n", old_seg_text)
2145        };
2146        let new_seg_input = if new_seg_text.is_empty() {
2147            String::new()
2148        } else {
2149            format!("{}\n", new_seg_text)
2150        };
2151        let seg_diff = TextDiff::configure()
2152            .algorithm(algorithm)
2153            .diff_lines(&old_seg_input, &new_seg_input);
2154        for change in seg_diff.iter_all_changes() {
2155            let tag = match change.tag() {
2156                similar::ChangeTag::Equal => ' ',
2157                similar::ChangeTag::Delete => '-',
2158                similar::ChangeTag::Insert => '+',
2159            };
2160            ops.push(DiffOp {
2161                tag,
2162                line: change.value().trim_end_matches('\n').to_string(),
2163            });
2164        }
2165    }
2166
2167    // Now format as unified diff with hunks
2168    let mut output = String::new();
2169    if old_path == "/dev/null" {
2170        output.push_str("--- /dev/null\n");
2171    } else {
2172        output.push_str(&format!("--- a/{old_path}\n"));
2173    }
2174    if new_path == "/dev/null" {
2175        output.push_str("+++ /dev/null\n");
2176    } else {
2177        output.push_str(&format!("+++ b/{new_path}\n"));
2178    }
2179
2180    // Group ops into hunks with context
2181    let total_ops = ops.len();
2182    if total_ops == 0 {
2183        return output;
2184    }
2185
2186    // Find ranges of changes
2187    let mut hunks: Vec<(usize, usize)> = Vec::new(); // (start, end) indices into ops
2188    let mut i = 0;
2189    while i < total_ops {
2190        if ops[i].tag != ' ' {
2191            let start = i.saturating_sub(context_lines);
2192            let mut end = i;
2193            // Extend to include consecutive changes and their context
2194            while end < total_ops {
2195                if ops[end].tag != ' ' {
2196                    end += 1;
2197                    continue;
2198                }
2199                // Check if there's another change within context_lines
2200                let mut next_change = end;
2201                while next_change < total_ops && ops[next_change].tag == ' ' {
2202                    next_change += 1;
2203                }
2204                if next_change < total_ops && next_change - end <= context_lines * 2 {
2205                    end = next_change + 1;
2206                } else {
2207                    end = (end + context_lines).min(total_ops);
2208                    break;
2209                }
2210            }
2211            // Merge with previous hunk if overlapping
2212            if let Some(last) = hunks.last_mut() {
2213                if start <= last.1 {
2214                    last.1 = end;
2215                } else {
2216                    hunks.push((start, end));
2217                }
2218            } else {
2219                hunks.push((start, end));
2220            }
2221            i = end;
2222        } else {
2223            i += 1;
2224        }
2225    }
2226
2227    // Output each hunk
2228    for (start, end) in hunks {
2229        // Count old/new lines in this hunk
2230        let mut old_start = 1usize;
2231        let mut new_start = 1usize;
2232        // Calculate line numbers by counting ops before this hunk
2233        for op in &ops[..start] {
2234            match op.tag {
2235                ' ' => {
2236                    old_start += 1;
2237                    new_start += 1;
2238                }
2239                '-' => {
2240                    old_start += 1;
2241                }
2242                '+' => {
2243                    new_start += 1;
2244                }
2245                _ => {}
2246            }
2247        }
2248        let mut old_count = 0usize;
2249        let mut new_count = 0usize;
2250        for op in &ops[start..end] {
2251            match op.tag {
2252                ' ' => {
2253                    old_count += 1;
2254                    new_count += 1;
2255                }
2256                '-' => {
2257                    old_count += 1;
2258                }
2259                '+' => {
2260                    new_count += 1;
2261                }
2262                _ => {}
2263            }
2264        }
2265
2266        output.push_str(&format!(
2267            "@@ -{},{} +{},{} @@\n",
2268            old_start, old_count, new_start, new_count
2269        ));
2270        for op in &ops[start..end] {
2271            output.push(op.tag);
2272            output.push_str(&op.line);
2273            output.push('\n');
2274        }
2275    }
2276
2277    output
2278}
2279
2280/// Extract function context for a hunk header.
2281///
2282/// Given a hunk header like `@@ -8,7 +8,7 @@`, find the last line
2283/// before line 8 in the old content that looks like a function header
2284/// (starts with a non-whitespace character, like Git's default).
2285fn extract_function_context(
2286    header: &str,
2287    old_lines: &[&str],
2288    funcname_matcher: Option<&FuncnameMatcher>,
2289) -> Option<String> {
2290    // Parse the old start line number from "@@ -<start>,<count> ..."
2291    let at_pos = header.find("-")?;
2292    let rest = &header[at_pos + 1..];
2293    let comma_or_space = rest.find([',', ' '])?;
2294    let start_str = &rest[..comma_or_space];
2295    let start_line: usize = start_str.parse().ok()?;
2296
2297    if start_line <= 1 {
2298        return None;
2299    }
2300
2301    // Look backwards from the line before the hunk start for a line that
2302    // starts with a non-whitespace character (Git's default funcname pattern).
2303    // start_line is 1-indexed, so the hunk starts at old_lines[start_line-1].
2304    // We want to look at lines before that: old_lines[0..start_line-1].
2305    let search_end = (start_line - 1).min(old_lines.len());
2306    let truncate = |text: &str| {
2307        if text.len() > 80 {
2308            let mut end = 80;
2309            while end > 0 && !text.is_char_boundary(end) {
2310                end -= 1;
2311            }
2312            text[..end].to_owned()
2313        } else {
2314            text.to_owned()
2315        }
2316    };
2317
2318    for i in (0..search_end).rev() {
2319        let line = old_lines[i];
2320        if line.is_empty() {
2321            continue;
2322        }
2323        if let Some(matcher) = funcname_matcher {
2324            if let Some(matched) = matcher.match_line(line) {
2325                return Some(truncate(&matched));
2326            }
2327            continue;
2328        }
2329
2330        let first = line.as_bytes()[0];
2331        if first.is_ascii_alphabetic() || first == b'_' || first == b'$' {
2332            return Some(truncate(line.trim_end_matches(char::is_whitespace)));
2333        }
2334    }
2335    None
2336}
2337
2338/// Generate diff stat output (file name + insertions/deletions).
2339///
2340/// Returns a single line like: ` file.txt | 5 ++---`
2341pub fn format_stat_line(
2342    path: &str,
2343    insertions: usize,
2344    deletions: usize,
2345    max_path_len: usize,
2346) -> String {
2347    format_stat_line_width(path, insertions, deletions, max_path_len, 0)
2348}
2349
2350pub fn format_stat_line_width(
2351    path: &str,
2352    insertions: usize,
2353    deletions: usize,
2354    max_path_len: usize,
2355    count_width: usize,
2356) -> String {
2357    let total = insertions + deletions;
2358    let plus = "+".repeat(insertions.min(50));
2359    let minus = "-".repeat(deletions.min(50));
2360    let cw = if count_width > 0 {
2361        count_width
2362    } else {
2363        format!("{}", total).len()
2364    };
2365    let bar = format!("{}{}", plus, minus);
2366    if bar.is_empty() {
2367        format!(
2368            " {:<width$} | {:>cw$}",
2369            path,
2370            total,
2371            width = max_path_len,
2372            cw = cw
2373        )
2374    } else {
2375        format!(
2376            " {:<width$} | {:>cw$} {}",
2377            path,
2378            total,
2379            bar,
2380            width = max_path_len,
2381            cw = cw
2382        )
2383    }
2384}
2385
2386/// Count insertions and deletions between two strings.
2387///
2388/// Returns `(insertions, deletions)`.
2389pub fn count_changes(old_content: &str, new_content: &str) -> (usize, usize) {
2390    count_changes_with_algorithm(old_content, new_content, similar::Algorithm::Myers)
2391}
2392
2393/// Count insertions and deletions using the given line-diff algorithm.
2394///
2395/// Git's `--stat` / `--numstat` follow the configured diff algorithm; this mirrors that by
2396/// running [`similar::TextDiff`] with an explicit [`similar::Algorithm`].
2397#[must_use]
2398pub fn count_changes_with_algorithm(
2399    old_content: &str,
2400    new_content: &str,
2401    algorithm: similar::Algorithm,
2402) -> (usize, usize) {
2403    use similar::{ChangeTag, TextDiff};
2404
2405    let diff = TextDiff::configure()
2406        .algorithm(algorithm)
2407        .diff_lines(old_content, new_content);
2408    let mut ins = 0;
2409    let mut del = 0;
2410
2411    for change in diff.iter_all_changes() {
2412        match change.tag() {
2413            ChangeTag::Insert => ins += 1,
2414            ChangeTag::Delete => del += 1,
2415            ChangeTag::Equal => {}
2416        }
2417    }
2418
2419    (ins, del)
2420}
2421
2422/// Line count for diffstat/`--numstat`, matching Git's `count_lines()` in `diff.c`.
2423///
2424/// Counts newline-terminated lines; a final line without trailing newline still counts as one line.
2425/// An empty buffer yields `0`.
2426#[must_use]
2427pub fn count_git_lines(data: &[u8]) -> usize {
2428    if data.is_empty() {
2429        return 0;
2430    }
2431    let mut count = 0usize;
2432    let mut nl_just_seen = false;
2433    for &ch in data {
2434        if ch == b'\n' {
2435            count += 1;
2436            nl_just_seen = true;
2437        } else {
2438            nl_just_seen = false;
2439        }
2440    }
2441    if !nl_just_seen {
2442        count += 1;
2443    }
2444    count
2445}
2446
2447const DIFF_MAX_SCORE: u64 = 60_000;
2448const DIFF_MINIMUM_BREAK_SIZE: usize = 400;
2449const DIFF_DEFAULT_BREAK_SCORE: u64 = 30_000;
2450const DIFF_HASHBASE: u32 = 107_927;
2451
2452#[derive(Clone, Copy, Default)]
2453struct SpanSlot {
2454    hashval: u32,
2455    cnt: u32,
2456}
2457
2458struct SpanHashTop {
2459    alloc_log2: u8,
2460    free_slots: i32,
2461    data: Vec<SpanSlot>,
2462}
2463
2464impl SpanHashTop {
2465    fn new(initial_log2: u8) -> Self {
2466        let cap = 1usize << initial_log2;
2467        Self {
2468            alloc_log2: initial_log2,
2469            free_slots: initial_free(initial_log2),
2470            data: vec![SpanSlot::default(); cap],
2471        }
2472    }
2473
2474    fn len(&self) -> usize {
2475        1usize << self.alloc_log2
2476    }
2477
2478    fn add_span(&mut self, hashval: u32, cnt: u32) {
2479        loop {
2480            let lim = self.len();
2481            let mut bucket = (hashval as usize) & (lim - 1);
2482            loop {
2483                let h = &mut self.data[bucket];
2484                if h.cnt == 0 {
2485                    h.hashval = hashval;
2486                    h.cnt = cnt;
2487                    self.free_slots -= 1;
2488                    if self.free_slots < 0 {
2489                        self.rehash();
2490                        break;
2491                    }
2492                    return;
2493                }
2494                if h.hashval == hashval {
2495                    h.cnt = h.cnt.saturating_add(cnt);
2496                    return;
2497                }
2498                bucket += 1;
2499                if bucket >= lim {
2500                    bucket = 0;
2501                }
2502            }
2503        }
2504    }
2505
2506    fn rehash(&mut self) {
2507        let old = std::mem::take(&mut self.data);
2508        let old_log = self.alloc_log2;
2509        self.alloc_log2 = old_log.saturating_add(1);
2510        let new_len = 1usize << self.alloc_log2;
2511        self.free_slots = initial_free(self.alloc_log2);
2512        self.data = vec![SpanSlot::default(); new_len];
2513        let old_sz = 1usize << old_log;
2514        for o in old.iter().take(old_sz) {
2515            let o = *o;
2516            if o.cnt == 0 {
2517                continue;
2518            }
2519            self.add_span_after_rehash(o.hashval, o.cnt);
2520        }
2521    }
2522
2523    fn add_span_after_rehash(&mut self, hashval: u32, cnt: u32) {
2524        loop {
2525            let lim = self.len();
2526            let mut bucket = (hashval as usize) & (lim - 1);
2527            loop {
2528                let h = &mut self.data[bucket];
2529                if h.cnt == 0 {
2530                    h.hashval = hashval;
2531                    h.cnt = cnt;
2532                    self.free_slots -= 1;
2533                    if self.free_slots < 0 {
2534                        self.rehash();
2535                        break;
2536                    }
2537                    return;
2538                }
2539                if h.hashval == hashval {
2540                    h.cnt = h.cnt.saturating_add(cnt);
2541                    return;
2542                }
2543                bucket += 1;
2544                if bucket >= lim {
2545                    bucket = 0;
2546                }
2547            }
2548        }
2549    }
2550
2551    fn sort_by_hashval(&mut self) {
2552        let sz = self.len();
2553        self.data[..sz].sort_by(|a, b| {
2554            if a.cnt == 0 {
2555                return std::cmp::Ordering::Greater;
2556            }
2557            if b.cnt == 0 {
2558                return std::cmp::Ordering::Less;
2559            }
2560            a.hashval.cmp(&b.hashval)
2561        });
2562    }
2563}
2564
2565fn initial_free(sz_log2: u8) -> i32 {
2566    let sz = sz_log2 as i32;
2567    ((1i32 << sz_log2) * (sz - 3) / sz).max(0)
2568}
2569
2570fn hash_blob_spans(buf: &[u8], is_text: bool) -> SpanHashTop {
2571    let mut hash = SpanHashTop::new(9);
2572    let mut n = 0u32;
2573    let mut accum1: u32 = 0;
2574    let mut accum2: u32 = 0;
2575    let mut i = 0usize;
2576    while i < buf.len() {
2577        let c = buf[i] as u32;
2578        let old_1 = accum1;
2579        i += 1;
2580
2581        if is_text && c == b'\r' as u32 && i < buf.len() && buf[i] == b'\n' {
2582            continue;
2583        }
2584
2585        accum1 = accum1.wrapping_shl(7) ^ accum2.wrapping_shr(25);
2586        accum2 = accum2.wrapping_shl(7) ^ old_1.wrapping_shr(25);
2587        accum1 = accum1.wrapping_add(c);
2588        n += 1;
2589        if n < 64 && c != b'\n' as u32 {
2590            continue;
2591        }
2592        let hashval = (accum1.wrapping_add(accum2.wrapping_mul(0x61))) % DIFF_HASHBASE;
2593        hash.add_span(hashval, n);
2594        n = 0;
2595        accum1 = 0;
2596        accum2 = 0;
2597    }
2598    if n > 0 {
2599        let hashval = (accum1.wrapping_add(accum2.wrapping_mul(0x61))) % DIFF_HASHBASE;
2600        hash.add_span(hashval, n);
2601    }
2602    hash.sort_by_hashval();
2603    hash
2604}
2605
2606/// Approximate copied vs added material between two blobs (Git `diffcore_count_changes`).
2607///
2608/// Returns `(copied_bytes_from_src, literal_added_bytes_in_dst)` matching Git's
2609/// `diffcore_count_changes` semantics (used for `--dirstat=changes` damage).
2610#[must_use]
2611pub fn diffcore_count_changes(old: &[u8], new: &[u8]) -> (u64, u64) {
2612    let src_is_text = !crate::merge_file::is_binary(old);
2613    let dst_is_text = !crate::merge_file::is_binary(new);
2614    let src_count = hash_blob_spans(old, src_is_text);
2615    let dst_count = hash_blob_spans(new, dst_is_text);
2616    let mut sc: u64 = 0;
2617    let mut la: u64 = 0;
2618    let mut si = 0usize;
2619    let mut di = 0usize;
2620    let src_len = src_count.len();
2621    let dst_len = dst_count.len();
2622    loop {
2623        if si >= src_len || src_count.data[si].cnt == 0 {
2624            break;
2625        }
2626        let s_hash = src_count.data[si].hashval;
2627        let s_cnt = u64::from(src_count.data[si].cnt);
2628        while di < dst_len && dst_count.data[di].cnt != 0 && dst_count.data[di].hashval < s_hash {
2629            la += u64::from(dst_count.data[di].cnt);
2630            di += 1;
2631        }
2632        let mut dst_cnt = 0u64;
2633        if di < dst_len && dst_count.data[di].cnt != 0 && dst_count.data[di].hashval == s_hash {
2634            dst_cnt = u64::from(dst_count.data[di].cnt);
2635            di += 1;
2636        }
2637        if s_cnt < dst_cnt {
2638            la += dst_cnt - s_cnt;
2639            sc += s_cnt;
2640        } else {
2641            sc += dst_cnt;
2642        }
2643        si += 1;
2644    }
2645    while di < dst_len && dst_count.data[di].cnt != 0 {
2646        la += u64::from(dst_count.data[di].cnt);
2647        di += 1;
2648    }
2649    (sc, la)
2650}
2651
2652/// Whether this modified blob pair should use Git's "complete rewrite" diffstat path when
2653/// `--break-rewrites` is in effect (`should_break` in `diffcore-break.c`).
2654#[must_use]
2655pub fn should_break_rewrite_for_stat(old: &[u8], new: &[u8]) -> bool {
2656    should_break_rewrite_inner(old, new, DIFF_DEFAULT_BREAK_SCORE)
2657}
2658
2659/// Git `merge_score` from `diffcore-break.c` when a pair is considered broken: how much of the
2660/// source blob was removed (0–[`DIFF_MAX_SCORE`] scale). Used for `dissimilarity index` metadata.
2661#[must_use]
2662pub fn rewrite_merge_score(old: &[u8], new: &[u8]) -> Option<u64> {
2663    if old.is_empty() {
2664        return None;
2665    }
2666    let max_size = old.len().max(new.len());
2667    if max_size < DIFF_MINIMUM_BREAK_SIZE {
2668        return None;
2669    }
2670    let (src_copied, _) = diffcore_count_changes(old, new);
2671    let src_copied = src_copied.min(old.len() as u64);
2672    let src_removed = (old.len() as u64).saturating_sub(src_copied);
2673    Some(src_removed * DIFF_MAX_SCORE / old.len() as u64)
2674}
2675
2676/// Percentage shown in `dissimilarity index N%` for a rewrite (`similarity_index` in Git's diff.c).
2677#[must_use]
2678pub fn rewrite_dissimilarity_index_percent(old: &[u8], new: &[u8]) -> Option<u32> {
2679    let score = rewrite_merge_score(old, new)?;
2680    Some((score * 100 / DIFF_MAX_SCORE).min(100) as u32)
2681}
2682
2683fn should_break_rewrite_inner(src: &[u8], dst: &[u8], break_score: u64) -> bool {
2684    if src.is_empty() {
2685        return false;
2686    }
2687    let max_size = src.len().max(dst.len());
2688    if max_size < DIFF_MINIMUM_BREAK_SIZE {
2689        return false;
2690    }
2691    let (src_copied, literal_added) = diffcore_count_changes(src, dst);
2692    let src_copied = src_copied.min(src.len() as u64);
2693    let mut literal_added = literal_added;
2694    let dst_len = dst.len() as u64;
2695    if src_copied < dst_len && literal_added + src_copied > dst_len {
2696        literal_added = dst_len.saturating_sub(src_copied);
2697    }
2698    let src_removed = (src.len() as u64).saturating_sub(src_copied);
2699    let merge_score = src_removed * DIFF_MAX_SCORE / src.len() as u64;
2700    if merge_score > break_score {
2701        return true;
2702    }
2703    let delta_size = src_removed.saturating_add(literal_added);
2704    if delta_size * DIFF_MAX_SCORE / (max_size as u64) < break_score {
2705        return false;
2706    }
2707    let s = src.len() as u64;
2708    if (s * break_score < src_removed * DIFF_MAX_SCORE)
2709        && (literal_added * 20 < src_removed)
2710        && (literal_added * 20 < src_copied)
2711    {
2712        return false;
2713    }
2714    true
2715}
2716
2717// ── Helpers ─────────────────────────────────────────────────────────
2718
2719/// Flatten a tree object recursively into a sorted list of (path, mode, oid).
2720struct FlatEntry {
2721    path: String,
2722    mode: u32,
2723    oid: ObjectId,
2724}
2725
2726fn flatten_tree(odb: &Odb, tree_oid: &ObjectId, prefix: &str) -> Result<Vec<FlatEntry>> {
2727    let entries = read_tree(odb, tree_oid)?;
2728    let mut result = Vec::new();
2729
2730    for entry in entries {
2731        let name_str = String::from_utf8_lossy(&entry.name);
2732        let path = format_path(prefix, &name_str);
2733        if is_tree_mode(entry.mode) {
2734            let nested = flatten_tree(odb, &entry.oid, &path)?;
2735            result.extend(nested);
2736        } else {
2737            result.push(FlatEntry {
2738                path,
2739                mode: entry.mode,
2740                oid: entry.oid,
2741            });
2742        }
2743    }
2744
2745    Ok(result)
2746}
2747
2748/// Paths present in `HEAD`'s tree with mode and blob/commit OID (for status porcelain v2).
2749pub fn head_path_states(
2750    odb: &Odb,
2751    head_tree: Option<&ObjectId>,
2752) -> Result<std::collections::BTreeMap<String, (u32, ObjectId)>> {
2753    let mut m = std::collections::BTreeMap::new();
2754    let Some(t) = head_tree else {
2755        return Ok(m);
2756    };
2757    for fe in flatten_tree(odb, t, "")? {
2758        m.insert(fe.path, (fe.mode, fe.oid));
2759    }
2760    Ok(m)
2761}
2762
2763/// Whether a mode represents a tree (directory).
2764fn is_tree_mode(mode: u32) -> bool {
2765    mode == 0o040000
2766}
2767
2768/// Build a path with an optional prefix.
2769fn format_path(prefix: &str, name: &str) -> String {
2770    if prefix.is_empty() {
2771        name.to_owned()
2772    } else {
2773        format!("{prefix}/{name}")
2774    }
2775}
2776
2777/// Format a numeric mode as a zero-padded octal string.
2778pub fn format_mode(mode: u32) -> String {
2779    format!("{mode:06o}")
2780}
2781
2782/// Read the HEAD commit OID from a submodule checkout directory.
2783///
2784/// Returns `None` if the path is missing, not a submodule checkout, or has no resolvable HEAD.
2785#[must_use]
2786pub fn read_submodule_head_for_checkout(sub_dir: &Path) -> Option<ObjectId> {
2787    read_submodule_head(sub_dir)
2788}
2789
2790/// True when `sub_dir` is an empty directory (or missing), i.e. the placeholder left by
2791/// `git apply --index` before `git submodule update`.
2792fn submodule_worktree_is_unpopulated_placeholder(sub_dir: &Path) -> bool {
2793    match fs::read_dir(sub_dir) {
2794        Ok(mut it) => it.next().is_none(),
2795        Err(e) if e.kind() == std::io::ErrorKind::NotFound => true,
2796        Err(_) => false,
2797    }
2798}
2799
2800fn read_submodule_head(sub_dir: &Path) -> Option<ObjectId> {
2801    read_submodule_head_oid(sub_dir)
2802}
2803
2804/// Resolve the embedded git directory for a submodule work tree (`sub_dir/.git`).
2805#[must_use]
2806pub fn submodule_embedded_git_dir(sub_dir: &Path) -> Option<PathBuf> {
2807    let gitfile = sub_dir.join(".git");
2808    if gitfile.is_file() {
2809        let content = fs::read_to_string(&gitfile).ok()?;
2810        let gitdir = content
2811            .lines()
2812            .find_map(|l| l.strip_prefix("gitdir: "))?
2813            .trim();
2814        Some(if Path::new(gitdir).is_absolute() {
2815            PathBuf::from(gitdir)
2816        } else {
2817            sub_dir.join(gitdir)
2818        })
2819    } else if gitfile.is_dir() {
2820        Some(gitfile)
2821    } else {
2822        None
2823    }
2824}
2825
2826/// Walk upward from `sub_dir` to find the nearest containing Git work tree.
2827fn find_superproject_git(sub_dir: &Path) -> Option<(PathBuf, PathBuf)> {
2828    let mut cur = sub_dir.parent()?;
2829    loop {
2830        let git_path = cur.join(".git");
2831        if git_path.exists() {
2832            let gd = if git_path.is_file() {
2833                let content = fs::read_to_string(&git_path).ok()?;
2834                let line = content
2835                    .lines()
2836                    .find_map(|l| l.strip_prefix("gitdir: "))?
2837                    .trim();
2838                if Path::new(line).is_absolute() {
2839                    PathBuf::from(line)
2840                } else {
2841                    cur.join(line)
2842                }
2843            } else {
2844                git_path
2845            };
2846            return Some((cur.to_path_buf(), gd));
2847        }
2848        cur = cur.parent()?;
2849    }
2850}
2851
2852/// Read the HEAD commit OID from a submodule working tree directory.
2853///
2854/// Handles both embedded `.git` directories and `gitdir:` gitfiles pointing at
2855/// `.git/modules/...` (or other locations). Returns `None` if the path is not
2856/// a checkout or has no resolvable HEAD.
2857pub fn read_submodule_head_oid(sub_dir: &Path) -> Option<ObjectId> {
2858    // Submodule `.git` may be a gitfile pointing at `.git/modules/<name>` in another superproject
2859    // after `cp -R`. Prefer the current superproject's module dir when present.
2860    let mut git_dir = submodule_embedded_git_dir(sub_dir)?;
2861    if let Some((super_wt, super_git_dir)) = find_superproject_git(sub_dir) {
2862        let rel = sub_dir.strip_prefix(&super_wt).ok()?;
2863        let rel_str = rel.to_string_lossy().replace('\\', "/");
2864        let local_mod = super_git_dir
2865            .join("modules")
2866            .join(rel_str.trim_start_matches('/'));
2867        if local_mod.join("HEAD").exists() {
2868            let sg = super_git_dir.canonicalize().unwrap_or(super_git_dir);
2869            let cur = git_dir.canonicalize().unwrap_or_else(|_| git_dir.clone());
2870            if !cur.starts_with(&sg) {
2871                git_dir = local_mod;
2872            }
2873        }
2874    }
2875    let head_content = fs::read_to_string(git_dir.join("HEAD")).ok()?;
2876    let head_content = head_content.trim();
2877    if let Some(refname) = head_content.strip_prefix("ref: ") {
2878        // Symbolic ref — resolve it
2879        let ref_path = git_dir.join(refname);
2880        let oid_hex = fs::read_to_string(&ref_path).ok()?;
2881        ObjectId::from_hex(oid_hex.trim()).ok()
2882    } else {
2883        // Detached HEAD — direct OID
2884        ObjectId::from_hex(head_content).ok()
2885    }
2886}
2887
2888/// Submodule dirty bits aligned with Git's `DIRTY_SUBMODULE_*` / porcelain v2 `S???` token.
2889#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
2890pub struct SubmodulePorcelainFlags {
2891    /// Submodule checkout HEAD differs from the gitlink OID recorded in the parent index.
2892    pub new_commits: bool,
2893    /// The submodule has its own staged or unstaged changes (`DIRTY_SUBMODULE_MODIFIED`).
2894    pub modified: bool,
2895    /// The submodule work tree contains paths not in its index (`DIRTY_SUBMODULE_UNTRACKED`).
2896    pub untracked: bool,
2897}
2898
2899/// Inspect a checked-out submodule at `rel_path` (relative to `super_worktree`) and return
2900/// flags used for `git status --porcelain=v2` submodule tokens.
2901///
2902/// `recorded_oid` is the gitlink OID stored in the **parent** index (stage 0). When the
2903/// submodule is not checked out or cannot be opened, returns [`Default::default()`].
2904pub fn submodule_porcelain_flags(
2905    super_worktree: &Path,
2906    rel_path: &str,
2907    recorded_oid: ObjectId,
2908) -> SubmodulePorcelainFlags {
2909    let sub_dir = super_worktree.join(rel_path);
2910    let Some(sub_git_dir) = submodule_embedded_git_dir(&sub_dir) else {
2911        return SubmodulePorcelainFlags::default();
2912    };
2913    let Some(sub_head) = read_submodule_head_oid(&sub_dir) else {
2914        return SubmodulePorcelainFlags::default();
2915    };
2916
2917    let new_commits = sub_head != recorded_oid;
2918
2919    let index_path = sub_git_dir.join("index");
2920    let sub_index = match crate::index::Index::load(&index_path) {
2921        Ok(ix) => ix,
2922        Err(_) => {
2923            return SubmodulePorcelainFlags {
2924                new_commits,
2925                ..Default::default()
2926            }
2927        }
2928    };
2929
2930    let tracked: std::collections::BTreeSet<String> = sub_index
2931        .entries
2932        .iter()
2933        .filter(|e| e.stage() == 0)
2934        .map(|e| String::from_utf8_lossy(&e.path).into_owned())
2935        .collect();
2936    let untracked = submodule_dir_has_untracked_inner(&sub_dir, &sub_dir, &tracked);
2937
2938    let objects_dir = sub_git_dir.join("objects");
2939    let odb = Odb::new(&objects_dir);
2940
2941    let sub_head_tree = (|| -> Option<ObjectId> {
2942        let h = fs::read_to_string(sub_git_dir.join("HEAD")).ok()?;
2943        let h_str = h.trim();
2944        let commit_oid = if let Some(r) = h_str.strip_prefix("ref: ") {
2945            let oid_hex = fs::read_to_string(sub_git_dir.join(r)).ok()?;
2946            ObjectId::from_hex(oid_hex.trim()).ok()?
2947        } else {
2948            ObjectId::from_hex(h_str).ok()?
2949        };
2950        let obj = odb.read(&commit_oid).ok()?;
2951        let commit = parse_commit(&obj.data).ok()?;
2952        Some(commit.tree)
2953    })();
2954
2955    let staged_dirty = sub_head_tree
2956        .as_ref()
2957        .map(|t| diff_index_to_tree(&odb, &sub_index, Some(t)).map(|v| !v.is_empty()))
2958        .unwrap_or(Ok(false));
2959    let staged_dirty = staged_dirty.unwrap_or(false);
2960
2961    let unstaged_dirty = diff_index_to_worktree(&odb, &sub_index, &sub_dir)
2962        .map(|v| !v.is_empty())
2963        .unwrap_or(false);
2964
2965    let modified = staged_dirty || unstaged_dirty;
2966
2967    SubmodulePorcelainFlags {
2968        new_commits,
2969        modified,
2970        untracked,
2971    }
2972}
2973
2974fn submodule_dir_has_untracked_inner(
2975    dir: &Path,
2976    root: &Path,
2977    tracked: &std::collections::BTreeSet<String>,
2978) -> bool {
2979    let entries = match fs::read_dir(dir) {
2980        Ok(e) => e,
2981        Err(_) => return false,
2982    };
2983    let mut sorted: Vec<_> = entries.filter_map(|e| e.ok()).collect();
2984    sorted.sort_by_key(|e| e.file_name());
2985
2986    for entry in sorted {
2987        let name = entry.file_name().to_string_lossy().to_string();
2988        if name == ".git" {
2989            continue;
2990        }
2991        let path = entry.path();
2992        let rel = path
2993            .strip_prefix(root)
2994            .map(|p| p.to_string_lossy().to_string())
2995            .unwrap_or_else(|_| name.clone());
2996
2997        let is_dir = entry.file_type().map(|ft| ft.is_dir()).unwrap_or(false);
2998        if is_dir {
2999            if submodule_dir_has_untracked_inner(&path, root, tracked) {
3000                return true;
3001            }
3002        } else if !tracked.contains(&rel) {
3003            return true;
3004        }
3005    }
3006    false
3007}