Skip to main content

grit_lib/
diff.rs

1//! Diff machinery — compare trees, index entries, and working tree files.
2//!
3//! # Overview
4//!
5//! This module provides the core diffing infrastructure shared by `diff`,
6//! `diff-index`, `status`, `log`, `show`, `commit`, and `merge`.
7//!
8//! ## Levels of comparison
9//!
10//! 1. **Tree-to-tree** — compare two tree objects (e.g. for `log`/`show`).
11//! 2. **Tree-to-index** — compare a tree (usually HEAD) against the index
12//!    (staged changes, used by `diff --cached` and `status`).
13//! 3. **Index-to-worktree** — compare index against the working directory
14//!    (unstaged changes, used by `diff` and `status`).
15//!
16//! ## Content diff
17//!
18//! Line-level diffing uses the `similar` crate (Myers, patience, minimal) and,
19//! for Git's `histogram` algorithm, `imara-diff` for output compatible with upstream Git.
20//! Output formats: unified patch, raw (`:old-mode new-mode ...`), stat,
21//! numstat.
22
23use std::fs;
24#[cfg(unix)]
25use std::os::unix::fs::MetadataExt;
26use std::path::{Path, PathBuf};
27
28use crate::config::ConfigSet;
29use crate::diff_indent_heuristic;
30use crate::error::{Error, Result};
31use crate::index::{Index, IndexEntry};
32use crate::objects::{parse_commit, parse_tree, CommitData, ObjectId, ObjectKind, TreeEntry};
33use crate::odb::Odb;
34use crate::userdiff::FuncnameMatcher;
35
/// Cuts a concatenated unified-diff body into one slice per hunk.
///
/// A hunk begins at offset 0 when `body` starts with `@@`, and right after every
/// newline that is followed by `@@ `. Each slice runs up to the next hunk start (or
/// the end of `body`). Text preceding the first hunk header is not returned.
fn imara_unified_hunk_slices(body: &str) -> Vec<&str> {
    let leading_header = if body.starts_with("@@") {
        Some(0usize)
    } else {
        None
    };
    // Offsets of the `@@` that follows each matched newline.
    let later_headers = body
        .match_indices("\n@@ ")
        .map(|(newline_at, _)| newline_at + 1);

    let boundaries: Vec<usize> = leading_header
        .into_iter()
        .chain(later_headers)
        .chain(std::iter::once(body.len()))
        .collect();

    boundaries
        .iter()
        .zip(boundaries.iter().skip(1))
        .map(|(&from, &to)| &body[from..to])
        .collect()
}
48
/// Builds the unified-diff hunk text (`@@` headers plus ` `/`-`/`+` lines) for the
/// histogram diff of `old_content` against `new_content`.
///
/// # Parameters
///
/// - `context_lines` — number of unchanged lines kept before and after each hunk group.
/// - `inter_hunk_context` — extra gap allowance added to `2 * context_lines` when
///   deciding whether neighbouring hunks are fused into one group.
///
/// Returns an empty string when the diff has no hunks. A line without a trailing
/// newline gets one appended by `push_line`; this function writes no
/// `\ No newline at end of file` marker.
fn histogram_unified_body_raw(
    old_content: &str,
    new_content: &str,
    context_lines: usize,
    inter_hunk_context: usize,
) -> String {
    use imara_diff::{Algorithm, Diff, Hunk, InternedInput};
    use std::fmt::Write as _;

    let input = InternedInput::new(old_content, new_content);
    let mut diff = Diff::compute(Algorithm::Histogram, &input);
    diff.postprocess_lines(&input);

    // Assemble hunks ourselves: imara's `UnifiedDiff` printer starts the first
    // hunk's context at line 0 whenever the first change is within
    // `2 * context_len` of the file start, emitting more leading context than
    // its own header claims (t4061), and its gap threshold cannot express
    // Git's odd `2 * U + inter_hunk_context` fuse limits (t4032).
    let hunks: Vec<Hunk> = diff.hunks().collect();
    if hunks.is_empty() {
        return String::new();
    }

    // Clamp the `usize` settings into the `u32` range used by the hunk offsets.
    let ctx = context_lines.min(u32::MAX as usize) as u32;
    let max_gap = (2usize.saturating_mul(context_lines))
        .saturating_add(inter_hunk_context)
        .min(u32::MAX as usize) as u32;
    let before_len = input.before.len() as u32;
    let after_len = input.after.len() as u32;

    // Fuse hunks whose unchanged gap is at most `max_gap` (Git xdl_get_hunk).
    let mut groups: Vec<&[Hunk]> = Vec::new();
    let mut group_start = 0usize;
    for i in 1..hunks.len() {
        // The gap is measured on the old side only.
        if hunks[i].before.start - hunks[i - 1].before.end > max_gap {
            groups.push(&hunks[group_start..i]);
            group_start = i;
        }
    }
    groups.push(&hunks[group_start..]);

    // Appends `prefix` + `text`, terminating the line if `text` has no newline.
    fn push_line(out: &mut String, prefix: char, text: &str) {
        out.push(prefix);
        out.push_str(text);
        if !text.ends_with('\n') {
            out.push('\n');
        }
    }

    // Git hunk header range: 1-based start (the preceding line when the range
    // is empty) with the `,count` part omitted when the count is exactly 1.
    fn fmt_side(start: u32, count: u32) -> String {
        let shown_start = if count == 0 { start } else { start + 1 };
        if count == 1 {
            format!("{shown_start}")
        } else {
            format!("{shown_start},{count}")
        }
    }

    let mut out = String::new();
    for group in groups {
        let first = &group[0];
        let last = &group[group.len() - 1];
        // Widen the group by `ctx` lines on each side, clamped to the file bounds.
        let b_start = first.before.start.saturating_sub(ctx);
        let a_start = first.after.start.saturating_sub(ctx);
        let b_end = (last.before.end.saturating_add(ctx)).min(before_len);
        let a_end = (last.after.end.saturating_add(ctx)).min(after_len);

        // Writing into a `String`; the `fmt::Result` is deliberately discarded.
        let _ = writeln!(
            out,
            "@@ -{} +{} @@",
            fmt_side(b_start, b_end - b_start),
            fmt_side(a_start, a_end - a_start)
        );

        // `pos` tracks the next old-side line not yet written.
        let mut pos = b_start;
        for hunk in group {
            // Unchanged lines between the previous hunk (or group start) and this hunk.
            for &token in &input.before[pos as usize..hunk.before.start as usize] {
                push_line(&mut out, ' ', input.interner[token]);
            }
            for &token in &input.before[hunk.before.start as usize..hunk.before.end as usize] {
                push_line(&mut out, '-', input.interner[token]);
            }
            for &token in &input.after[hunk.after.start as usize..hunk.after.end as usize] {
                push_line(&mut out, '+', input.interner[token]);
            }
            pos = hunk.before.end;
        }
        // Trailing context after the last hunk of the group.
        for &token in &input.before[pos as usize..b_end as usize] {
            push_line(&mut out, ' ', input.interner[token]);
        }
    }

    out
}
145
/// Unified diff hunks for Git's histogram algorithm (no `---` / `+++` lines).
///
/// Used by `--no-index` when whitespace normalization is off so the patch matches upstream Git.
///
/// This is a public entry point that forwards all four arguments unchanged to
/// `histogram_unified_body_raw` and returns its result.
#[must_use]
pub fn unified_diff_histogram_hunks_only(
    old_content: &str,
    new_content: &str,
    context_lines: usize,
    inter_hunk_context: usize,
) -> String {
    histogram_unified_body_raw(old_content, new_content, context_lines, inter_hunk_context)
}
158
159/// Full unified diff (`---` / `+++` / hunks) using Git's histogram algorithm.
160#[must_use]
161pub fn unified_diff_histogram_with_prefix_and_funcname(
162    old_content: &str,
163    new_content: &str,
164    old_path: &str,
165    new_path: &str,
166    context_lines: usize,
167    inter_hunk_context: usize,
168    src_prefix: &str,
169    dst_prefix: &str,
170    funcname_matcher: Option<&FuncnameMatcher>,
171    quote_path_fully: bool,
172) -> String {
173    use crate::quote_path::format_diff_path_with_prefix;
174
175    let body =
176        histogram_unified_body_raw(old_content, new_content, context_lines, inter_hunk_context);
177
178    let mut output = String::new();
179    if old_path == "/dev/null" {
180        output.push_str("--- /dev/null\n");
181    } else if src_prefix.is_empty() {
182        output.push_str(&format!("--- {old_path}\n"));
183    } else {
184        output.push_str("--- ");
185        output.push_str(&format_diff_path_with_prefix(
186            src_prefix,
187            old_path,
188            quote_path_fully,
189        ));
190        output.push('\n');
191    }
192    if new_path == "/dev/null" {
193        output.push_str("+++ /dev/null\n");
194    } else if dst_prefix.is_empty() {
195        output.push_str(&format!("+++ {new_path}\n"));
196    } else {
197        output.push_str("+++ ");
198        output.push_str(&format_diff_path_with_prefix(
199            dst_prefix,
200            new_path,
201            quote_path_fully,
202        ));
203        output.push('\n');
204    }
205
206    let old_lines: Vec<&str> = old_content.lines().collect();
207    for hunk_str in imara_unified_hunk_slices(&body) {
208        if hunk_str.is_empty() {
209            continue;
210        }
211        if let Some(first_newline) = hunk_str.find('\n') {
212            let header_line = &hunk_str[..first_newline];
213            let rest = &hunk_str[first_newline..];
214            if let Some(func_ctx) =
215                extract_function_context(header_line, &old_lines, funcname_matcher)
216            {
217                output.push_str(header_line);
218                output.push(' ');
219                output.push_str(&func_ctx);
220                output.push_str(rest);
221            } else {
222                output.push_str(hunk_str);
223            }
224        } else {
225            output.push_str(hunk_str);
226        }
227    }
228
229    output
230}
231
232/// `diff.indentHeuristic` from config (Git defaults to true when unset).
233#[must_use]
234pub fn indent_heuristic_from_config(config: &ConfigSet) -> bool {
235    match config.get_bool("diff.indentHeuristic") {
236        Some(Ok(b)) => b,
237        Some(Err(_)) | None => true,
238    }
239}
240
241/// Resolve indent heuristic: `--no-indent-heuristic` and `--indent-heuristic` override config.
242#[must_use]
243pub fn resolve_indent_heuristic(
244    config: &ConfigSet,
245    cli_indent_heuristic: bool,
246    cli_no_indent_heuristic: bool,
247) -> bool {
248    if cli_no_indent_heuristic {
249        false
250    } else if cli_indent_heuristic {
251        true
252    } else {
253        indent_heuristic_from_config(config)
254    }
255}
256
/// Scans a plumbing argv slice for `--indent-heuristic` / `--no-indent-heuristic`.
///
/// Returns `(indent_heuristic, no_indent_heuristic)`. The last occurrence of either
/// flag wins, so at most one element is `true`; both are `false` when neither flag
/// appears.
#[must_use]
pub fn parse_indent_heuristic_cli_flags(argv: &[String]) -> (bool, bool) {
    // Walking backwards, the first flag found is the last one given.
    argv.iter()
        .rev()
        .find_map(|arg| match arg.as_str() {
            "--indent-heuristic" => Some((true, false)),
            "--no-indent-heuristic" => Some((false, true)),
            _ => None,
        })
        .unwrap_or((false, false))
}
277
278/// Diff two token streams with imara's Myers implementation (Git's default xdiff engine) and
279/// return the result as `similar::DiffOp`s. Used by the word-diff machinery, where matching
280/// Git's exact LCS tie-breaking matters (`similar`'s Myers picks different — but equally
281/// minimal — alignments, mismatching Git's reference output for e.g. the `ada` driver).
282#[must_use]
283pub fn word_diff_ops_imara(old_words: &[&str], new_words: &[&str]) -> Vec<similar::DiffOp> {
284    use imara_diff::{Algorithm, Diff, InternedInput};
285    use similar::DiffOp;
286
287    let mut input: InternedInput<&str> = InternedInput::default();
288    input.update_before(old_words.iter().copied());
289    input.update_after(new_words.iter().copied());
290    let mut diff = Diff::compute(Algorithm::Myers, &input);
291    diff.postprocess_lines(&input);
292
293    let mut ops: Vec<DiffOp> = Vec::new();
294    let mut old_pos = 0usize;
295    let mut new_pos = 0usize;
296    for hunk in diff.hunks() {
297        let b_start = hunk.before.start as usize;
298        let b_end = hunk.before.end as usize;
299        let a_start = hunk.after.start as usize;
300        let a_end = hunk.after.end as usize;
301        if b_start > old_pos {
302            let len = b_start - old_pos;
303            ops.push(DiffOp::Equal {
304                old_index: old_pos,
305                new_index: new_pos,
306                len,
307            });
308        }
309        let del = b_end - b_start;
310        let ins = a_end - a_start;
311        if del > 0 && ins > 0 {
312            ops.push(DiffOp::Replace {
313                old_index: b_start,
314                old_len: del,
315                new_index: a_start,
316                new_len: ins,
317            });
318        } else if del > 0 {
319            ops.push(DiffOp::Delete {
320                old_index: b_start,
321                old_len: del,
322                new_index: a_start,
323            });
324        } else if ins > 0 {
325            ops.push(DiffOp::Insert {
326                old_index: b_start,
327                new_index: a_start,
328                new_len: ins,
329            });
330        }
331        old_pos = b_end;
332        new_pos = a_end;
333    }
334    if old_pos < old_words.len() {
335        ops.push(DiffOp::Equal {
336            old_index: old_pos,
337            new_index: new_pos,
338            len: old_words.len() - old_pos,
339        });
340    }
341    // Slide changed runs to Git's canonical position (`xdl_change_compact`); the word
342    // diff never enables the indent heuristic.
343    diff_indent_heuristic::apply_change_compact_to_ops(&ops, old_words, new_words, false)
344}
345
/// Line-diff ops for string slices after Git `xdl_change_compact` (and optional indent heuristic).
///
/// Public re-export point: forwards `old_lines`, `new_lines`, `algorithm` and
/// `indent_heuristic` unchanged to `diff_indent_heuristic::diff_slice_ops_compacted`
/// and returns its ops.
#[must_use]
pub fn diff_slice_ops_compacted(
    old_lines: &[&str],
    new_lines: &[&str],
    algorithm: similar::Algorithm,
    indent_heuristic: bool,
) -> Vec<similar::DiffOp> {
    diff_indent_heuristic::diff_slice_ops_compacted(
        old_lines,
        new_lines,
        algorithm,
        indent_heuristic,
    )
}
361
/// Map each line in `new_joined` to its origin in `old_joined` after Git-style compaction (for blame).
///
/// Computes the compacted line ops with `diff_indent_heuristic::diff_lines_ops_compacted`
/// and hands them, together with `new_line_count`, to
/// `diff_indent_heuristic::map_new_to_old_from_ops`, whose result is returned as-is.
// NOTE(review): presumably the returned vector has one slot per new line and `None`
// marks a line with no old counterpart — confirm against `map_new_to_old_from_ops`.
#[must_use]
pub fn map_new_to_old_lines_compacted(
    old_joined: &str,
    new_joined: &str,
    algorithm: similar::Algorithm,
    indent_heuristic: bool,
    new_line_count: usize,
) -> Vec<Option<usize>> {
    let ops = diff_indent_heuristic::diff_lines_ops_compacted(
        old_joined,
        new_joined,
        algorithm,
        indent_heuristic,
    );
    diff_indent_heuristic::map_new_to_old_from_ops(&ops, new_line_count)
}
379
/// Classification of a single changed path in a diff.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DiffStatus {
    /// The path exists only on the new side.
    Added,
    /// The path exists only on the old side.
    Deleted,
    /// Content or mode differs between the two sides.
    Modified,
    /// The path was renamed (possibly with content changes).
    Renamed,
    /// The path was copied from another path.
    Copied,
    /// The file type changed (e.g. regular → symlink).
    TypeChanged,
    /// The path is unmerged (conflict).
    Unmerged,
}

impl DiffStatus {
    /// Returns the one-character status code used in raw diff output.
    #[must_use]
    pub fn letter(&self) -> char {
        match *self {
            DiffStatus::Modified => 'M',
            DiffStatus::Added => 'A',
            DiffStatus::Deleted => 'D',
            DiffStatus::Renamed => 'R',
            DiffStatus::Copied => 'C',
            DiffStatus::TypeChanged => 'T',
            DiffStatus::Unmerged => 'U',
        }
    }
}
414
/// A single diff entry representing one changed path.
///
/// Absent sides are represented with a `None` path, the mode string `"000000"`
/// and the zero object ID (see the constructors of added/deleted entries in this module).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DiffEntry {
    /// The status of this change.
    pub status: DiffStatus,
    /// Path in the "old" side (None for Added).
    pub old_path: Option<String>,
    /// Path in the "new" side (None for Deleted).
    pub new_path: Option<String>,
    /// Old file mode (as octal string, e.g. "100644"; "000000" when there is no old side).
    pub old_mode: String,
    /// New file mode (same format as `old_mode`; "000000" when there is no new side).
    pub new_mode: String,
    /// Old object ID (zero OID for Added).
    pub old_oid: ObjectId,
    /// New object ID (zero OID for Deleted).
    pub new_oid: ObjectId,
    /// Similarity score (0–100) for renames/copies; `None` for the entries built in this part of the module.
    pub score: Option<u32>,
}
435
436impl DiffEntry {
437    /// The primary path for display (new_path for adds, old_path for deletes).
438    #[must_use]
439    pub fn path(&self) -> &str {
440        self.new_path
441            .as_deref()
442            .or(self.old_path.as_deref())
443            .unwrap_or("")
444    }
445
446    /// Return a human-oriented path display for this entry.
447    ///
448    /// For renames and copies this returns `old -> new`; for all other entry
449    /// kinds this returns the primary path.
450    #[must_use]
451    pub fn display_path(&self) -> String {
452        match self.status {
453            DiffStatus::Renamed | DiffStatus::Copied => {
454                let old = self.old_path.as_deref().unwrap_or("");
455                let new = self.new_path.as_deref().unwrap_or("");
456                if old.is_empty() || new.is_empty() {
457                    self.path().to_owned()
458                } else {
459                    format!("{old} -> {new}")
460                }
461            }
462            _ => self.path().to_owned(),
463        }
464    }
465}
466
/// The zero (null) object ID used for "no object" in diff output.
///
/// This is the textual (all-`0` hex digits) form; use `zero_oid()` for the
/// corresponding `ObjectId` value.
pub const ZERO_OID: &str = "0000000000000000000000000000000000000000";
469
470/// Return the zero ObjectId.
471#[must_use]
472pub fn zero_oid() -> ObjectId {
473    ObjectId::from_bytes(&[0u8; 20]).unwrap_or_else(|_| {
474        // This should never fail since we pass exactly 20 bytes
475        panic!("internal error: failed to create zero OID");
476    })
477}
478
479/// Return the ObjectId for the empty blob object.
480#[must_use]
481pub fn empty_blob_oid() -> ObjectId {
482    ObjectId::from_hex("e69de29bb2d1d6434b8b29ae775ad8c2e48c5391").unwrap_or_else(|_| {
483        // This should never fail since the object ID literal is valid.
484        panic!("internal error: failed to create empty blob OID");
485    })
486}
487
488// ── Tree-to-tree diff ───────────────────────────────────────────────
489
/// Compare two trees and return the list of changed entries.
///
/// Equivalent to `diff_trees_opts` with `show_trees` set to `false`: entries for
/// tree objects themselves are not emitted, only their recursive contents.
///
/// # Parameters
///
/// - `odb` — object database to read tree objects from.
/// - `old_tree_oid` — OID of the old tree (or `None` for comparison against empty).
/// - `new_tree_oid` — OID of the new tree (or `None` for comparison against empty).
/// - `prefix` — path prefix for nested tree recursion (empty string for root).
///
/// # Errors
///
/// Returns errors from object database reads.
pub fn diff_trees(
    odb: &Odb,
    old_tree_oid: Option<&ObjectId>,
    new_tree_oid: Option<&ObjectId>,
    prefix: &str,
) -> Result<Vec<DiffEntry>> {
    diff_trees_opts(odb, old_tree_oid, new_tree_oid, prefix, false)
}
510
/// Like `diff_trees` but with `show_trees` flag: when true, emit entries for
/// tree objects themselves in addition to their recursive contents (the `-t`
/// flag of `diff-tree`).
///
/// This wrapper always passes `show_trees = true` to `diff_trees_opts`; the
/// remaining parameters have the same meaning as for `diff_trees`.
///
/// # Errors
///
/// Propagates any error returned by `diff_trees_opts`.
pub fn diff_trees_show_tree_entries(
    odb: &Odb,
    old_tree_oid: Option<&ObjectId>,
    new_tree_oid: Option<&ObjectId>,
    prefix: &str,
) -> Result<Vec<DiffEntry>> {
    diff_trees_opts(odb, old_tree_oid, new_tree_oid, prefix, true)
}
522
523fn diff_trees_opts(
524    odb: &Odb,
525    old_tree_oid: Option<&ObjectId>,
526    new_tree_oid: Option<&ObjectId>,
527    prefix: &str,
528    show_trees: bool,
529) -> Result<Vec<DiffEntry>> {
530    let old_entries = match old_tree_oid {
531        Some(oid) => read_tree(odb, oid)?,
532        None => Vec::new(),
533    };
534    let new_entries = match new_tree_oid {
535        Some(oid) => read_tree(odb, oid)?,
536        None => Vec::new(),
537    };
538
539    let mut result = Vec::new();
540    diff_tree_entries_opts(
541        odb,
542        &old_entries,
543        &new_entries,
544        prefix,
545        show_trees,
546        &mut result,
547    )?;
548    Ok(result)
549}
550
551/// Read and parse a tree object from the ODB.
552fn read_tree(odb: &Odb, oid: &ObjectId) -> Result<Vec<TreeEntry>> {
553    let obj = odb.read(oid)?;
554    if obj.kind != ObjectKind::Tree {
555        return Err(Error::CorruptObject(format!(
556            "expected tree, got {}",
557            obj.kind.as_str()
558        )));
559    }
560    parse_tree(&obj.data)
561}
562
563/// Compare two sorted lists of tree entries, recursing into subtrees.
564fn diff_tree_entries_opts(
565    odb: &Odb,
566    old: &[TreeEntry],
567    new: &[TreeEntry],
568    prefix: &str,
569    show_trees: bool,
570    result: &mut Vec<DiffEntry>,
571) -> Result<()> {
572    let mut oi = 0;
573    let mut ni = 0;
574
575    while oi < old.len() || ni < new.len() {
576        match (old.get(oi), new.get(ni)) {
577            (Some(o), Some(n)) => {
578                let cmp = crate::objects::tree_entry_cmp(
579                    &o.name,
580                    is_tree_mode(o.mode),
581                    &n.name,
582                    is_tree_mode(n.mode),
583                );
584                match cmp {
585                    std::cmp::Ordering::Less => {
586                        // Old entry not in new → deleted
587                        emit_deleted_opts(odb, o, prefix, show_trees, result)?;
588                        oi += 1;
589                    }
590                    std::cmp::Ordering::Greater => {
591                        // New entry not in old → added
592                        emit_added_opts(odb, n, prefix, show_trees, result)?;
593                        ni += 1;
594                    }
595                    std::cmp::Ordering::Equal => {
596                        // Both present — check for changes
597                        if o.oid != n.oid || o.mode != n.mode {
598                            let name_str = String::from_utf8_lossy(&o.name);
599                            let path = format_path(prefix, &name_str);
600                            if is_tree_mode(o.mode) && is_tree_mode(n.mode) {
601                                // Both are trees
602                                if show_trees {
603                                    result.push(DiffEntry {
604                                        status: DiffStatus::Modified,
605                                        old_path: Some(path.clone()),
606                                        new_path: Some(path.clone()),
607                                        old_mode: format_mode(o.mode),
608                                        new_mode: format_mode(n.mode),
609                                        old_oid: o.oid,
610                                        new_oid: n.oid,
611                                        score: None,
612                                    });
613                                }
614                                // Recurse
615                                let nested = diff_trees_opts(
616                                    odb,
617                                    Some(&o.oid),
618                                    Some(&n.oid),
619                                    &path,
620                                    show_trees,
621                                )?;
622                                result.extend(nested);
623                            } else if is_tree_mode(o.mode) && !is_tree_mode(n.mode) {
624                                // Tree → blob: delete tree contents, add blob
625                                emit_deleted_opts(odb, o, prefix, show_trees, result)?;
626                                emit_added_opts(odb, n, prefix, show_trees, result)?;
627                            } else if !is_tree_mode(o.mode) && is_tree_mode(n.mode) {
628                                // Blob → tree: delete blob, add tree contents
629                                emit_deleted_opts(odb, o, prefix, show_trees, result)?;
630                                emit_added_opts(odb, n, prefix, show_trees, result)?;
631                            } else {
632                                // Both blobs — modified.
633                                // A mode-only change (e.g. chmod) is Modified.
634                                // TypeChanged is only for actual type changes (blob ↔ symlink).
635                                let old_type = o.mode & 0o170000;
636                                let new_type = n.mode & 0o170000;
637                                result.push(DiffEntry {
638                                    status: if old_type != new_type {
639                                        DiffStatus::TypeChanged
640                                    } else {
641                                        DiffStatus::Modified
642                                    },
643                                    old_path: Some(path.clone()),
644                                    new_path: Some(path),
645                                    old_mode: format_mode(o.mode),
646                                    new_mode: format_mode(n.mode),
647                                    old_oid: o.oid,
648                                    new_oid: n.oid,
649                                    score: None,
650                                });
651                            }
652                        }
653                        oi += 1;
654                        ni += 1;
655                    }
656                }
657            }
658            (Some(o), None) => {
659                emit_deleted_opts(odb, o, prefix, show_trees, result)?;
660                oi += 1;
661            }
662            (None, Some(n)) => {
663                emit_added_opts(odb, n, prefix, show_trees, result)?;
664                ni += 1;
665            }
666            (None, None) => break,
667        }
668    }
669
670    Ok(())
671}
672
673fn emit_deleted_opts(
674    odb: &Odb,
675    entry: &TreeEntry,
676    prefix: &str,
677    show_trees: bool,
678    result: &mut Vec<DiffEntry>,
679) -> Result<()> {
680    let name_str = String::from_utf8_lossy(&entry.name);
681    let path = format_path(prefix, &name_str);
682    if is_tree_mode(entry.mode) {
683        if show_trees {
684            result.push(DiffEntry {
685                status: DiffStatus::Deleted,
686                old_path: Some(path.clone()),
687                new_path: None,
688                old_mode: format_mode(entry.mode),
689                new_mode: "000000".to_owned(),
690                old_oid: entry.oid,
691                new_oid: zero_oid(),
692                score: None,
693            });
694        }
695        // Recurse into deleted tree
696        let nested = diff_trees_opts(odb, Some(&entry.oid), None, &path, show_trees)?;
697        result.extend(nested);
698    } else {
699        result.push(DiffEntry {
700            status: DiffStatus::Deleted,
701            old_path: Some(path.clone()),
702            new_path: None,
703            old_mode: format_mode(entry.mode),
704            new_mode: "000000".to_owned(),
705            old_oid: entry.oid,
706            new_oid: zero_oid(),
707            score: None,
708        });
709    }
710    Ok(())
711}
712
713fn emit_added_opts(
714    odb: &Odb,
715    entry: &TreeEntry,
716    prefix: &str,
717    show_trees: bool,
718    result: &mut Vec<DiffEntry>,
719) -> Result<()> {
720    let name_str = String::from_utf8_lossy(&entry.name);
721    let path = format_path(prefix, &name_str);
722    if is_tree_mode(entry.mode) {
723        if show_trees {
724            result.push(DiffEntry {
725                status: DiffStatus::Added,
726                old_path: None,
727                new_path: Some(path.clone()),
728                old_mode: "000000".to_owned(),
729                new_mode: format_mode(entry.mode),
730                old_oid: zero_oid(),
731                new_oid: entry.oid,
732                score: None,
733            });
734        }
735        // Recurse into added tree
736        let nested = diff_trees_opts(odb, None, Some(&entry.oid), &path, show_trees)?;
737        result.extend(nested);
738    } else {
739        result.push(DiffEntry {
740            status: DiffStatus::Added,
741            old_path: None,
742            new_path: Some(path),
743            old_mode: "000000".to_owned(),
744            new_mode: format_mode(entry.mode),
745            old_oid: zero_oid(),
746            new_oid: entry.oid,
747            score: None,
748        });
749    }
750    Ok(())
751}
752
753// ── Index-to-tree diff (staged changes) ─────────────────────────────
754
/// Compare the index against a tree (usually HEAD's tree).
///
/// This shows "staged" changes — what would be committed.
///
/// # Parameters
///
/// - `odb` — object database.
/// - `index` — the current index.
/// - `tree_oid` — the tree to compare against (e.g. HEAD's tree), or `None`
///   for comparison against an empty tree (initial commit).
/// - `ignore_submodules` — when true, gitlink (`160000`) paths are left out of the result.
///
/// # Returns
///
/// The changed entries sorted by [`DiffEntry::path`]. Paths that only have
/// non-zero-stage index entries are reported once with [`DiffStatus::Unmerged`],
/// zero OIDs on both sides and the mode of the preferred stage (2, then 3, then 1).
///
/// # Errors
///
/// Returns errors from ODB reads.
///
/// When `ignore_submodules` is true, gitlink (`160000`) paths are omitted from the diff, matching
/// Git's `require_clean_work_tree(..., ignore_submodules=1)` used by `git rebase` / `git pull`.
pub fn diff_index_to_tree(
    odb: &Odb,
    index: &Index,
    tree_oid: Option<&ObjectId>,
    ignore_submodules: bool,
) -> Result<Vec<DiffEntry>> {
    // Flatten the tree into a sorted list of (path, mode, oid)
    let tree_entries = match tree_oid {
        Some(oid) => flatten_tree(odb, oid, "")?,
        None => Vec::new(),
    };

    // Build maps keyed by path
    let mut tree_map: std::collections::BTreeMap<&str, &FlatEntry> =
        std::collections::BTreeMap::new();
    for entry in &tree_entries {
        tree_map.insert(&entry.path, entry);
    }

    let mut result = Vec::new();
    // Paths that have a stage-0 (merged) index entry.
    let mut stage0_paths = std::collections::BTreeSet::new();
    // Per unmerged path: (rank of the stage chosen so far, its mode); lower rank wins.
    let mut unmerged_modes: std::collections::BTreeMap<String, (u8, u32)> =
        std::collections::BTreeMap::new();

    // Check index entries against tree
    for ie in &index.entries {
        let path = String::from_utf8_lossy(&ie.path).to_string();
        if ie.stage() == 0 && ie.intent_to_add() {
            // Intent-to-add entries are not "staged" for diff-index / status
            // (matches Git: `git diff --cached` is empty for `-N` paths).
            continue;
        }
        if ie.stage() != 0 {
            // Preference order for the reported mode: stage 2, then 3, then 1.
            let rank = match ie.stage() {
                2 => 0u8,
                3 => 1u8,
                1 => 2u8,
                _ => 3u8,
            };
            match unmerged_modes.get(&path) {
                // Keep the already recorded stage when it is at least as preferred.
                Some((existing_rank, _)) if *existing_rank <= rank => {}
                _ => {
                    unmerged_modes.insert(path, (rank, ie.mode));
                }
            }
            continue;
        }
        if ignore_submodules && ie.mode == 0o160000 {
            // Drop the tree side too so the path is not later reported as deleted.
            let _ = tree_map.remove(path.as_str());
            stage0_paths.insert(path.clone());
            continue;
        }
        stage0_paths.insert(path.clone());
        match tree_map.remove(path.as_str()) {
            Some(te) => {
                // Present in both — check for differences
                if te.oid != ie.oid || te.mode != ie.mode {
                    result.push(DiffEntry {
                        status: DiffStatus::Modified,
                        old_path: Some(path.clone()),
                        new_path: Some(path),
                        old_mode: format_mode(te.mode),
                        new_mode: format_mode(ie.mode),
                        old_oid: te.oid,
                        new_oid: ie.oid,
                        score: None,
                    });
                }
            }
            None => {
                // In index but not tree → added
                result.push(DiffEntry {
                    status: DiffStatus::Added,
                    old_path: None,
                    new_path: Some(path),
                    old_mode: "000000".to_owned(),
                    new_mode: format_mode(ie.mode),
                    old_oid: zero_oid(),
                    new_oid: ie.oid,
                    score: None,
                });
            }
        }
    }

    // Report unmerged paths, unless a stage-0 entry for the same path was seen.
    for (path, (_, mode)) in &unmerged_modes {
        if stage0_paths.contains(path) {
            continue;
        }
        // Remove the tree side so the path is not additionally reported as deleted.
        tree_map.remove(path.as_str());
        result.push(DiffEntry {
            status: DiffStatus::Unmerged,
            old_path: Some(path.clone()),
            new_path: Some(path.clone()),
            old_mode: "000000".to_owned(),
            new_mode: format_mode(*mode),
            old_oid: zero_oid(),
            new_oid: zero_oid(),
            score: None,
        });
    }

    // Remaining tree entries not in index → deleted
    for (path, te) in tree_map {
        if ignore_submodules && te.mode == 0o160000 {
            continue;
        }
        result.push(DiffEntry {
            status: DiffStatus::Deleted,
            old_path: Some(path.to_owned()),
            new_path: None,
            old_mode: format_mode(te.mode),
            new_mode: "000000".to_owned(),
            old_oid: te.oid,
            new_oid: zero_oid(),
            score: None,
        });
    }

    // The three passes above append in different orders; restore path order.
    result.sort_by(|a, b| a.path().cmp(b.path()));
    Ok(result)
}
894
895// ── Index-to-worktree diff (unstaged changes) ───────────────────────
896
897/// Compare the index against the working tree.
898///
899/// This shows "unstaged" changes — modifications not yet staged.
900///
901/// Entries with [`IndexEntry::assume_unchanged`] or [`IndexEntry::skip_worktree`] are treated as
902/// matching the work tree without examining the filesystem (Git `CE_VALID` / skip-worktree).
903///
904/// # Parameters
905///
906/// - `odb` — object database (for hashing worktree files).
907/// - `index` — the current index.
908/// - `work_tree` — path to the working tree root.
909/// - `ignore_submodule_untracked` — when true, gitlink entries are not dirty solely from untracked
910///   files inside the submodule (matches `git status -uno`).
911/// - `simplify_gitlinks` — when true, nested gitlink entries only compare the submodule checkout
912///   HEAD to the recorded OID (ignore dirty work trees inside nested submodules). Used when
913///   computing `submodule_porcelain_flags` so untracked files under a nested submodule do not set
914///   the parent submodule's `modified` bit (Git `DIRTY_SUBMODULE_MODIFIED`; t7506).
915///
916/// # Errors
917///
918/// Returns errors from I/O or hashing.
919pub fn diff_index_to_worktree(
920    odb: &Odb,
921    index: &Index,
922    work_tree: &Path,
923    ignore_submodule_untracked: bool,
924    simplify_gitlinks: bool,
925) -> Result<Vec<DiffEntry>> {
926    diff_index_to_worktree_with_options(
927        odb,
928        index,
929        work_tree,
930        DiffIndexToWorktreeOptions {
931            ignore_submodule_untracked,
932            simplify_gitlinks,
933            ..DiffIndexToWorktreeOptions::default()
934        },
935    )
936}
937
/// Tuning knobs accepted by [`diff_index_to_worktree_with_options`].
///
/// The [`Default`] value leaves `index_mtime` unset and every flag off.
#[derive(Clone, Copy, Debug, Default)]
pub struct DiffIndexToWorktreeOptions {
    /// Index mtime as `(sec, nsec)`, sampled when the index was read, if known.
    ///
    /// With a value present, an entry whose stat data matches is still re-examined when its
    /// recorded mtime is "racy" (at or after this timestamp), following Git's
    /// `is_racy_timestamp` behavior.
    pub index_mtime: Option<(u32, u32)>,
    /// Do not report a gitlink as dirty merely because the submodule contains untracked files.
    pub ignore_submodule_untracked: bool,
    /// For gitlink entries, only compare the submodule checkout HEAD to the recorded OID.
    pub simplify_gitlinks: bool,
    /// Turn a populated gitlink checkout whose `.git` indirection cannot resolve to a HEAD into
    /// an error instead of reporting it as an ordinary modified gitlink.
    pub error_on_broken_gitlinks: bool,
}
955
956/// Compare the index against the working tree with optional racy-timestamp context.
957///
958/// This variant enables a stat-trust fast path: if an entry's stat tuple matches and the mode is
959/// unchanged, the worktree blob hash is skipped unless the entry is racy relative to the supplied
960/// index mtime.
961///
962/// # Parameters
963///
964/// - `odb` — object database (for hashing worktree files).
965/// - `index` — the current index.
966/// - `work_tree` — path to the working tree root.
967/// - `options` — optional context for racy timestamp checks.
968///
969/// # Errors
970///
971/// Returns errors from I/O or hashing.
972pub fn diff_index_to_worktree_with_options(
973    odb: &Odb,
974    index: &Index,
975    work_tree: &Path,
976    options: DiffIndexToWorktreeOptions,
977) -> Result<Vec<DiffEntry>> {
978    use crate::config::ConfigSet;
979    use crate::crlf;
980
981    let ignore_submodule_untracked = options.ignore_submodule_untracked;
982    let simplify_gitlinks = options.simplify_gitlinks;
983
984    let git_dir = work_tree.join(".git");
985    let config = ConfigSet::load(Some(&git_dir), true).unwrap_or_else(|_| ConfigSet::new());
986    let conv = crlf::ConversionConfig::from_config(&config);
987    let attrs = crlf::load_gitattributes(work_tree);
988
989    let mut result = Vec::new();
990    let mut unmerged_base: std::collections::BTreeMap<String, (u8, &IndexEntry)> =
991        std::collections::BTreeMap::new();
992
993    for ie in &index.entries {
994        if ie.stage() != 0 {
995            let path = String::from_utf8_lossy(&ie.path).to_string();
996            let rank = match ie.stage() {
997                2 => 0u8,
998                3 => 1u8,
999                1 => 2u8,
1000                _ => 3u8,
1001            };
1002            match unmerged_base.get(&path) {
1003                Some((existing_rank, _)) if *existing_rank <= rank => {}
1004                _ => {
1005                    unmerged_base.insert(path, (rank, ie));
1006                }
1007            }
1008            continue;
1009        }
1010        // Sparse checkout: paths outside the cone are not expected on disk; `assume_unchanged`
1011        // is treated as clean without reading the filesystem (wt-status.c).
1012        if ie.skip_worktree() || ie.assume_unchanged() {
1013            continue;
1014        }
1015        // Use str slice directly to avoid allocation for path joining;
1016        // only allocate String if we need it for DiffEntry output.
1017        let path_str_ref = std::str::from_utf8(&ie.path).unwrap_or("");
1018        let is_intent_to_add = ie.intent_to_add();
1019
1020        // Gitlink entries (submodules): Git's `diff-index` reports `M` when the recorded
1021        // commit differs from the submodule checkout **or** when the submodule work tree is
1022        // dirty (staged/unstaged/untracked) even if HEAD still matches the gitlink. For the
1023        // latter case the "new" OID column is the null OID (see `git diff-index` / t7506).
1024        if ie.mode == 0o160000 {
1025            let sub_dir = work_tree.join(path_str_ref);
1026            let sub_head_oid = read_submodule_head_oid(&sub_dir);
1027            let ref_matches = if let Some(oid) = sub_head_oid {
1028                oid == ie.oid
1029            } else {
1030                let is_placeholder = submodule_worktree_is_unpopulated_placeholder(&sub_dir);
1031                if options.error_on_broken_gitlinks
1032                    && !is_placeholder
1033                    && submodule_embedded_git_dir(&sub_dir).is_some()
1034                {
1035                    return Err(Error::ConfigError(format!(
1036                        "could not read submodule HEAD for '{path_str_ref}'"
1037                    )));
1038                }
1039                is_placeholder
1040            };
1041            if simplify_gitlinks {
1042                if !ref_matches {
1043                    let path_owned = path_str_ref.to_owned();
1044                    let new_oid = sub_head_oid.unwrap_or_else(zero_oid);
1045                    result.push(DiffEntry {
1046                        status: DiffStatus::Modified,
1047                        old_path: Some(path_owned.clone()),
1048                        new_path: Some(path_owned),
1049                        old_mode: format_mode(ie.mode),
1050                        new_mode: format_mode(ie.mode),
1051                        old_oid: ie.oid,
1052                        new_oid,
1053                        score: None,
1054                    });
1055                }
1056                continue;
1057            }
1058            let mut flags = submodule_porcelain_flags(work_tree, path_str_ref, ie.oid);
1059            if ignore_submodule_untracked {
1060                flags.untracked = false;
1061            }
1062            let inner_dirty = flags.modified || flags.untracked;
1063            if !ref_matches || inner_dirty {
1064                let path_owned = path_str_ref.to_owned();
1065                let new_oid = if !ref_matches {
1066                    sub_head_oid.unwrap_or_else(zero_oid)
1067                } else {
1068                    zero_oid()
1069                };
1070                result.push(DiffEntry {
1071                    status: DiffStatus::Modified,
1072                    old_path: Some(path_owned.clone()),
1073                    new_path: Some(path_owned),
1074                    old_mode: format_mode(ie.mode),
1075                    new_mode: format_mode(ie.mode),
1076                    old_oid: ie.oid,
1077                    new_oid,
1078                    score: None,
1079                });
1080            }
1081            continue;
1082        }
1083
1084        let file_path = work_tree.join(path_str_ref);
1085
1086        if is_intent_to_add {
1087            match fs::symlink_metadata(&file_path) {
1088                Ok(meta) => {
1089                    let file_attrs = crlf::get_file_attrs(&attrs, path_str_ref, false, &config);
1090                    let worktree_oid = hash_worktree_file(
1091                        odb,
1092                        &file_path,
1093                        &meta,
1094                        &conv,
1095                        &file_attrs,
1096                        path_str_ref,
1097                        None,
1098                    )?;
1099                    let worktree_mode = mode_from_metadata(&meta);
1100                    result.push(DiffEntry {
1101                        status: DiffStatus::Added,
1102                        old_path: None,
1103                        new_path: Some(path_str_ref.to_owned()),
1104                        old_mode: "000000".to_owned(),
1105                        new_mode: format_mode(worktree_mode),
1106                        // `ita_invisible_in_index`: null OID on the index side for patch output
1107                        // (`index 0000000..`, t2203); index entry still stores the empty blob.
1108                        old_oid: zero_oid(),
1109                        new_oid: worktree_oid,
1110                        score: None,
1111                    });
1112                }
1113                Err(e)
1114                    if e.kind() == std::io::ErrorKind::NotFound
1115                        || e.raw_os_error() == Some(20) /* ENOTDIR */ =>
1116                {
1117                    result.push(DiffEntry {
1118                        status: DiffStatus::Deleted,
1119                        old_path: Some(path_str_ref.to_owned()),
1120                        new_path: None,
1121                        old_mode: format_mode(ie.mode),
1122                        new_mode: "000000".to_owned(),
1123                        old_oid: ie.oid,
1124                        new_oid: zero_oid(),
1125                        score: None,
1126                    });
1127                }
1128                Err(e) => return Err(Error::Io(e)),
1129            }
1130            continue;
1131        }
1132
1133        // If any parent component of the path is a symlink, the file is effectively
1134        // deleted from the working tree (a symlink replaced a directory).
1135        if has_symlink_in_path(work_tree, path_str_ref) {
1136            result.push(DiffEntry {
1137                status: DiffStatus::Deleted,
1138                old_path: Some(path_str_ref.to_owned()),
1139                new_path: None,
1140                old_mode: format_mode(ie.mode),
1141                new_mode: "000000".to_owned(),
1142                old_oid: ie.oid,
1143                new_oid: zero_oid(),
1144                score: None,
1145            });
1146            continue;
1147        }
1148
1149        match fs::symlink_metadata(&file_path) {
1150            Ok(meta) if meta.is_dir() => {
1151                // A directory exists where the index expects a file. A populated submodule
1152                // checkout (`.git` present) is a blob→gitlink typechange with the submodule HEAD on
1153                // the new side (raw output re-zeros it); otherwise the indexed file is effectively
1154                // deleted. See t4041/t4060 #13.
1155                if file_path.join(".git").exists() {
1156                    let head = read_submodule_head_oid(&file_path).unwrap_or_else(zero_oid);
1157                    let path_owned = path_str_ref.to_owned();
1158                    result.push(DiffEntry {
1159                        status: DiffStatus::TypeChanged,
1160                        old_path: Some(path_owned.clone()),
1161                        new_path: Some(path_owned),
1162                        old_mode: format_mode(ie.mode),
1163                        new_mode: format_mode(0o160000),
1164                        old_oid: ie.oid,
1165                        new_oid: head,
1166                        score: None,
1167                    });
1168                    continue;
1169                }
1170                result.push(DiffEntry {
1171                    status: DiffStatus::Deleted,
1172                    old_path: Some(path_str_ref.to_owned()),
1173                    new_path: None,
1174                    old_mode: format_mode(ie.mode),
1175                    new_mode: String::new(),
1176                    old_oid: ie.oid,
1177                    new_oid: zero_oid(),
1178                    score: None,
1179                });
1180            }
1181            Ok(meta) => {
1182                let worktree_mode = mode_from_metadata(&meta);
1183                let stat_same = stat_matches(ie, &meta);
1184                // Mode-only change: stat still matches the index entry but executable bit differs.
1185                if stat_same && worktree_mode != ie.mode {
1186                    let path_owned = path_str_ref.to_owned();
1187                    result.push(DiffEntry {
1188                        status: DiffStatus::Modified,
1189                        old_path: Some(path_owned.clone()),
1190                        new_path: Some(path_owned),
1191                        old_mode: format_mode(ie.mode),
1192                        new_mode: format_mode(worktree_mode),
1193                        old_oid: ie.oid,
1194                        new_oid: ie.oid,
1195                        score: None,
1196                    });
1197                    continue;
1198                }
1199
1200                // Fast path: unchanged stat + unchanged mode + non-racy timestamp means this entry
1201                // is clean without re-hashing blob data.
1202                if stat_same && worktree_mode == ie.mode && !entry_is_racy(ie, options.index_mtime) {
1203                    continue;
1204                }
1205
1206                // Hash the worktree blob for uncertain/racy entries.
1207                let file_attrs = crlf::get_file_attrs(&attrs, path_str_ref, false, &config);
1208                let worktree_oid = hash_worktree_file(
1209                    odb,
1210                    &file_path,
1211                    &meta,
1212                    &conv,
1213                    &file_attrs,
1214                    path_str_ref,
1215                    Some(ie),
1216                )?;
1217
1218                // If clean conversion disagrees with the index but raw bytes match the
1219                // blob (e.g. mixed line endings committed with autocrlf off), Git reports
1220                // no diff (t0020: touch + git diff --exit-code).
1221                let mut eff_oid = worktree_oid;
1222                if eff_oid != ie.oid {
1223                    if let Ok(raw) = fs::read(&file_path) {
1224                        let raw_oid = Odb::hash_object_data(ObjectKind::Blob, &raw);
1225                        if raw_oid == ie.oid {
1226                            eff_oid = ie.oid;
1227                        }
1228                    }
1229                }
1230
1231                if eff_oid != ie.oid || worktree_mode != ie.mode {
1232                    let path_owned = path_str_ref.to_owned();
1233                    result.push(DiffEntry {
1234                        status: DiffStatus::Modified,
1235                        old_path: Some(path_owned.clone()),
1236                        new_path: Some(path_owned),
1237                        old_mode: format_mode(ie.mode),
1238                        new_mode: format_mode(worktree_mode),
1239                        old_oid: ie.oid,
1240                        new_oid: eff_oid,
1241                    score: None,
1242                    });
1243                }
1244            }
1245            Err(e) if e.kind() == std::io::ErrorKind::NotFound
1246                || e.raw_os_error() == Some(20) /* ENOTDIR */ => {
1247                // File deleted from working tree (or parent replaced by a file)
1248                result.push(DiffEntry {
1249                    status: DiffStatus::Deleted,
1250                    old_path: Some(path_str_ref.to_owned()),
1251                    new_path: None,
1252                    old_mode: format_mode(ie.mode),
1253                    new_mode: "000000".to_owned(),
1254                    old_oid: ie.oid,
1255                    new_oid: zero_oid(),
1256                    score: None,
1257                });
1258            }
1259            Err(e) => return Err(Error::Io(e)),
1260        }
1261    }
1262
1263    for (path, (_, base_entry)) in unmerged_base {
1264        let file_path = work_tree.join(&path);
1265        let wt_meta = match fs::symlink_metadata(&file_path) {
1266            Ok(meta) => Some(meta),
1267            Err(e)
1268                if e.kind() == std::io::ErrorKind::NotFound
1269                    || e.raw_os_error() == Some(20) /* ENOTDIR */ =>
1270            {
1271                None
1272            }
1273            Err(e) => return Err(Error::Io(e)),
1274        };
1275
1276        let new_mode = wt_meta.as_ref().map_or_else(
1277            || "000000".to_owned(),
1278            |meta| format_mode(mode_from_metadata(meta)),
1279        );
1280        result.push(DiffEntry {
1281            status: DiffStatus::Unmerged,
1282            old_path: Some(path.clone()),
1283            new_path: Some(path.clone()),
1284            old_mode: "000000".to_owned(),
1285            new_mode,
1286            old_oid: zero_oid(),
1287            new_oid: zero_oid(),
1288            score: None,
1289        });
1290
1291        if let Some(meta) = wt_meta {
1292            let file_attrs = crlf::get_file_attrs(&attrs, &path, false, &config);
1293            let wt_oid = hash_worktree_file(
1294                odb,
1295                &file_path,
1296                &meta,
1297                &conv,
1298                &file_attrs,
1299                &path,
1300                Some(base_entry),
1301            )?;
1302            let wt_mode = mode_from_metadata(&meta);
1303            if wt_oid != base_entry.oid || wt_mode != base_entry.mode {
1304                result.push(DiffEntry {
1305                    status: DiffStatus::Modified,
1306                    old_path: Some(path.clone()),
1307                    new_path: Some(path),
1308                    old_mode: format_mode(base_entry.mode),
1309                    new_mode: format_mode(wt_mode),
1310                    old_oid: base_entry.oid,
1311                    new_oid: wt_oid,
1312                    score: None,
1313                });
1314            }
1315        }
1316    }
1317
1318    Ok(result)
1319}
1320
1321fn entry_is_racy(ie: &IndexEntry, index_mtime: Option<(u32, u32)>) -> bool {
1322    let Some((index_mtime_sec, index_mtime_nsec)) = index_mtime else {
1323        return false;
1324    };
1325    if index_mtime_sec == 0 {
1326        return false;
1327    }
1328    index_mtime_sec < ie.mtime_sec
1329        || (index_mtime_sec == ie.mtime_sec && index_mtime_nsec <= ie.mtime_nsec)
1330}
1331
1332/// Quick stat check: does the index entry's cached stat data match the file?
1333/// Returns true when the file at `ie`'s path differs from the index entry (mode or blob).
1334///
1335/// Used by commands such as `git mv` to detect "dirty" paths under sparse checkout.
1336/// Symlinks and submodules are compared in a Git-compatible way.
1337///
1338/// `ignore_submodule_untracked` mirrors [`diff_index_to_worktree`]'s same flag for gitlinks.
1339pub fn worktree_differs_from_index_entry(
1340    odb: &Odb,
1341    work_tree: &Path,
1342    ie: &IndexEntry,
1343    ignore_submodule_untracked: bool,
1344) -> Result<bool> {
1345    use crate::config::ConfigSet;
1346    use crate::crlf;
1347
1348    let path_str_ref = std::str::from_utf8(&ie.path).unwrap_or("");
1349    let file_path = work_tree.join(path_str_ref);
1350
1351    if ie.mode == 0o160000 {
1352        let sub_head_oid = read_submodule_head(&file_path);
1353        let ref_matches = match sub_head_oid {
1354            Some(oid) => oid == ie.oid,
1355            None => submodule_worktree_is_unpopulated_placeholder(&file_path),
1356        };
1357        let mut flags = submodule_porcelain_flags(work_tree, path_str_ref, ie.oid);
1358        if ignore_submodule_untracked {
1359            flags.untracked = false;
1360        }
1361        return Ok(!ref_matches || flags.modified || flags.untracked);
1362    }
1363
1364    let meta = match fs::symlink_metadata(&file_path) {
1365        Ok(m) => m,
1366        Err(e)
1367            if e.kind() == std::io::ErrorKind::NotFound
1368                || e.raw_os_error() == Some(20) /* ENOTDIR */ =>
1369        {
1370            return Ok(true);
1371        }
1372        Err(e) => return Err(Error::Io(e)),
1373    };
1374
1375    if meta.is_dir() {
1376        return Ok(true);
1377    }
1378
1379    let worktree_mode = mode_from_metadata(&meta);
1380    if worktree_mode != ie.mode {
1381        return Ok(true);
1382    }
1383
1384    let git_dir = work_tree.join(".git");
1385    let config = ConfigSet::load(Some(&git_dir), true).unwrap_or_else(|_| ConfigSet::new());
1386    let conv = crlf::ConversionConfig::from_config(&config);
1387    let attrs = crlf::load_gitattributes(work_tree);
1388    let file_attrs = crlf::get_file_attrs(&attrs, path_str_ref, false, &config);
1389    let worktree_oid = hash_worktree_file(
1390        odb,
1391        &file_path,
1392        &meta,
1393        &conv,
1394        &file_attrs,
1395        path_str_ref,
1396        Some(ie),
1397    )?;
1398
1399    let mut eff_oid = worktree_oid;
1400    if eff_oid != ie.oid {
1401        if let Ok(raw) = fs::read(&file_path) {
1402            let raw_oid = Odb::hash_object_data(ObjectKind::Blob, &raw);
1403            if raw_oid == ie.oid {
1404                eff_oid = ie.oid;
1405            }
1406        }
1407    }
1408
1409    Ok(eff_oid != ie.oid)
1410}
1411
1412pub fn stat_matches(ie: &IndexEntry, meta: &fs::Metadata) -> bool {
1413    // Compare size
1414    if meta.len() as u32 != ie.size {
1415        return false;
1416    }
1417    #[cfg(unix)]
1418    {
1419        use std::os::unix::fs::MetadataExt;
1420        // Compare mtime (seconds + nanoseconds)
1421        if meta.mtime() as u32 != ie.mtime_sec {
1422            return false;
1423        }
1424        if meta.mtime_nsec() as u32 != ie.mtime_nsec {
1425            return false;
1426        }
1427        // Compare ctime (seconds + nanoseconds)
1428        if meta.ctime() as u32 != ie.ctime_sec {
1429            return false;
1430        }
1431        if meta.ctime_nsec() as u32 != ie.ctime_nsec {
1432            return false;
1433        }
1434        // Compare inode and device
1435        if meta.ino() as u32 != ie.ino {
1436            return false;
1437        }
1438        if meta.dev() as u32 != ie.dev {
1439            return false;
1440        }
1441    }
1442    #[cfg(not(unix))]
1443    {
1444        use std::time::UNIX_EPOCH;
1445        if let Ok(mtime) = meta.modified() {
1446            if let Ok(dur) = mtime.duration_since(UNIX_EPOCH) {
1447                if dur.as_secs() as u32 != ie.mtime_sec {
1448                    return false;
1449                }
1450                if dur.subsec_nanos() != ie.mtime_nsec {
1451                    return false;
1452                }
1453            }
1454        }
1455    }
1456    true
1457}
1458
1459/// Refresh cached stat data for stage-0 file/symlink entries whose worktree content still matches
1460/// the recorded OID but whose on-disk stat went stale.
1461///
1462/// This mirrors Git's `refresh_index` / `refresh_cache_ent`: an entry is only marked clean (stat
1463/// adopted from the worktree) after its content is re-verified against the index OID. A genuinely
1464/// modified entry keeps its stale stat so `diff-files` / `status` continue to report it. Operations
1465/// that rewrite the worktree (`status`, `reset --mixed`, `stash`) call this before writing the
1466/// index so a subsequent `git diff-files` sees refreshed entries as clean.
1467///
1468/// Gitlinks, sparse (`skip_worktree`), `assume_unchanged` and intent-to-add entries are skipped.
1469/// The blob comparison is a raw-content hash, so a CRLF-smudged match is conservatively missed
1470/// (the entry simply stays stat-dirty and is re-hashed next time — never the reverse).
1471///
1472/// `index_mtime` is the on-disk index file's `(mtime_sec, mtime_nsec)` (see
1473/// `entry_is_racy` / Git `is_racy_timestamp`); pass `None` when unknown — racy detection is
1474/// then skipped, which is conservative for tree-built indexes whose zeroed stat never matches.
1475///
1476/// Returns `true` when at least one entry was refreshed or invalidated, so callers can write
1477/// the index opportunistically (Git only persists a refresh that changed something).
1478pub fn refresh_index_stat_content_verified(
1479    index: &mut Index,
1480    work_tree: &Path,
1481    index_mtime: Option<(u32, u32)>,
1482) -> bool {
1483    use crate::index::{MODE_EXECUTABLE, MODE_REGULAR, MODE_SYMLINK};
1484    let mut changed = false;
1485    for ie in &mut index.entries {
1486        if ie.stage() != 0 || ie.skip_worktree() || ie.assume_unchanged() || ie.intent_to_add() {
1487            continue;
1488        }
1489        if ie.mode != MODE_REGULAR && ie.mode != MODE_EXECUTABLE && ie.mode != MODE_SYMLINK {
1490            continue;
1491        }
1492        let Ok(path) = std::str::from_utf8(&ie.path) else {
1493            continue;
1494        };
1495        let abs = work_tree.join(path);
1496        let Ok(meta) = fs::symlink_metadata(&abs) else {
1497            continue;
1498        };
1499        if stat_matches(ie, &meta) {
1500            // Git `ie_match_stat`: a clean stat is trusted without reading the file unless the
1501            // entry is racy (written within the index's own mtime). Only then re-verify content;
1502            // stat can be refreshed from the work tree without matching the indexed blob (e.g.
1503            // after merge stat refresh while local edits remain) — invalidate so diff/status
1504            // re-hash.
1505            if entry_is_racy(ie, index_mtime)
1506                && !worktree_content_matches_index_oid(ie, &abs, &meta)
1507            {
1508                invalidate_index_stat_cache(ie);
1509                changed = true;
1510            }
1511            continue;
1512        }
1513        if !worktree_content_matches_index_oid(ie, &abs, &meta) {
1514            continue;
1515        }
1516        let refreshed = crate::index::entry_from_metadata(&meta, &ie.path, ie.oid, ie.mode);
1517        ie.ctime_sec = refreshed.ctime_sec;
1518        ie.ctime_nsec = refreshed.ctime_nsec;
1519        ie.mtime_sec = refreshed.mtime_sec;
1520        ie.mtime_nsec = refreshed.mtime_nsec;
1521        ie.dev = refreshed.dev;
1522        ie.ino = refreshed.ino;
1523        ie.uid = refreshed.uid;
1524        ie.gid = refreshed.gid;
1525        ie.size = refreshed.size;
1526        changed = true;
1527    }
1528    changed
1529}
1530
1531/// Whether the work tree blob at `abs` matches the index entry OID (raw bytes, no CRLF smudge).
1532fn worktree_content_matches_index_oid(ie: &IndexEntry, abs: &Path, meta: &fs::Metadata) -> bool {
1533    use crate::index::{MODE_EXECUTABLE, MODE_REGULAR, MODE_SYMLINK};
1534    if ie.mode == MODE_SYMLINK {
1535        if !meta.file_type().is_symlink() {
1536            return false;
1537        }
1538        use std::os::unix::ffi::OsStrExt as _;
1539        fs::read_link(abs)
1540            .map(|t| Odb::hash_object_data(ObjectKind::Blob, t.as_os_str().as_bytes()) == ie.oid)
1541            .unwrap_or(false)
1542    } else if ie.mode == MODE_REGULAR || ie.mode == MODE_EXECUTABLE {
1543        if !meta.file_type().is_file() {
1544            return false;
1545        }
1546        fs::read(abs)
1547            .map(|bytes| Odb::hash_object_data(ObjectKind::Blob, &bytes) == ie.oid)
1548            .unwrap_or(false)
1549    } else {
1550        false
1551    }
1552}
1553
1554/// Clear cached stat fields so the next diff/status pass re-reads the work tree.
1555fn invalidate_index_stat_cache(ie: &mut IndexEntry) {
1556    ie.ctime_sec = 0;
1557    ie.ctime_nsec = 0;
1558    ie.mtime_sec = 0;
1559    ie.mtime_nsec = 0;
1560    ie.dev = 0;
1561    ie.ino = 0;
1562    ie.size = 0;
1563}
1564
/// Check if any parent component of `rel_path` (relative to `work_tree`) is a symlink.
///
/// Only the directories leading to the final component are inspected; the final component
/// itself is never checked. Components whose metadata cannot be read are treated as
/// non-symlinks.
fn has_symlink_in_path(work_tree: &Path, rel_path: &str) -> bool {
    // Everything before the last '/' is the chain of parents; no '/' means no parents.
    let Some((parents, _leaf)) = rel_path.rsplit_once('/') else {
        return false;
    };
    let mut probe = work_tree.to_path_buf();
    parents.split('/').any(|part| {
        probe.push(part);
        fs::symlink_metadata(&probe)
            .map(|meta| meta.file_type().is_symlink())
            .unwrap_or(false)
    })
}
1580
1581pub fn hash_worktree_file(
1582    odb: &Odb,
1583    path: &Path,
1584    meta: &fs::Metadata,
1585    conv: &crate::crlf::ConversionConfig,
1586    file_attrs: &crate::crlf::FileAttrs,
1587    rel_path: &str,
1588    index_entry: Option<&IndexEntry>,
1589) -> Result<ObjectId> {
1590    let prior_blob: Option<Vec<u8>> = index_entry
1591        .filter(|e| e.oid != zero_oid())
1592        .and_then(|e| odb.read(&e.oid).ok().map(|o| o.data));
1593    let data = if meta.file_type().is_symlink() {
1594        // For symlinks, hash the target path
1595        let target = fs::read_link(path)?;
1596        target.to_string_lossy().into_owned().into_bytes()
1597    } else if meta.is_dir() {
1598        // `read()` on a directory fails with EISDIR; unmerged paths may leave an empty
1599        // placeholder directory (e.g. t4027 combined submodule conflict).
1600        Vec::new()
1601    } else {
1602        let raw = fs::read(path)?;
1603        // Apply clean conversion (CRLF→LF) so hash matches index blob.
1604        // Do not run safecrlf here: diff/commit use this for hashing and must not print warnings.
1605        let opts = crate::crlf::ConvertToGitOpts {
1606            index_blob: prior_blob.as_deref(),
1607            renormalize: false,
1608            check_safecrlf: false,
1609        };
1610        crate::crlf::convert_to_git_with_opts(&raw, rel_path, conv, file_attrs, opts).unwrap_or(raw)
1611    };
1612
1613    Ok(Odb::hash_object_data(ObjectKind::Blob, &data))
1614}
1615
/// Map filesystem metadata to the Git file mode used in diff output.
///
/// Returns `0o120000` for symlinks. Everything else is reported as a regular file:
/// `0o100755` when any execute permission bit is set (Unix only), `0o100644` otherwise.
pub fn mode_from_metadata(meta: &fs::Metadata) -> u32 {
    if meta.file_type().is_symlink() {
        return 0o120000;
    }

    // Permission bits are only available through the Unix metadata extension.
    #[cfg(unix)]
    let executable = meta.mode() & 0o111 != 0;
    #[cfg(not(unix))]
    let executable = false;

    if executable {
        0o100755
    } else {
        0o100644
    }
}
1630
/// Compare a tree against the working tree.
///
/// Shows changes from `tree_oid` to the current working directory state.
/// Files tracked in the index but not in the tree are shown as Added when they exist
/// in the working tree.
/// Files in the tree but missing from the working tree are shown as Deleted.
///
/// Paths considered are the union of the flattened tree, the stage-0 index entries and
/// the paths that only have unmerged (stage 1–3) index entries. Per path:
///
/// - stage-0 entries flagged skip-worktree are ignored;
/// - gitlinks are compared by submodule HEAD rather than by file content, and only when
///   the tree has an entry for the path;
/// - unmerged-only paths are reported as Modified when both the tree entry and the
///   worktree file exist and differ in content or mode;
/// - a path present in the tree and the worktree but without a stage-0 index entry
///   produces no output.
///
/// The returned entries are sorted by path.
///
/// # Parameters
///
/// - `odb` — object database.
/// - `tree_oid` — the tree to compare against (`None` for empty tree).
/// - `work_tree` — path to the working tree root; configuration is loaded from
///   `<work_tree>/.git` and gitattributes from `work_tree`.
/// - `index` — current index (used to discover new tracked files not in tree).
///
/// # Errors
///
/// Returns errors from ODB reads or I/O.
pub fn diff_tree_to_worktree(
    odb: &Odb,
    tree_oid: Option<&ObjectId>,
    work_tree: &Path,
    index: &Index,
) -> Result<Vec<DiffEntry>> {
    use crate::config::ConfigSet;
    use crate::crlf;

    // Conversion settings come from the repository config; if loading fails, fall back
    // to `ConfigSet::new()` instead of aborting the diff.
    let git_dir = work_tree.join(".git");
    let config = ConfigSet::load(Some(&git_dir), true).unwrap_or_else(|_| ConfigSet::new());
    let conv = crlf::ConversionConfig::from_config(&config);
    let attrs = crlf::load_gitattributes(work_tree);

    // Flatten the tree into a list of entries, then index that list by path.
    let tree_flat = match tree_oid {
        Some(oid) => flatten_tree(odb, oid, "")?,
        None => Vec::new(),
    };
    let tree_map: std::collections::BTreeMap<String, &FlatEntry> =
        tree_flat.iter().map(|e| (e.path.clone(), e)).collect();

    // Build index lookup: path → &IndexEntry (stage 0 only)
    let mut index_entries: std::collections::BTreeMap<&[u8], &IndexEntry> =
        std::collections::BTreeMap::new();
    let mut index_paths: std::collections::BTreeSet<String> = std::collections::BTreeSet::new();
    let mut stage0_paths: std::collections::BTreeSet<Vec<u8>> = std::collections::BTreeSet::new();
    for ie in &index.entries {
        if ie.stage() != 0 {
            continue;
        }
        // Paths are stored as bytes in the index; the string form is a lossy conversion.
        let path = String::from_utf8_lossy(&ie.path).to_string();
        index_entries.insert(&ie.path, ie);
        index_paths.insert(path);
        stage0_paths.insert(ie.path.clone());
    }

    // Paths with only unmerged stages (1–3) and no stage 0 — `git diff <rev>` must still list them
    // so combined `diff --cc` conflict hunks can be emitted (`t4108-apply-threeway`).
    let mut unmerged_only_paths: std::collections::BTreeSet<String> =
        std::collections::BTreeSet::new();
    for ie in &index.entries {
        if !(1..=3).contains(&ie.stage()) {
            continue;
        }
        if stage0_paths.contains(&ie.path) {
            continue;
        }
        unmerged_only_paths.insert(String::from_utf8_lossy(&ie.path).into_owned());
    }

    // Union of tree paths + index paths
    let mut all_paths: std::collections::BTreeSet<String> = std::collections::BTreeSet::new();
    all_paths.extend(tree_map.keys().cloned());
    all_paths.extend(index_paths.iter().cloned());
    all_paths.extend(unmerged_only_paths.iter().cloned());

    let mut result = Vec::new();

    for path in &all_paths {
        if index_entries
            .get(path.as_bytes())
            .is_some_and(|ie| ie.skip_worktree())
        {
            // Sparse checkout: `git diff <rev>` does not report tree↔worktree drift for
            // skip-worktree paths (they are outside the sparse cone). Matches t7012 stash flow.
            continue;
        }

        let tree_entry = tree_map.get(path.as_str());

        // Gitlink entries (submodules) — compare HEAD commit, not file content.
        let is_gitlink = tree_entry.is_some_and(|te| te.mode == 0o160000)
            || index_entries
                .get(path.as_bytes())
                .is_some_and(|ie| ie.mode == 0o160000);
        if is_gitlink {
            // A gitlink that exists only in the index (no tree entry) is not reported.
            if let Some(te) = tree_entry {
                let sub_dir = work_tree.join(path);
                let sub_head = read_submodule_head_oid(&sub_dir);
                let index_oid = index_entries
                    .get(path.as_bytes())
                    .filter(|ie| ie.mode == 0o160000)
                    .map(|ie| ie.oid);
                let index_matches_tree = index_oid.is_some_and(|oid| oid == te.oid);
                let head_differs = sub_head.as_ref() != Some(&te.oid);
                // The dirty-worktree probe only runs when index, tree and submodule HEAD
                // all agree (short-circuit order of the `&&` chain).
                let dirty_while_aligned = index_matches_tree
                    && !head_differs
                    && submodule_has_dirty_worktree_for_super_diff(work_tree, path, &te.oid);
                if head_differs || dirty_while_aligned {
                    // Raw `git diff <tree>` lines use a null OID on the worktree side when the
                    // checked-out submodule HEAD differs from the tree's gitlink; patch output still
                    // resolves the real commit from the submodule directory.
                    let new_oid = if head_differs { zero_oid() } else { te.oid };
                    result.push(DiffEntry {
                        status: DiffStatus::Modified,
                        old_path: Some(path.clone()),
                        new_path: Some(path.clone()),
                        old_mode: format_mode(te.mode),
                        new_mode: format_mode(te.mode),
                        old_oid: te.oid,
                        new_oid,
                        score: None,
                    });
                }
            }
            continue;
        }

        let file_path = work_tree.join(path);

        // `symlink_metadata` does not follow a final symlink. A missing file is an
        // expected state (`None`); any other I/O error aborts the whole diff.
        let wt_meta = match fs::symlink_metadata(&file_path) {
            Ok(m) => Some(m),
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => None,
            Err(e) => return Err(Error::Io(e)),
        };

        if unmerged_only_paths.contains(path) {
            // Conflicted path without a stage-0 entry: compare tree vs worktree directly,
            // hashing without a prior index blob. Nothing is reported when either side
            // is missing.
            if let (Some(te), Some(meta)) = (tree_entry, wt_meta.as_ref()) {
                let file_attrs = crlf::get_file_attrs(&attrs, path, false, &config);
                let wt_oid =
                    hash_worktree_file(odb, &file_path, meta, &conv, &file_attrs, path, None)?;
                let wt_mode = mode_from_metadata(meta);
                if wt_oid != te.oid || wt_mode != te.mode {
                    result.push(DiffEntry {
                        status: DiffStatus::Modified,
                        old_path: Some(path.clone()),
                        new_path: Some(path.clone()),
                        old_mode: format_mode(te.mode),
                        new_mode: format_mode(wt_mode),
                        old_oid: te.oid,
                        new_oid: wt_oid,
                        score: None,
                    });
                }
            }
            continue;
        }

        match (tree_entry, wt_meta) {
            (Some(te), Some(ref meta)) => {
                let wt_mode = mode_from_metadata(meta);
                // In the tree and on disk but without a stage-0 index entry: not reported.
                let Some(ie) = index_entries.get(path.as_bytes()) else {
                    continue;
                };

                let index_matches_tree = ie.oid == te.oid && ie.mode == te.mode;

                // Fully clean: index matches `HEAD`, worktree matches index, stat cache fresh.
                if index_matches_tree && wt_mode == te.mode && stat_matches(ie, meta) {
                    continue;
                }

                let file_attrs = crlf::get_file_attrs(&attrs, path, false, &config);
                let idx_ent = index_entries.get(path.as_bytes()).copied();

                // Staged mode (same blob as `HEAD`, different mode recorded in the index).
                // The worktree content is not hashed on this branch; the entry reports the
                // index mode with the tree's OID on both sides.
                if ie.oid == te.oid && ie.mode != te.mode {
                    result.push(DiffEntry {
                        status: DiffStatus::Modified,
                        old_path: Some(path.clone()),
                        new_path: Some(path.clone()),
                        old_mode: format_mode(te.mode),
                        new_mode: format_mode(ie.mode),
                        old_oid: te.oid,
                        new_oid: te.oid,
                        score: None,
                    });
                    continue;
                }

                // Index still matches `HEAD`: only unstaged worktree drift (content and/or
                // worktree-only exec bit when `update-index` was not run — t4049 harness).
                if index_matches_tree {
                    let wt_oid = hash_worktree_file(
                        odb,
                        &file_path,
                        meta,
                        &conv,
                        &file_attrs,
                        path,
                        idx_ent,
                    )?;
                    // If the converted content hashes differently, retry with the raw
                    // on-disk bytes: a raw match against the tree blob counts as unchanged.
                    let mut eff_oid = wt_oid;
                    if eff_oid != te.oid {
                        if let Ok(raw) = fs::read(&file_path) {
                            let raw_oid = Odb::hash_object_data(ObjectKind::Blob, &raw);
                            if raw_oid == te.oid {
                                eff_oid = te.oid;
                            }
                        }
                    }
                    if eff_oid != te.oid {
                        // Content changed (the mode may have changed as well).
                        result.push(DiffEntry {
                            status: DiffStatus::Modified,
                            old_path: Some(path.clone()),
                            new_path: Some(path.clone()),
                            old_mode: format_mode(te.mode),
                            new_mode: format_mode(wt_mode),
                            old_oid: te.oid,
                            new_oid: eff_oid,
                            score: None,
                        });
                    } else if wt_mode != te.mode {
                        // Same content, mode-only change in the worktree.
                        result.push(DiffEntry {
                            status: DiffStatus::Modified,
                            old_path: Some(path.clone()),
                            new_path: Some(path.clone()),
                            old_mode: format_mode(te.mode),
                            new_mode: format_mode(wt_mode),
                            old_oid: te.oid,
                            new_oid: te.oid,
                            score: None,
                        });
                    }
                    continue;
                }

                // Staged content (and possibly mode): `git diff <rev>` is tree vs working tree.
                let wt_oid =
                    hash_worktree_file(odb, &file_path, meta, &conv, &file_attrs, path, idx_ent)?;
                // Same raw-bytes fallback as in the unstaged branch.
                let mut eff_oid = wt_oid;
                if eff_oid != te.oid {
                    if let Ok(raw) = fs::read(&file_path) {
                        let raw_oid = Odb::hash_object_data(ObjectKind::Blob, &raw);
                        if raw_oid == te.oid {
                            eff_oid = te.oid;
                        }
                    }
                }
                if eff_oid != te.oid || wt_mode != te.mode {
                    result.push(DiffEntry {
                        status: DiffStatus::Modified,
                        old_path: Some(path.clone()),
                        new_path: Some(path.clone()),
                        old_mode: format_mode(te.mode),
                        new_mode: format_mode(wt_mode),
                        old_oid: te.oid,
                        new_oid: eff_oid,
                        score: None,
                    });
                }
            }
            (Some(te), None) => {
                // In tree but missing from worktree
                result.push(DiffEntry {
                    status: DiffStatus::Deleted,
                    old_path: Some(path.clone()),
                    new_path: None,
                    old_mode: format_mode(te.mode),
                    new_mode: "000000".to_owned(),
                    old_oid: te.oid,
                    new_oid: zero_oid(),
                    score: None,
                });
            }
            (None, Some(ref meta)) => {
                // In index but not in tree, and exists in worktree
                let file_attrs = crlf::get_file_attrs(&attrs, path, false, &config);
                let wt_oid = hash_worktree_file(
                    odb,
                    &file_path,
                    meta,
                    &conv,
                    &file_attrs,
                    path,
                    index_entries.get(path.as_bytes()).copied(),
                )?;
                let wt_mode = mode_from_metadata(meta);
                result.push(DiffEntry {
                    status: DiffStatus::Added,
                    old_path: None,
                    new_path: Some(path.clone()),
                    old_mode: "000000".to_owned(),
                    new_mode: format_mode(wt_mode),
                    old_oid: zero_oid(),
                    new_oid: wt_oid,
                    score: None,
                });
            }
            (None, None) => {
                // Tracked in index but neither in tree nor worktree — skip
            }
        }
    }

    // Order the output by each entry's `path()`.
    result.sort_by(|a, b| a.path().cmp(b.path()));
    Ok(result)
}
1936
1937// ── Rename detection ────────────────────────────────────────────────
1938
1939fn read_added_entry_bytes(
1940    odb: &Odb,
1941    entry: &DiffEntry,
1942    work_root: Option<&Path>,
1943) -> Option<Vec<u8>> {
1944    if entry.new_oid != zero_oid() {
1945        return odb.read(&entry.new_oid).ok().map(|obj| obj.data);
1946    }
1947    let path = entry.new_path.as_deref()?;
1948    let root = work_root?;
1949    fs::read(root.join(path)).ok()
1950}
1951
1952fn modified_as_copy_from_sources(
1953    odb: &Odb,
1954    work_root: Option<&Path>,
1955    e: &DiffEntry,
1956    threshold: u32,
1957    sources: &[(String, ObjectId, bool)],
1958    source_contents: &[Option<Vec<u8>>],
1959    source_tree_entries: &[(String, String, ObjectId)],
1960) -> Option<DiffEntry> {
1961    fn regular_file_mode(mode: &str) -> bool {
1962        mode == "100644" || mode == "100755"
1963    }
1964
1965    if e.status != DiffStatus::Modified || !regular_file_mode(&e.new_mode) {
1966        return None;
1967    }
1968    let new_data = read_added_entry_bytes(odb, e, work_root)?;
1969    let new_oid_eff = if e.new_oid != zero_oid() {
1970        e.new_oid
1971    } else {
1972        Odb::hash_object_data(ObjectKind::Blob, &new_data)
1973    };
1974
1975    let mut best: Option<(usize, u32)> = None;
1976    for (si, (src_path, src_oid, is_deleted)) in sources.iter().enumerate() {
1977        if *is_deleted {
1978            continue;
1979        }
1980        if e.new_path.as_deref() == Some(src_path.as_str()) {
1981            continue;
1982        }
1983        let src_mode_str = source_tree_entries
1984            .iter()
1985            .find(|(p, _, _)| p == src_path)
1986            .map(|(_, m, _)| m.as_str())
1987            .unwrap_or("100644");
1988        if !regular_file_mode(src_mode_str) {
1989            continue;
1990        }
1991
1992        let score = if *src_oid == new_oid_eff {
1993            100
1994        } else {
1995            match (&source_contents[si], Some(new_data.as_slice())) {
1996                (Some(old_data), Some(nd)) => compute_similarity(old_data, nd),
1997                _ => 0,
1998            }
1999        };
2000        if score >= threshold {
2001            let replace = match best {
2002                None => true,
2003                Some((_, s)) => score > s,
2004            };
2005            if replace {
2006                best = Some((si, score));
2007            }
2008        }
2009    }
2010
2011    let (si, score) = best?;
2012    let (src_path, src_oid, _) = &sources[si];
2013    let src_mode = source_tree_entries
2014        .iter()
2015        .find(|(p, _, _)| p == src_path)
2016        .map(|(_, m, _)| m.clone())
2017        .unwrap_or_else(|| e.old_mode.clone());
2018
2019    Some(DiffEntry {
2020        status: DiffStatus::Copied,
2021        old_path: Some(src_path.clone()),
2022        new_path: e.new_path.clone(),
2023        old_mode: src_mode,
2024        new_mode: e.new_mode.clone(),
2025        old_oid: *src_oid,
2026        new_oid: e.new_oid,
2027        score: Some(score),
2028    })
2029}
2030
2031/// Detect renames by pairing Deleted and Added entries with similar content.
2032///
2033/// `threshold` is the minimum similarity percentage (0–100) for a pair to
2034/// be considered a rename (Git's default is 50%).  The function reads blob
2035/// content from the ODB to compute a line-level similarity score.
2036///
2037/// Exact-OID matches are always 100% similar regardless of content.
2038///
2039/// When `work_root` is set, added entries whose `new_oid` is the zero placeholder (as in
2040/// uncached `diff-index` when the work tree diverged from the index) load content from disk
2041/// under that root instead of the object database.
2042pub fn detect_renames(
2043    odb: &Odb,
2044    work_root: Option<&Path>,
2045    entries: Vec<DiffEntry>,
2046    threshold: u32,
2047) -> Vec<DiffEntry> {
2048    // Split entries into deleted, added, and others.
2049    let mut deleted: Vec<DiffEntry> = Vec::new();
2050    let mut added: Vec<DiffEntry> = Vec::new();
2051    let mut others: Vec<DiffEntry> = Vec::new();
2052
2053    for entry in entries {
2054        match entry.status {
2055            DiffStatus::Deleted => deleted.push(entry),
2056            DiffStatus::Added => added.push(entry),
2057            _ => others.push(entry),
2058        }
2059    }
2060
2061    if deleted.is_empty() || added.is_empty() {
2062        // Nothing to pair — return original order.
2063        let mut result = others;
2064        result.extend(deleted);
2065        result.extend(added);
2066        result.sort_by(|a, b| a.path().cmp(b.path()));
2067        return result;
2068    }
2069
2070    // Read content for all deleted blobs.
2071    let deleted_contents: Vec<Option<Vec<u8>>> = deleted
2072        .iter()
2073        .map(|d| odb.read(&d.old_oid).ok().map(|obj| obj.data))
2074        .collect();
2075
2076    // Read content for all added blobs.
2077    let added_contents: Vec<Option<Vec<u8>>> = added
2078        .iter()
2079        .map(|a| read_added_entry_bytes(odb, a, work_root))
2080        .collect();
2081
2082    // Build a matrix of similarity scores and find the best pairings.
2083    // We use a greedy approach: pick the highest-scoring pair first.
2084    let mut scores: Vec<(u32, usize, usize)> = Vec::new();
2085
2086    fn is_regularish_mode(mode: &str) -> bool {
2087        mode == "100644" || mode == "100755"
2088    }
2089
2090    fn same_path_same_blob(del: &DiffEntry, add: &DiffEntry) -> bool {
2091        del.old_path == add.new_path && del.old_oid == add.new_oid && del.old_mode == add.new_mode
2092    }
2093
2094    for (di, del) in deleted.iter().enumerate() {
2095        for (ai, add) in added.iter().enumerate() {
2096            // Exact OID match → 100%
2097            if del.old_oid == add.new_oid {
2098                scores.push((100, di, ai));
2099                continue;
2100            }
2101
2102            // Do not use line similarity across file types (e.g. regular ↔ symlink); Git keeps these
2103            // as separate changes (`t4008-diff-break-rewrite` #7).
2104            if !is_regularish_mode(&del.old_mode) || !is_regularish_mode(&add.new_mode) {
2105                continue;
2106            }
2107
2108            let score = match (&deleted_contents[di], &added_contents[ai]) {
2109                (Some(old_data), Some(new_data)) => compute_similarity(old_data, new_data),
2110                _ => 0,
2111            };
2112
2113            if score >= threshold {
2114                scores.push((score, di, ai));
2115            }
2116        }
2117    }
2118
2119    // Sort: prefer real path-changing pairs before same-path no-op pairs, then
2120    // same-basename pairs, then by score descending.
2121    // This matches Git's behavior where basename matches are checked first.
2122    scores.sort_by(|a, b| {
2123        let a_noop = same_path_same_blob(&deleted[a.1], &added[a.2]);
2124        let b_noop = same_path_same_blob(&deleted[b.1], &added[b.2]);
2125        let a_same = same_basename(&deleted[a.1], &added[a.2]);
2126        let b_same = same_basename(&deleted[b.1], &added[b.2]);
2127        a_noop
2128            .cmp(&b_noop)
2129            .then_with(|| b_same.cmp(&a_same))
2130            .then_with(|| b.0.cmp(&a.0))
2131    });
2132
2133    let mut used_deleted = vec![false; deleted.len()];
2134    let mut used_added = vec![false; added.len()];
2135    let mut renames: Vec<DiffEntry> = Vec::new();
2136
2137    for (score, di, ai) in &scores {
2138        if used_deleted[*di] || used_added[*ai] {
2139            continue;
2140        }
2141        used_deleted[*di] = true;
2142        used_added[*ai] = true;
2143
2144        let del = &deleted[*di];
2145        let add = &added[*ai];
2146
2147        // A "rename" whose source and destination are the same path with the
2148        // same blob is not a change at all (this arises with pathological
2149        // duplicate tree entries, t4058). Git pairs and then drops it, leaving
2150        // no diff entry; mirror that by skipping emission.
2151        if same_path_same_blob(del, add) {
2152            continue;
2153        }
2154
2155        renames.push(DiffEntry {
2156            status: DiffStatus::Renamed,
2157            old_path: del.old_path.clone(),
2158            new_path: add.new_path.clone(),
2159            old_mode: del.old_mode.clone(),
2160            new_mode: add.new_mode.clone(),
2161            old_oid: del.old_oid,
2162            new_oid: add.new_oid,
2163            score: Some(*score),
2164        });
2165    }
2166
2167    // Collect unmatched entries.
2168    let mut result = others;
2169    result.extend(renames);
2170    for (i, entry) in deleted.into_iter().enumerate() {
2171        if !used_deleted[i] {
2172            result.push(entry);
2173        }
2174    }
2175    for (i, entry) in added.into_iter().enumerate() {
2176        if !used_added[i] {
2177            result.push(entry);
2178        }
2179    }
2180
2181    result.sort_by(|a, b| a.path().cmp(b.path()));
2182    result
2183}
2184
2185/// Detect copies among diff entries.
2186///
2187/// This first runs rename detection (pairing Deleted+Added), then for any
2188/// remaining Added entries, looks for copy sources.
2189///
2190/// - `find_copies_harder` = false: only Modified entries are copy source candidates.
2191/// - `find_copies_harder` = true: also examine unmodified files from `source_tree_entries`.
2192///
2193/// `source_tree_entries` should be a list of (path, mode, oid) from the source tree;
2194/// used when `find_copies_harder` is true to consider unmodified files as copy sources.
2195pub fn detect_copies(
2196    odb: &Odb,
2197    work_root: Option<&Path>,
2198    entries: Vec<DiffEntry>,
2199    threshold: u32,
2200    find_copies_harder: bool,
2201    source_tree_entries: &[(String, String, ObjectId)],
2202) -> Vec<DiffEntry> {
2203    use std::collections::{HashMap, HashSet};
2204
2205    // Separate entries by status.
2206    let mut deleted: Vec<DiffEntry> = Vec::new();
2207    let mut added: Vec<DiffEntry> = Vec::new();
2208    let mut others: Vec<DiffEntry> = Vec::new();
2209
2210    for entry in entries {
2211        match entry.status {
2212            DiffStatus::Deleted => deleted.push(entry),
2213            DiffStatus::Added => added.push(entry),
2214            _ => others.push(entry),
2215        }
2216    }
2217
2218    // Build source candidates: deleted files, modified files, and optionally tree entries.
2219    // Track which sources are from deleted files (can become renames).
2220    let mut sources: Vec<(String, ObjectId, bool)> = Vec::new(); // (path, oid, is_deleted)
2221    let mut deleted_source_idx: HashMap<String, usize> = HashMap::new();
2222
2223    for entry in &deleted {
2224        if let Some(ref path) = entry.old_path {
2225            deleted_source_idx.insert(path.clone(), sources.len());
2226            sources.push((path.clone(), entry.old_oid, true));
2227        }
2228    }
2229
2230    // Modified and type-changed files are candidates for `-C` (e.g. symlink rewrite leaves the
2231    // old blob available as a copy source for another path; see `t4008-diff-break-rewrite`).
2232    for entry in &others {
2233        if matches!(entry.status, DiffStatus::Modified | DiffStatus::TypeChanged) {
2234            if let Some(ref old_path) = entry.old_path {
2235                if !sources.iter().any(|(p, _, _)| p == old_path) {
2236                    sources.push((old_path.clone(), entry.old_oid, false));
2237                }
2238            }
2239        }
2240    }
2241
2242    // With find_copies_harder, add all source tree entries.
2243    if find_copies_harder {
2244        for (path, _mode, oid) in source_tree_entries {
2245            if !sources.iter().any(|(p, _, _)| p == path) {
2246                sources.push((path.clone(), *oid, false));
2247            }
2248        }
2249    }
2250
2251    if sources.is_empty() {
2252        let mut result = others;
2253        result.extend(deleted);
2254        result.extend(added);
2255        result.sort_by(|a, b| a.path().cmp(b.path()));
2256        return result;
2257    }
2258
2259    // Read content for sources.
2260    let source_contents: Vec<Option<Vec<u8>>> = sources
2261        .iter()
2262        .map(|(_, oid, _)| odb.read(oid).ok().map(|obj| obj.data))
2263        .collect();
2264
2265    let mut result_entries: Vec<DiffEntry> = Vec::new();
2266    let mut renamed_deleted: HashSet<usize> = HashSet::new();
2267    let mut used_added2 = vec![false; added.len()];
2268
2269    if !added.is_empty() {
2270        // Read content for added blobs.
2271        let added_contents: Vec<Option<Vec<u8>>> = added
2272            .iter()
2273            .map(|a| read_added_entry_bytes(odb, a, work_root))
2274            .collect();
2275
2276        // Build score matrix: (score, source_idx, added_idx)
2277        let mut scores: Vec<(u32, usize, usize)> = Vec::new();
2278        for (si, (src_path, src_oid, _)) in sources.iter().enumerate() {
2279            for (ai, add) in added.iter().enumerate() {
2280                // Never pair a path with itself as copy source (matches Git; avoids
2281                // arbitrary tie-breaking when several sources share the same blob).
2282                if add.new_path.as_deref() == Some(src_path.as_str()) {
2283                    continue;
2284                }
2285                let add_oid = if add.new_oid != zero_oid() {
2286                    add.new_oid
2287                } else if let Some(ref data) = added_contents[ai] {
2288                    Odb::hash_object_data(ObjectKind::Blob, data)
2289                } else {
2290                    zero_oid()
2291                };
2292                if *src_oid == add_oid {
2293                    scores.push((100, si, ai));
2294                    continue;
2295                }
2296                let score = match (&source_contents[si], &added_contents[ai]) {
2297                    (Some(old_data), Some(new_data)) => compute_similarity(old_data, new_data),
2298                    _ => 0,
2299                };
2300                if score >= threshold {
2301                    scores.push((score, si, ai));
2302                }
2303            }
2304        }
2305
2306        // Sort by score descending.
2307        scores.sort_by(|a, b| b.0.cmp(&a.0));
2308
2309        // Build source->added mappings, each added file assigned to best source.
2310        let mut used_added = vec![false; added.len()];
2311        let mut source_to_added: HashMap<usize, Vec<(usize, u32)>> = HashMap::new();
2312        for &(score, si, ai) in &scores {
2313            if used_added[ai] {
2314                continue;
2315            }
2316            used_added[ai] = true;
2317            source_to_added.entry(si).or_default().push((ai, score));
2318        }
2319
2320        // For each deleted source, pick one assignment as Rename, rest as Copy.
2321        for (&si, assignments_for_src) in &source_to_added {
2322            let (_, _, is_deleted) = &sources[si];
2323            if *is_deleted && !assignments_for_src.is_empty() {
2324                // Pick the last one (by path) as the rename target.
2325                // Git tends to pick the rename as the last alphabetically.
2326                let rename_ai = assignments_for_src
2327                    .iter()
2328                    .max_by_key(|(ai, _score)| added[*ai].path().to_string())
2329                    .map(|(ai, _)| *ai);
2330
2331                for &(ai, score) in assignments_for_src {
2332                    let (ref src_path, _, _) = sources[si];
2333                    let add = &added[ai];
2334                    let src_mode = source_tree_entries
2335                        .iter()
2336                        .find(|(p, _, _)| p == src_path)
2337                        .map(|(_, m, _)| m.clone())
2338                        .unwrap_or_else(|| add.old_mode.clone());
2339
2340                    let is_rename = Some(ai) == rename_ai;
2341                    result_entries.push(DiffEntry {
2342                        status: if is_rename {
2343                            DiffStatus::Renamed
2344                        } else {
2345                            DiffStatus::Copied
2346                        },
2347                        old_path: Some(src_path.clone()),
2348                        new_path: add.new_path.clone(),
2349                        old_mode: src_mode,
2350                        new_mode: add.new_mode.clone(),
2351                        old_oid: sources[si].1,
2352                        new_oid: add.new_oid,
2353                        score: Some(score),
2354                    });
2355                    used_added2[ai] = true;
2356                }
2357                renamed_deleted.insert(si);
2358            } else {
2359                // Non-deleted source: all assignments are copies.
2360                for &(ai, score) in assignments_for_src {
2361                    let (ref src_path, _, _) = sources[si];
2362                    let add = &added[ai];
2363                    let src_mode = source_tree_entries
2364                        .iter()
2365                        .find(|(p, _, _)| p == src_path)
2366                        .map(|(_, m, _)| m.clone())
2367                        .unwrap_or_else(|| add.old_mode.clone());
2368
2369                    result_entries.push(DiffEntry {
2370                        status: DiffStatus::Copied,
2371                        old_path: Some(src_path.clone()),
2372                        new_path: add.new_path.clone(),
2373                        old_mode: src_mode,
2374                        new_mode: add.new_mode.clone(),
2375                        old_oid: sources[si].1,
2376                        new_oid: add.new_oid,
2377                        score: Some(score),
2378                    });
2379                    used_added2[ai] = true;
2380                }
2381            }
2382        }
2383    }
2384
2385    // Keep deleted entries that weren't consumed by a rename.
2386    for entry in deleted.into_iter() {
2387        if let Some(ref path) = entry.old_path {
2388            if let Some(&si) = deleted_source_idx.get(path) {
2389                if renamed_deleted.contains(&si) {
2390                    // This deletion was consumed by a rename; skip it.
2391                    continue;
2392                }
2393            }
2394        }
2395        result_entries.push(entry);
2396    }
2397
2398    let mut result = others;
2399    result.extend(result_entries);
2400    // Keep unmatched added entries.
2401    for (i, entry) in added.into_iter().enumerate() {
2402        if !used_added2[i] {
2403            result.push(entry);
2404        }
2405    }
2406
2407    let mut final_result = Vec::with_capacity(result.len());
2408    for e in result {
2409        if let Some(c) = modified_as_copy_from_sources(
2410            odb,
2411            work_root,
2412            &e,
2413            threshold,
2414            &sources,
2415            &source_contents,
2416            source_tree_entries,
2417        ) {
2418            final_result.push(c);
2419        } else {
2420            final_result.push(e);
2421        }
2422    }
2423
2424    final_result.sort_by(|a, b| a.path().cmp(b.path()));
2425    final_result
2426}
2427
2428/// Apply Git-style rename and optional copy detection for index↔worktree diffs.
2429///
2430/// When `copies` is true (Git `diff.renames` / `status.renames` set to `copy`/`copies`),
2431/// runs [`detect_copies`] after rename detection so added files can match unchanged
2432/// paths from `HEAD` (e.g. intent-to-add copies).
2433///
2434/// # Errors
2435///
2436/// Propagates errors from reading the `head_tree` object from `odb`.
2437pub fn status_apply_rename_copy_detection(
2438    odb: &Odb,
2439    unstaged_raw: Vec<DiffEntry>,
2440    threshold: u32,
2441    copies: bool,
2442    head_tree: Option<&ObjectId>,
2443) -> Result<Vec<DiffEntry>> {
2444    let after_renames = detect_renames(odb, None, unstaged_raw, threshold);
2445    if !copies {
2446        return Ok(after_renames);
2447    }
2448    let source_tree_entries: Vec<(String, String, ObjectId)> = match head_tree {
2449        Some(oid) => flatten_tree(odb, oid, "")?
2450            .into_iter()
2451            .map(|e| (e.path, format_mode(e.mode), e.oid))
2452            .collect(),
2453        None => Vec::new(),
2454    };
2455    Ok(detect_copies(
2456        odb,
2457        None,
2458        after_renames,
2459        threshold,
2460        false,
2461        &source_tree_entries,
2462    ))
2463}
2464
/// Render an `old → new` rename using Git's compact brace notation.
///
/// The directory prefix and suffix shared by both paths are factored out and
/// only the differing middle is shown inside `{old => new}`. When nothing is
/// shared the plain `old => new` form is used.
///
/// Examples:
/// - `a/b/c` → `c/b/a` gives `a/b/c => c/b/a`
/// - `c/b/a` → `c/d/e` gives `c/{b/a => d/e}`
/// - `c/d/e` → `d/e` gives `{c/d => d}/e`
/// - `d/e` → `d/f/e` gives `d/{ => f}/e`
pub fn format_rename_path(old: &str, new: &str) -> String {
    // Walks paired bytes while they agree and reports how many bytes were
    // consumed up to and including the last '/' seen (0 when none was seen).
    fn shared_run_up_to_separator<'a>(pairs: impl Iterator<Item = (&'a u8, &'a u8)>) -> usize {
        pairs
            .take_while(|(a, b)| a == b)
            .enumerate()
            .filter(|(_, (a, _))| **a == b'/')
            .map(|(offset, _)| offset + 1)
            .last()
            .unwrap_or(0)
    }

    let old_bytes = old.as_bytes();
    let new_bytes = new.as_bytes();

    // Common prefix, ending just after a '/'.
    let prefix_len = shared_run_up_to_separator(old_bytes.iter().zip(new_bytes));
    // Common suffix, beginning at a '/'.
    let mut suffix_len =
        shared_run_up_to_separator(old_bytes.iter().rev().zip(new_bytes.iter().rev()));

    // Byte offsets at which the suffix begins in each path.
    let mut old_suffix_start = old_bytes.len() - suffix_len;
    let mut new_suffix_start = new_bytes.len() - suffix_len;

    // While the prefix runs past the suffix start in *both* paths, shrink the
    // suffix to its next inner '/' so the longer path regains a middle part.
    while suffix_len > 0 && prefix_len > old_suffix_start && prefix_len > new_suffix_start {
        let next_separator = old_bytes[old_suffix_start..]
            .iter()
            .skip(1)
            .position(|&byte| byte == b'/')
            .map(|found| found + 1);
        match next_separator {
            Some(step) => {
                suffix_len -= step;
                old_suffix_start = old_bytes.len() - suffix_len;
                new_suffix_start = new_bytes.len() - suffix_len;
            }
            None => {
                // No smaller suffix exists: give up on a shared suffix entirely.
                old_suffix_start = old_bytes.len();
                new_suffix_start = new_bytes.len();
                break;
            }
        }
    }

    // If prefix and suffix still overlap in one path, that path's middle is
    // empty and the shared '/' shows up both at the end of the prefix and at
    // the start of the suffix (e.g. "d/{ => f}/e").
    let prefix = &old[..prefix_len];
    let suffix = &old[old_suffix_start..];
    let old_mid = if prefix_len <= old_suffix_start {
        &old[prefix_len..old_suffix_start]
    } else {
        ""
    };
    let new_mid = if prefix_len <= new_suffix_start {
        &new[prefix_len..new_suffix_start]
    } else {
        ""
    };

    if prefix.is_empty() && suffix.is_empty() {
        format!("{old} => {new}")
    } else {
        format!("{prefix}{{{old_mid} => {new_mid}}}{suffix}")
    }
}
2560
2561/// Check if two entries share the same filename (basename).
2562fn same_basename(del: &DiffEntry, add: &DiffEntry) -> bool {
2563    let old = del.old_path.as_deref().unwrap_or("");
2564    let new = add.new_path.as_deref().unwrap_or("");
2565    let old_base = old.rsplit('/').next().unwrap_or(old);
2566    let new_base = new.rsplit('/').next().unwrap_or(new);
2567    old_base == new_base && !old_base.is_empty()
2568}
2569
2570/// Compute a similarity percentage (0–100) between two byte slices.
2571///
2572/// Uses Git's approach: count the bytes that are "shared" (appear in
2573/// equal lines), then compute `score = shared_bytes * 2 * 100 / (src_size + dst_size)`.
2574fn compute_similarity(old: &[u8], new: &[u8]) -> u32 {
2575    // Normalize CRLF → LF before comparing so that files differing
2576    // only in line endings are detected as renames.
2577    let old_norm = crate::crlf::crlf_to_lf(old);
2578    let new_norm = crate::crlf::crlf_to_lf(new);
2579
2580    let src_size = old_norm.len();
2581    let dst_size = new_norm.len();
2582
2583    if src_size == 0 && dst_size == 0 {
2584        return 100;
2585    }
2586    let total = src_size + dst_size;
2587    if total == 0 {
2588        return 100;
2589    }
2590
2591    // Use line-level diff to find shared content, then count bytes.
2592    use similar::{ChangeTag, TextDiff};
2593    let old_str = String::from_utf8_lossy(&old_norm);
2594    let new_str = String::from_utf8_lossy(&new_norm);
2595    let diff = TextDiff::from_lines(&old_str as &str, &new_str as &str);
2596
2597    let mut shared_bytes = 0usize;
2598    for change in diff.iter_all_changes() {
2599        if change.tag() == ChangeTag::Equal {
2600            // Count bytes in the matching line (including newline).
2601            shared_bytes += change.value().len();
2602        }
2603    }
2604
2605    // Git: score = copied * MAX_SCORE / max(src_size, dst_size)
2606    // We normalize to 0-100.
2607    let max_size = src_size.max(dst_size);
2608
2609    ((shared_bytes * 100) / max_size).min(100) as u32
2610}
2611
2612/// Compute rename/copy similarity percentage (0–100) between two byte slices.
2613///
2614/// This uses the same scoring logic as internal rename detection.
2615#[must_use]
2616pub fn rename_similarity_score(old: &[u8], new: &[u8]) -> u32 {
2617    compute_similarity(old, new)
2618}
2619
2620// ── Output formatting ───────────────────────────────────────────────
2621
2622/// Format a diff entry in Git's raw diff format.
2623///
2624/// Example: `:100644 100644 abc1234... def5678... M\tfile.txt`
2625pub fn format_raw(entry: &DiffEntry) -> String {
2626    let path = match entry.status {
2627        DiffStatus::Renamed | DiffStatus::Copied => {
2628            format!(
2629                "{}\t{}",
2630                entry.old_path.as_deref().unwrap_or(""),
2631                entry.new_path.as_deref().unwrap_or("")
2632            )
2633        }
2634        _ => entry.path().to_owned(),
2635    };
2636
2637    let status_str = match (entry.status, entry.score) {
2638        (DiffStatus::Renamed, Some(s)) => format!("R{:03}", s),
2639        (DiffStatus::Copied, Some(s)) => format!("C{:03}", s),
2640        _ => entry.status.letter().to_string(),
2641    };
2642
2643    format!(
2644        ":{} {} {} {} {}\t{}",
2645        entry.old_mode, entry.new_mode, entry.old_oid, entry.new_oid, status_str, path
2646    )
2647}
2648
2649/// Format a diff entry with abbreviated OIDs.
2650pub fn format_raw_abbrev(entry: &DiffEntry, abbrev_len: usize) -> String {
2651    let ellipsis = if std::env::var("GIT_PRINT_SHA1_ELLIPSIS").ok().as_deref() == Some("yes") {
2652        "..."
2653    } else {
2654        ""
2655    };
2656    let old_hex = format!("{}", entry.old_oid);
2657    let new_hex = format!("{}", entry.new_oid);
2658    let old_abbrev = &old_hex[..abbrev_len.min(old_hex.len())];
2659    let new_abbrev = &new_hex[..abbrev_len.min(new_hex.len())];
2660
2661    // Renames/copies carry a similarity score and a `<old>\t<new>` path pair.
2662    let path = match entry.status {
2663        DiffStatus::Renamed | DiffStatus::Copied => format!(
2664            "{}\t{}",
2665            entry.old_path.as_deref().unwrap_or(""),
2666            entry.new_path.as_deref().unwrap_or("")
2667        ),
2668        _ => entry.path().to_owned(),
2669    };
2670    let status_str = match (entry.status, entry.score) {
2671        (DiffStatus::Renamed, Some(s)) => format!("R{s:03}"),
2672        (DiffStatus::Copied, Some(s)) => format!("C{s:03}"),
2673        _ => entry.status.letter().to_string(),
2674    };
2675
2676    format!(
2677        ":{} {} {}{} {}{} {}\t{}",
2678        entry.old_mode,
2679        entry.new_mode,
2680        old_abbrev,
2681        ellipsis,
2682        new_abbrev,
2683        ellipsis,
2684        status_str,
2685        path
2686    )
2687}
2688
2689/// Generate a unified diff patch for two blobs.
2690///
2691/// # Parameters
2692///
2693/// - `old_content` — the old file content (empty for added files).
2694/// - `new_content` — the new file content (empty for deleted files).
2695/// - `old_path` — display path for the old side.
2696/// - `new_path` — display path for the new side.
2697/// - `context_lines` — number of context lines around changes (default: 3).
2698/// - Inter-hunk context defaults to `0` (see [`unified_diff_with_prefix`]).
2699///
2700/// # Returns
2701///
2702/// The unified diff as a string.
2703pub fn unified_diff(
2704    old_content: &str,
2705    new_content: &str,
2706    old_path: &str,
2707    new_path: &str,
2708    context_lines: usize,
2709    indent_heuristic: bool,
2710    quote_path_fully: bool,
2711) -> String {
2712    unified_diff_with_prefix(
2713        old_content,
2714        new_content,
2715        old_path,
2716        new_path,
2717        context_lines,
2718        0,
2719        "a/",
2720        "b/",
2721        indent_heuristic,
2722        quote_path_fully,
2723    )
2724}
2725
2726/// Same as `unified_diff` but with configurable source/destination prefixes.
2727///
2728/// `inter_hunk_context` is Git's `--inter-hunk-context`: adjacent hunks merge when
2729/// the unchanged gap between them is at most `2 * context_lines + inter_hunk_context` lines.
2730#[allow(clippy::too_many_arguments)] // Mirrors Git-style unified diff parameters.
2731pub fn unified_diff_with_prefix(
2732    old_content: &str,
2733    new_content: &str,
2734    old_path: &str,
2735    new_path: &str,
2736    context_lines: usize,
2737    inter_hunk_context: usize,
2738    src_prefix: &str,
2739    dst_prefix: &str,
2740    indent_heuristic: bool,
2741    quote_path_fully: bool,
2742) -> String {
2743    unified_diff_with_prefix_and_funcname(
2744        old_content,
2745        new_content,
2746        old_path,
2747        new_path,
2748        context_lines,
2749        inter_hunk_context,
2750        src_prefix,
2751        dst_prefix,
2752        None,
2753        indent_heuristic,
2754        quote_path_fully,
2755    )
2756}
2757
2758/// Same as [`unified_diff_with_prefix`] with optional custom hunk-header
2759/// function-name matching.
2760#[allow(clippy::too_many_arguments)]
2761pub fn unified_diff_with_prefix_and_funcname(
2762    old_content: &str,
2763    new_content: &str,
2764    old_path: &str,
2765    new_path: &str,
2766    context_lines: usize,
2767    inter_hunk_context: usize,
2768    src_prefix: &str,
2769    dst_prefix: &str,
2770    funcname_matcher: Option<&FuncnameMatcher>,
2771    indent_heuristic: bool,
2772    quote_path_fully: bool,
2773) -> String {
2774    unified_diff_with_prefix_and_funcname_and_algorithm(
2775        old_content,
2776        new_content,
2777        old_path,
2778        new_path,
2779        context_lines,
2780        inter_hunk_context,
2781        src_prefix,
2782        dst_prefix,
2783        funcname_matcher,
2784        similar::Algorithm::Myers,
2785        false,
2786        false,
2787        indent_heuristic,
2788        quote_path_fully,
2789    )
2790}
2791
2792/// Same as [`unified_diff_with_prefix_and_funcname`] but allows callers to
2793/// choose the line diff algorithm used for hunk generation.
2794///
2795/// When `function_context` is true (`git diff -W`), hunks are expanded to
2796/// whole logical functions using the same rules as Git's `XDL_EMIT_FUNCCONTEXT`.
2797#[allow(clippy::too_many_arguments)]
2798pub fn unified_diff_with_prefix_and_funcname_and_algorithm(
2799    old_content: &str,
2800    new_content: &str,
2801    old_path: &str,
2802    new_path: &str,
2803    context_lines: usize,
2804    inter_hunk_context: usize,
2805    src_prefix: &str,
2806    dst_prefix: &str,
2807    funcname_matcher: Option<&FuncnameMatcher>,
2808    algorithm: similar::Algorithm,
2809    function_context: bool,
2810    use_git_histogram: bool,
2811    indent_heuristic: bool,
2812    quote_path_fully: bool,
2813) -> String {
2814    if use_git_histogram {
2815        return unified_diff_histogram_with_prefix_and_funcname(
2816            old_content,
2817            new_content,
2818            old_path,
2819            new_path,
2820            context_lines,
2821            inter_hunk_context,
2822            src_prefix,
2823            dst_prefix,
2824            funcname_matcher,
2825            quote_path_fully,
2826        );
2827    }
2828
2829    if function_context {
2830        return unified_diff_with_function_context(
2831            old_content,
2832            new_content,
2833            old_path,
2834            new_path,
2835            context_lines,
2836            inter_hunk_context,
2837            src_prefix,
2838            dst_prefix,
2839            funcname_matcher,
2840            algorithm,
2841            indent_heuristic,
2842            quote_path_fully,
2843        );
2844    }
2845
2846    use crate::quote_path::format_diff_path_with_prefix;
2847    use similar::{udiff::UnifiedDiffHunk, TextDiff};
2848
2849    let diff = TextDiff::configure()
2850        .algorithm(algorithm)
2851        .diff_lines(old_content, new_content);
2852    let compacted_ops = diff_indent_heuristic::diff_lines_ops_compacted(
2853        old_content,
2854        new_content,
2855        algorithm,
2856        indent_heuristic,
2857    );
2858
2859    let mut output = String::new();
2860    if old_path == "/dev/null" {
2861        output.push_str("--- /dev/null\n");
2862    } else if src_prefix.is_empty() {
2863        // Callers (e.g. `diff-tree`, `diff-index`) may pass a fully formatted token
2864        // (already includes `a/` and any C-style quoting).
2865        output.push_str(&format!("--- {old_path}\n"));
2866    } else {
2867        output.push_str("--- ");
2868        output.push_str(&format_diff_path_with_prefix(
2869            src_prefix,
2870            old_path,
2871            quote_path_fully,
2872        ));
2873        output.push('\n');
2874    }
2875    if new_path == "/dev/null" {
2876        output.push_str("+++ /dev/null\n");
2877    } else if dst_prefix.is_empty() {
2878        output.push_str(&format!("+++ {new_path}\n"));
2879    } else {
2880        output.push_str("+++ ");
2881        output.push_str(&format_diff_path_with_prefix(
2882            dst_prefix,
2883            new_path,
2884            quote_path_fully,
2885        ));
2886        output.push('\n');
2887    }
2888
2889    let old_lines: Vec<&str> = old_content.lines().collect();
2890
2891    // Git's xdiff merges adjacent changes while the gap between them in the old file is at most
2892    // `2 * context_lines + inter_hunk_context` (see `xdl_get_hunk` in xemit.c).
2893    // `similar::group_diff_ops` couples the split threshold and the displayed edge context to a
2894    // single radius (split at `> 2n`), which over-merges when the gap limit is odd
2895    // (t4032: `-U0 --inter-hunk-context=1` with 2 common lines must stay 2 hunks).
2896    let max_common_gap = context_lines
2897        .saturating_mul(2)
2898        .saturating_add(inter_hunk_context);
2899    let op_groups = group_diff_ops_gap(compacted_ops, context_lines, max_common_gap);
2900
2901    for ops in op_groups {
2902        if ops.is_empty() {
2903            continue;
2904        }
2905        let hunk = UnifiedDiffHunk::new(ops, &diff, true);
2906        let hunk_str = format!("{hunk}");
2907        // The similar crate outputs @@ -a,b +c,d @@\n but Git adds
2908        // function context after the closing @@. Extract the hunk header
2909        // and add function context.
2910        if let Some(first_newline) = hunk_str.find('\n') {
2911            let header_line = &hunk_str[..first_newline];
2912            let rest = &hunk_str[first_newline..];
2913
2914            // Parse the old start line from the @@ header
2915            if let Some(func_ctx) =
2916                extract_function_context(header_line, &old_lines, funcname_matcher)
2917            {
2918                output.push_str(header_line);
2919                output.push(' ');
2920                output.push_str(&func_ctx);
2921                output.push_str(rest);
2922            } else {
2923                output.push_str(&hunk_str);
2924            }
2925        } else {
2926            output.push_str(&hunk_str);
2927        }
2928    }
2929
2930    output
2931}
2932
2933/// Group diff ops into hunks like Git's `xdl_get_hunk`: two changes merge into one hunk while
2934/// the run of unchanged lines between them is at most `max_common_gap`
2935/// (`2 * context + inter_hunk_context`), and each hunk keeps at most `context` unchanged lines
2936/// at its edges. Unlike `similar::group_diff_ops`, the split threshold is decoupled from the
2937/// edge context so odd gap limits group exactly like Git.
2938fn group_diff_ops_gap(
2939    mut ops: Vec<similar::DiffOp>,
2940    context: usize,
2941    max_common_gap: usize,
2942) -> Vec<Vec<similar::DiffOp>> {
2943    use similar::DiffOp;
2944    if ops.is_empty() {
2945        return vec![];
2946    }
2947
2948    let mut pending_group = Vec::new();
2949    let mut rv = Vec::new();
2950
2951    if let Some(DiffOp::Equal {
2952        old_index,
2953        new_index,
2954        len,
2955    }) = ops.first_mut()
2956    {
2957        let offset = (*len).saturating_sub(context);
2958        *old_index += offset;
2959        *new_index += offset;
2960        *len -= offset;
2961    }
2962
2963    if let Some(DiffOp::Equal { len, .. }) = ops.last_mut() {
2964        *len -= (*len).saturating_sub(context);
2965    }
2966
2967    for op in ops.into_iter() {
2968        if let DiffOp::Equal {
2969            old_index,
2970            new_index,
2971            len,
2972        } = op
2973        {
2974            // End the current group and start a new one whenever the unchanged
2975            // run is too long to fuse the surrounding changes.
2976            if len > max_common_gap {
2977                pending_group.push(DiffOp::Equal {
2978                    old_index,
2979                    new_index,
2980                    len: context,
2981                });
2982                rv.push(pending_group);
2983                let offset = len.saturating_sub(context);
2984                pending_group = vec![DiffOp::Equal {
2985                    old_index: old_index + offset,
2986                    new_index: new_index + offset,
2987                    len: len - offset,
2988                }];
2989                continue;
2990            }
2991        }
2992        pending_group.push(op);
2993    }
2994
2995    match &pending_group[..] {
2996        &[] | &[similar::DiffOp::Equal { .. }] => {}
2997        _ => rv.push(pending_group),
2998    }
2999
3000    rv
3001}
3002
3003/// `git diff -W`: expand each hunk to include full function bodies (see Git `xemit.c`).
3004fn unified_diff_with_function_context(
3005    old_content: &str,
3006    new_content: &str,
3007    old_path: &str,
3008    new_path: &str,
3009    context_lines: usize,
3010    inter_hunk_context: usize,
3011    src_prefix: &str,
3012    dst_prefix: &str,
3013    funcname_matcher: Option<&FuncnameMatcher>,
3014    algorithm: similar::Algorithm,
3015    indent_heuristic: bool,
3016    quote_path_fully: bool,
3017) -> String {
3018    use crate::quote_path::format_diff_path_with_prefix;
3019    use similar::{group_diff_ops, udiff::UnifiedDiffHunk, TextDiff};
3020
3021    let diff = TextDiff::configure()
3022        .algorithm(algorithm)
3023        .diff_lines(old_content, new_content);
3024
3025    let old_lines: Vec<&str> = old_content.lines().collect();
3026    let new_lines: Vec<&str> = new_content.lines().collect();
3027    let n_old = old_lines.len();
3028    let n_new = new_lines.len();
3029
3030    let group_radius = context_lines
3031        .saturating_mul(2)
3032        .saturating_add(inter_hunk_context);
3033    let all_ops = diff.ops().to_vec();
3034    let op_groups = group_diff_ops(all_ops.clone(), group_radius);
3035
3036    let mut ranges: Vec<(usize, usize, usize, usize)> = Vec::new();
3037
3038    for ops in op_groups {
3039        if ops.is_empty() {
3040            continue;
3041        }
3042        let i1_anchor = func_context_old_anchor(&ops, n_old);
3043        let i1_end = hunk_old_change_end_exclusive(&ops);
3044        let skip_preimage_pull =
3045            append_with_whole_function_added(&ops, n_old, n_new, &new_lines, funcname_matcher);
3046        let hunk = UnifiedDiffHunk::new(ops, &diff, true);
3047        let hunk_str = format!("{hunk}");
3048        let header_line = hunk_str
3049            .lines()
3050            .next()
3051            .unwrap_or("")
3052            .trim_end_matches(['\r', '\n']);
3053        let Some((base_s1, base_e1, _base_s2, _base_e2)) =
3054            parse_unified_hunk_header_ranges(header_line)
3055        else {
3056            continue;
3057        };
3058
3059        let ctx = context_lines;
3060        let (s1, e1, s2, e2) = if skip_preimage_pull {
3061            let s = n_old.saturating_sub(ctx);
3062            let s2 = map_old_line_to_new(&all_ops, s, n_new).min(n_new);
3063            (s, n_old, s2, n_new)
3064        } else {
3065            let mut s1 = base_s1.saturating_sub(ctx);
3066            let mut s2 = map_old_line_to_new(&all_ops, s1, n_new).min(n_new);
3067
3068            let base_pre_s1 = i1_anchor.saturating_sub(ctx);
3069            if base_pre_s1 < s1 {
3070                s1 = base_pre_s1;
3071                s2 = map_old_line_to_new(&all_ops, s1, n_new).min(n_new);
3072            }
3073
3074            let fs1 = expand_func_pre_start(s1, i1_anchor, n_old, &old_lines, funcname_matcher);
3075            if fs1 < s1 {
3076                s1 = fs1;
3077                s2 = map_old_line_to_new(&all_ops, s1, n_new).min(n_new);
3078            }
3079
3080            let mut e1 = (base_e1 + ctx).min(n_old);
3081            let mut e2 = map_old_line_to_new(&all_ops, e1, n_new).min(n_new);
3082            let fe1 = expand_func_post_end(e1, i1_end, n_old, &old_lines, funcname_matcher);
3083            if fe1 > e1 {
3084                e1 = fe1;
3085                e2 = map_old_line_to_new(&all_ops, e1, n_new).min(n_new);
3086            }
3087            (s1, e1, s2, e2)
3088        };
3089
3090        ranges.push((s1, e1, s2, e2));
3091    }
3092
3093    let mut output = String::new();
3094    if old_path == "/dev/null" {
3095        output.push_str("--- /dev/null\n");
3096    } else if src_prefix.is_empty() {
3097        output.push_str(&format!("--- {old_path}\n"));
3098    } else {
3099        output.push_str("--- ");
3100        output.push_str(&format_diff_path_with_prefix(
3101            src_prefix,
3102            old_path,
3103            quote_path_fully,
3104        ));
3105        output.push('\n');
3106    }
3107    if new_path == "/dev/null" {
3108        output.push_str("+++ /dev/null\n");
3109    } else if dst_prefix.is_empty() {
3110        output.push_str(&format!("+++ {new_path}\n"));
3111    } else {
3112        output.push_str("+++ ");
3113        output.push_str(&format_diff_path_with_prefix(
3114            dst_prefix,
3115            new_path,
3116            quote_path_fully,
3117        ));
3118        output.push('\n');
3119    }
3120
3121    for (s1, e1, s2, e2) in ranges {
3122        if s1 >= e1 && s2 >= e2 {
3123            continue;
3124        }
3125        let old_seg =
3126            line_slice_for_diff_with_eof_nl(&old_lines, s1, e1, old_content.ends_with('\n'));
3127        let new_seg =
3128            line_slice_for_diff_with_eof_nl(&new_lines, s2, e2, new_content.ends_with('\n'));
3129        let inner_ctx = old_seg.lines().count().max(new_seg.lines().count()).max(1);
3130        let piece = unified_diff_with_prefix_and_funcname_and_algorithm(
3131            &old_seg,
3132            &new_seg,
3133            old_path,
3134            new_path,
3135            inner_ctx,
3136            0,
3137            src_prefix,
3138            dst_prefix,
3139            funcname_matcher,
3140            algorithm,
3141            false,
3142            false,
3143            indent_heuristic,
3144            quote_path_fully,
3145        );
3146        let shifted = shift_unified_hunk_headers_to_full_file(&piece, s1, s2);
3147        let with_func =
3148            enrich_unified_hunk_headers_funcname(&shifted, &old_lines, funcname_matcher);
3149        for line in with_func.lines() {
3150            if line.starts_with("--- ") || line.starts_with("+++ ") {
3151                continue;
3152            }
3153            output.push_str(line);
3154            output.push('\n');
3155        }
3156    }
3157
3158    output
3159}
3160
3161/// `piece` is a unified diff for a slice of the file; hunk headers use 1-based
3162/// coordinates relative to that slice. Shift them by `delta_old` / `delta_new`
3163/// (0-based offsets of the slice in the full file) so the combined patch applies
3164/// to the whole file.
3165fn shift_unified_hunk_headers_to_full_file(
3166    patch: &str,
3167    delta_old: usize,
3168    delta_new: usize,
3169) -> String {
3170    if delta_old == 0 && delta_new == 0 {
3171        return patch.to_owned();
3172    }
3173    let mut out = String::with_capacity(patch.len());
3174    for line in patch.lines() {
3175        if let Some(shifted) = shift_one_unified_hunk_header(line, delta_old, delta_new) {
3176            out.push_str(&shifted);
3177        } else {
3178            out.push_str(line);
3179        }
3180        out.push('\n');
3181    }
3182    out
3183}
3184
3185fn shift_one_unified_hunk_header(line: &str, delta_old: usize, delta_new: usize) -> Option<String> {
3186    let rest = line.strip_prefix("@@ ")?;
3187    let (old_chunk, after_plus) = rest.split_once(" +")?;
3188    let old_spec = old_chunk.strip_prefix('-')?;
3189    let (new_spec, suffix) = after_plus.split_once(" @@")?;
3190    let shifted_old = shift_unified_range_spec(old_spec, delta_old)?;
3191    let shifted_new = shift_unified_range_spec(new_spec, delta_new)?;
3192    Some(format!("@@ -{shifted_old} +{shifted_new} @@{suffix}"))
3193}
3194
/// Adds `delta` to the start line of a unified-diff range spec.
///
/// Accepts either `start,count` or a bare `start` (surrounding whitespace is
/// ignored) and returns the spec in the same shape with the start shifted,
/// saturating at `usize::MAX`. The count is left untouched. Returns `None` if
/// any number fails to parse.
fn shift_unified_range_spec(spec: &str, delta: usize) -> Option<String> {
    let spec = spec.trim();
    match spec.split_once(',') {
        Some((start_text, count_text)) => {
            let start = start_text.parse::<usize>().ok()?;
            let count = count_text.parse::<usize>().ok()?;
            Some(format!("{},{count}", start.saturating_add(delta)))
        }
        None => spec
            .parse::<usize>()
            .ok()
            .map(|start| start.saturating_add(delta).to_string()),
    }
}
3206
3207/// Re-attach `@@ ... @@ <funcname>` using full-file line indices (inner diffs use slices).
3208fn enrich_unified_hunk_headers_funcname(
3209    patch: &str,
3210    full_old_lines: &[&str],
3211    funcname_matcher: Option<&FuncnameMatcher>,
3212) -> String {
3213    let mut out = String::with_capacity(patch.len());
3214    for line in patch.lines() {
3215        if let Some(fixed) = enrich_one_hunk_header_funcname(line, full_old_lines, funcname_matcher)
3216        {
3217            out.push_str(&fixed);
3218        } else {
3219            out.push_str(line);
3220        }
3221        out.push('\n');
3222    }
3223    out
3224}
3225
3226fn enrich_one_hunk_header_funcname(
3227    line: &str,
3228    full_old_lines: &[&str],
3229    funcname_matcher: Option<&FuncnameMatcher>,
3230) -> Option<String> {
3231    let after_at = line.strip_prefix("@@ ")?;
3232    let idx = after_at.find(" @@")?;
3233    let mid = after_at[..idx].trim();
3234    let tail = after_at[idx + 3..].trim_start();
3235    let header_for_parse = format!("@@ {mid} @@");
3236    let func = extract_function_context(&header_for_parse, full_old_lines, funcname_matcher);
3237    Some(if let Some(f) = func {
3238        format!("@@ {mid} @@ {f}")
3239    } else if !tail.is_empty() {
3240        format!("@@ {mid} @@ {tail}")
3241    } else {
3242        format!("@@ {mid} @@")
3243    })
3244}
3245
/// Joins `lines[start..end]` back into text suitable for feeding to a line diff.
///
/// `lines` holds terminator-free lines. Every line in the slice is terminated with `\n`,
/// except that the very last line of the file (`end == lines.len()`) only gets one when
/// `full_file_ends_with_newline` is true. An empty or inverted range yields an empty
/// string.
///
/// # Panics
///
/// Panics if `end > lines.len()`.
fn line_slice_for_diff_with_eof_nl(
    lines: &[&str],
    start: usize,
    end: usize,
    full_file_ends_with_newline: bool,
) -> String {
    if start >= end {
        return String::new();
    }
    let mut text = lines[start..end].join("\n");
    let slice_reaches_eof = end == lines.len();
    // The terminator is appended unconditionally: when the slice ends in a blank line the
    // joined text already ends with the *separator* `\n`, and treating that as the
    // terminator would silently drop the blank line.
    if !slice_reaches_eof || full_file_ends_with_newline {
        text.push('\n');
    }
    text
}
3267
3268/// Map a 0-based old line index to the corresponding 0-based new line index using the full-file
3269/// diff ops (Git aligns context across deletions/insertions).
3270fn map_old_line_to_new(ops: &[similar::DiffOp], old_line: usize, n_new: usize) -> usize {
3271    use similar::DiffOp;
3272    let mut n = 0usize;
3273    for op in ops {
3274        match *op {
3275            DiffOp::Equal {
3276                old_index,
3277                new_index,
3278                len,
3279            } => {
3280                if old_index + len <= old_line {
3281                    n = new_index + len;
3282                    continue;
3283                }
3284                if old_index < old_line {
3285                    let take = old_line - old_index;
3286                    return (new_index + take).min(n_new);
3287                }
3288                return new_index.min(n_new);
3289            }
3290            DiffOp::Delete {
3291                old_index,
3292                old_len,
3293                new_index,
3294            } => {
3295                if old_index + old_len <= old_line {
3296                    n = new_index;
3297                    continue;
3298                }
3299                if old_index < old_line {
3300                    return new_index.min(n_new);
3301                }
3302            }
3303            DiffOp::Insert {
3304                old_index,
3305                new_index,
3306                new_len,
3307            } => {
3308                if old_index < old_line {
3309                    n = new_index + new_len;
3310                    continue;
3311                }
3312                if old_index == old_line {
3313                    // `old_line` is an exclusive end or insertion point aligned with this insert
3314                    // (e.g. EOF append maps to after the inserted block).
3315                    return (new_index + new_len).min(n_new);
3316                }
3317                return new_index.min(n_new);
3318            }
3319            DiffOp::Replace {
3320                old_index,
3321                old_len,
3322                new_index,
3323                new_len,
3324            } => {
3325                if old_index + old_len <= old_line {
3326                    n = new_index + new_len;
3327                    continue;
3328                }
3329                if old_index < old_line {
3330                    let into_old = old_line - old_index;
3331                    let mapped = new_index + into_old.min(new_len);
3332                    return mapped.min(n_new);
3333                }
3334                return new_index.min(n_new);
3335            }
3336        }
3337    }
3338    n.min(n_new)
3339}
3340
/// Parse `@@ -old +new @@` into 0-based half-open ranges in each file.
///
/// Returns `(old_start, old_end, new_start, new_end)`. Each side is `start[,count]` with
/// a 1-based `start`; a missing count means one line. The 0-based start is `start - 1`
/// (saturating at zero) regardless of the count, and the end is `start + count`.
/// Anything after the closing ` @@` is ignored. Returns `None` for malformed headers.
fn parse_unified_hunk_header_ranges(header: &str) -> Option<(usize, usize, usize, usize)> {
    /// Converts one `start[,count]` spec into a 0-based `(begin, end)` pair.
    fn zero_based_range(spec: &str) -> Option<(usize, usize)> {
        let spec = spec.trim();
        let (first_line, len) = match spec.split_once(',') {
            Some((start_text, count_text)) => {
                let start = start_text.parse::<usize>().ok()?;
                let count = count_text.parse::<usize>().ok()?;
                (start, count)
            }
            None => (spec.parse::<usize>().ok()?, 1),
        };
        let begin = first_line.saturating_sub(1);
        Some((begin, begin.saturating_add(len)))
    }

    let body = header.strip_prefix("@@ ")?;
    let (old_part, after_plus) = body.split_once(" +")?;
    let old_spec = old_part.strip_prefix('-')?;
    let (new_spec, _) = after_plus.split_once(" @@")?;

    let (old_start, old_end) = zero_based_range(old_spec)?;
    let (new_start, new_end) = zero_based_range(new_spec)?;
    Some((old_start, old_end, new_start, new_end))
}
3365
3366/// Git `xemit.c`: when a hunk only inserts at EOF (first inserted line is `new_index == n_old`)
3367/// and the added text already contains a funcname line, do not pull extra context from the preimage.
3368fn append_with_whole_function_added(
3369    ops: &[similar::DiffOp],
3370    n_old: usize,
3371    n_new: usize,
3372    new_lines: &[&str],
3373    matcher: Option<&FuncnameMatcher>,
3374) -> bool {
3375    use similar::DiffOp;
3376    if n_old == 0 {
3377        return false;
3378    }
3379    let mut only_ins_or_eq = true;
3380    let mut min_new_ins = usize::MAX;
3381    for op in ops {
3382        match *op {
3383            DiffOp::Equal { .. } => {}
3384            DiffOp::Insert {
3385                new_index, new_len, ..
3386            } => {
3387                min_new_ins = min_new_ins.min(new_index);
3388                if new_len == 0 {
3389                    only_ins_or_eq = false;
3390                }
3391            }
3392            DiffOp::Delete { .. } | DiffOp::Replace { .. } => {
3393                only_ins_or_eq = false;
3394            }
3395        }
3396    }
3397    let mut insert_at_eof = false;
3398    for op in ops {
3399        if let DiffOp::Insert { old_index, .. } = *op {
3400            if old_index == n_old {
3401                insert_at_eof = true;
3402                break;
3403            }
3404        }
3405    }
3406    let append_at_eof = min_new_ins == n_old || insert_at_eof;
3407    if !only_ins_or_eq || !append_at_eof || min_new_ins == usize::MAX {
3408        return false;
3409    }
3410    // Git only skips preimage pull when the inserted block is clearly a new logical
3411    // function (see `xemit.c` walking `xdf2` for `is_func_rec`). A loose "any line
3412    // looks like a function" check would match `return` / `printf` and break `-W`
3413    // hunks that still need preimage context (t4051 `extended`).
3414    let mut j = min_new_ins;
3415    while j < n_new {
3416        let line = new_lines[j];
3417        if line.trim().is_empty() {
3418            j += 1;
3419            continue;
3420        }
3421        if let Some(m) = matcher {
3422            if m.match_line(line).is_some() {
3423                return true;
3424            }
3425        } else if inserted_block_starts_with_c_like_function_definition(line) {
3426            return true;
3427        }
3428        j += 1;
3429    }
3430    false
3431}
3432
/// Heuristic: does `line` look like the first line of a C-style function definition,
/// i.e. `<type-or-modifier ...> name(`?
///
/// Only the text before the first `(` is inspected (leading indentation is ignored).
/// It must consist of at least two whitespace-separated tokens: the last one is the
/// function name (an identifier, optionally decorated with `*` / `&`), and at least one
/// of the preceding tokens must be a well-known C type keyword or storage/qualifier
/// keyword. Calls and statements such as `printf(...)` or `return foo(...)` are rejected.
///
/// Pointer/reference sigils are accepted on either side of the space, so both
/// `char *name(` and `char* name(` are recognised.
fn inserted_block_starts_with_c_like_function_definition(line: &str) -> bool {
    const TYPE_OR_MODIFIER: &[&str] = &[
        "static", "extern", "inline", "void", "int", "char", "short", "long", "float", "double",
        "unsigned", "signed", "struct", "enum", "union", "const", "volatile", "typedef",
    ];

    fn is_pointer_sigil(c: char) -> bool {
        c == '*' || c == '&'
    }

    let trimmed = line.trim_start();
    let Some(open_paren) = trimmed.find('(') else {
        return false;
    };
    let tokens: Vec<&str> = trimmed[..open_paren].split_whitespace().collect();
    let Some((declarator, leading)) = tokens.split_last() else {
        return false;
    };
    if leading.is_empty() {
        // `printf(...)`, `return (`, etc. — not `return_type name(`.
        return false;
    }

    // In `char *name(` the sigil is glued to the front of the name, so strip both ends.
    let name = declarator.trim_matches(is_pointer_sigil);
    if name.is_empty() || !name.chars().all(|c| c.is_ascii_alphanumeric() || c == '_') {
        return false;
    }

    // In `char* name(` the sigil is glued to the type instead.
    leading
        .iter()
        .any(|tok| TYPE_OR_MODIFIER.contains(&tok.trim_end_matches(is_pointer_sigil)))
}
3476
3477fn hunk_old_change_end_exclusive(ops: &[similar::DiffOp]) -> usize {
3478    use similar::DiffOp;
3479    let mut max_o = 0usize;
3480    for op in ops {
3481        match *op {
3482            DiffOp::Delete {
3483                old_index, old_len, ..
3484            } => {
3485                max_o = max_o.max(old_index + old_len);
3486            }
3487            DiffOp::Replace {
3488                old_index, old_len, ..
3489            } => {
3490                max_o = max_o.max(old_index + old_len);
3491            }
3492            DiffOp::Insert { old_index, .. } => {
3493                // Pure insertions do not consume old lines; Git's post-context anchor is the
3494                // insertion point (`old_index`), not 0 (t4051 `extended`).
3495                max_o = max_o.max(old_index);
3496            }
3497            DiffOp::Equal { .. } => {}
3498        }
3499    }
3500    max_o
3501}
3502
3503fn func_context_old_anchor(ops: &[similar::DiffOp], n_old: usize) -> usize {
3504    use similar::DiffOp;
3505    let mut has_delete_or_replace = false;
3506    let mut min_del = usize::MAX;
3507    let mut min_ins_old = usize::MAX;
3508
3509    for op in ops {
3510        match *op {
3511            DiffOp::Delete {
3512                old_index, old_len, ..
3513            } => {
3514                has_delete_or_replace = true;
3515                min_del = min_del.min(old_index);
3516                min_del = min_del.min(old_index + old_len.saturating_sub(1));
3517            }
3518            DiffOp::Replace {
3519                old_index, old_len, ..
3520            } => {
3521                has_delete_or_replace = true;
3522                min_del = min_del.min(old_index);
3523                min_del = min_del.min(old_index + old_len.saturating_sub(1));
3524            }
3525            DiffOp::Insert { old_index, .. } => {
3526                min_ins_old = min_ins_old.min(old_index);
3527            }
3528            DiffOp::Equal { .. } => {}
3529        }
3530    }
3531
3532    let mut i1 = if has_delete_or_replace {
3533        min_del
3534    } else if min_ins_old != usize::MAX {
3535        min_ins_old
3536    } else {
3537        0
3538    };
3539
3540    let pure_insert = ops
3541        .iter()
3542        .all(|op| matches!(op, DiffOp::Insert { .. } | DiffOp::Equal { .. }))
3543        && ops.iter().any(|op| matches!(op, DiffOp::Insert { .. }));
3544
3545    if pure_insert && i1 >= n_old && n_old > 0 {
3546        i1 = n_old - 1;
3547    }
3548
3549    i1.min(n_old.saturating_sub(1))
3550}
3551
3552fn expand_func_pre_start(
3553    s1: usize,
3554    i1: usize,
3555    n_old: usize,
3556    old_lines: &[&str],
3557    matcher: Option<&FuncnameMatcher>,
3558) -> usize {
3559    if n_old == 0 {
3560        return s1;
3561    }
3562    let i1 = i1.min(n_old.saturating_sub(1));
3563    let mut fs1 = get_func_line_backward(old_lines, i1, matcher).unwrap_or(i1);
3564    while fs1 > 0
3565        && !is_line_empty_for_func_context(old_lines[fs1 - 1])
3566        && !is_func_line(old_lines[fs1 - 1], matcher)
3567    {
3568        fs1 -= 1;
3569    }
3570    s1.min(fs1)
3571}
3572
3573fn expand_func_post_end(
3574    e1: usize,
3575    i1_end: usize,
3576    n_old: usize,
3577    old_lines: &[&str],
3578    matcher: Option<&FuncnameMatcher>,
3579) -> usize {
3580    let from = i1_end.min(n_old);
3581    let fe1 = get_func_line_forward(old_lines, from, matcher).unwrap_or(n_old);
3582    let mut fe1_adj = fe1;
3583    while fe1_adj > 0 && is_line_empty_for_func_context(old_lines[fe1_adj - 1]) {
3584        fe1_adj -= 1;
3585    }
3586    e1.max(fe1_adj).min(n_old)
3587}
3588
/// True when `line` is empty or consists solely of whitespace.
fn is_line_empty_for_func_context(line: &str) -> bool {
    line.trim().is_empty()
}
3592
3593fn is_func_line(line: &str, matcher: Option<&FuncnameMatcher>) -> bool {
3594    if let Some(m) = matcher {
3595        return m.match_line(line).is_some();
3596    }
3597    let t = line.trim_end_matches(['\n', '\r']);
3598    if t.is_empty() {
3599        return false;
3600    }
3601    let b = t.as_bytes()[0];
3602    b.is_ascii_alphabetic() || b == b'_' || b == b'$'
3603}
3604
3605fn get_func_line_backward(
3606    old_lines: &[&str],
3607    start: usize,
3608    matcher: Option<&FuncnameMatcher>,
3609) -> Option<usize> {
3610    let mut l = start.min(old_lines.len().saturating_sub(1));
3611    if old_lines.is_empty() {
3612        return None;
3613    }
3614    loop {
3615        if is_func_line(old_lines[l], matcher) {
3616            return Some(l);
3617        }
3618        if l == 0 {
3619            break;
3620        }
3621        l -= 1;
3622    }
3623    None
3624}
3625
3626fn get_func_line_forward(
3627    old_lines: &[&str],
3628    start: usize,
3629    matcher: Option<&FuncnameMatcher>,
3630) -> Option<usize> {
3631    let mut l = start;
3632    while l < old_lines.len() {
3633        if is_func_line(old_lines[l], matcher) {
3634            return Some(l);
3635        }
3636        l += 1;
3637    }
3638    None
3639}
3640
3641/// Compute a unified diff with anchored lines.
3642///
3643/// Anchored lines that appear exactly once in both old and new content are
3644/// forced to match, splitting the diff into segments around those anchor points.
3645/// This produces diffs where the anchored text stays as context and surrounding
3646/// lines are shown as additions/removals.
3647///
3648/// Segment diffs use `algorithm`. When `use_git_histogram` is true, histogram uses imara-diff
3649/// (Git-compatible); otherwise `algorithm` is passed to `similar`.
3650pub fn anchored_unified_diff(
3651    old_content: &str,
3652    new_content: &str,
3653    old_path: &str,
3654    new_path: &str,
3655    context_lines: usize,
3656    anchors: &[String],
3657    algorithm: similar::Algorithm,
3658    use_git_histogram: bool,
3659    indent_heuristic: bool,
3660    quote_path_fully: bool,
3661) -> String {
3662    use crate::quote_path::format_diff_path_with_prefix;
3663    use similar::TextDiff;
3664
3665    let old_lines: Vec<&str> = old_content.lines().collect();
3666    let new_lines: Vec<&str> = new_content.lines().collect();
3667
3668    // Find anchored lines that appear exactly once in both old and new
3669    let mut anchor_pairs: Vec<(usize, usize)> = Vec::new(); // (old_idx, new_idx)
3670
3671    for anchor in anchors {
3672        let anchor_str = anchor.as_str();
3673
3674        // Count occurrences in old
3675        let old_positions: Vec<usize> = old_lines
3676            .iter()
3677            .enumerate()
3678            .filter(|(_, l)| l.trim_end() == anchor_str)
3679            .map(|(i, _)| i)
3680            .collect();
3681
3682        // Count occurrences in new
3683        let new_positions: Vec<usize> = new_lines
3684            .iter()
3685            .enumerate()
3686            .filter(|(_, l)| l.trim_end() == anchor_str)
3687            .map(|(i, _)| i)
3688            .collect();
3689
3690        // Only anchor if unique in both
3691        if old_positions.len() == 1 && new_positions.len() == 1 {
3692            anchor_pairs.push((old_positions[0], new_positions[0]));
3693        }
3694    }
3695
3696    // If no valid anchors, fall back to normal diff
3697    if anchor_pairs.is_empty() {
3698        return unified_diff_with_prefix_and_funcname_and_algorithm(
3699            old_content,
3700            new_content,
3701            old_path,
3702            new_path,
3703            context_lines,
3704            0,
3705            "a/",
3706            "b/",
3707            None,
3708            algorithm,
3709            false,
3710            use_git_histogram,
3711            indent_heuristic,
3712            quote_path_fully,
3713        );
3714    }
3715
3716    // Sort anchor pairs by their position in the old file
3717    anchor_pairs.sort_by_key(|&(old_idx, _)| old_idx);
3718
3719    // Filter to only keep pairs where new positions are also increasing
3720    // (longest increasing subsequence of new positions)
3721    let mut filtered: Vec<(usize, usize)> = Vec::new();
3722    for &pair in &anchor_pairs {
3723        if filtered.is_empty() || filtered.last().is_some_and(|last| pair.1 > last.1) {
3724            filtered.push(pair);
3725        }
3726    }
3727    let anchor_pairs = filtered;
3728
3729    // Build a modified version of old/new where we diff segments between anchors.
3730    // We'll construct the diff by processing segments:
3731    // - Before first anchor
3732    // - Between consecutive anchors
3733    // - After last anchor
3734    // Each anchor line itself is a fixed context match.
3735
3736    // Collect all diff operations
3737    struct LineDiffOp {
3738        tag: char, // ' ', '+', '-'
3739        line: String,
3740    }
3741
3742    let append_segment_diff =
3743        |ops: &mut Vec<LineDiffOp>, old_seg_input: &str, new_seg_input: &str| {
3744            use similar::ChangeTag;
3745            let old_ls: Vec<&str> = old_seg_input.lines().collect();
3746            let new_ls: Vec<&str> = new_seg_input.lines().collect();
3747            if old_ls.is_empty() && new_ls.is_empty() {
3748                return;
3749            }
3750            let seg_diff = TextDiff::configure()
3751                .algorithm(algorithm)
3752                .diff_slices(&old_ls, &new_ls);
3753            let raw = seg_diff.ops().to_vec();
3754            let compacted = diff_indent_heuristic::apply_change_compact_to_ops(
3755                &raw,
3756                &old_ls,
3757                &new_ls,
3758                indent_heuristic,
3759            );
3760            for op in &compacted {
3761                for ch in op.iter_changes(&old_ls, &new_ls) {
3762                    let t = match ch.tag() {
3763                        ChangeTag::Equal => ' ',
3764                        ChangeTag::Delete => '-',
3765                        ChangeTag::Insert => '+',
3766                    };
3767                    ops.push(LineDiffOp {
3768                        tag: t,
3769                        line: ch.value().to_string(),
3770                    });
3771                }
3772            }
3773        };
3774
3775    let mut ops: Vec<LineDiffOp> = Vec::new();
3776    let mut old_pos = 0usize;
3777    let mut new_pos = 0usize;
3778
3779    for &(old_anchor, new_anchor) in &anchor_pairs {
3780        // Diff the segment before this anchor
3781        let old_segment: Vec<&str> = old_lines[old_pos..old_anchor].to_vec();
3782        let new_segment: Vec<&str> = new_lines[new_pos..new_anchor].to_vec();
3783
3784        let old_seg_text = old_segment.join("\n");
3785        let new_seg_text = new_segment.join("\n");
3786
3787        if !old_seg_text.is_empty() || !new_seg_text.is_empty() {
3788            let old_seg_input = if old_seg_text.is_empty() {
3789                String::new()
3790            } else {
3791                format!("{}\n", old_seg_text)
3792            };
3793            let new_seg_input = if new_seg_text.is_empty() {
3794                String::new()
3795            } else {
3796                format!("{}\n", new_seg_text)
3797            };
3798            append_segment_diff(&mut ops, &old_seg_input, &new_seg_input);
3799        }
3800
3801        // The anchor line itself is always context
3802        ops.push(LineDiffOp {
3803            tag: ' ',
3804            line: old_lines[old_anchor].to_string(),
3805        });
3806
3807        old_pos = old_anchor + 1;
3808        new_pos = new_anchor + 1;
3809    }
3810
3811    // Diff the remaining segment after the last anchor
3812    let old_segment: Vec<&str> = old_lines[old_pos..].to_vec();
3813    let new_segment: Vec<&str> = new_lines[new_pos..].to_vec();
3814    let old_seg_text = old_segment.join("\n");
3815    let new_seg_text = new_segment.join("\n");
3816
3817    if !old_seg_text.is_empty() || !new_seg_text.is_empty() {
3818        let old_seg_input = if old_seg_text.is_empty() {
3819            String::new()
3820        } else {
3821            format!("{}\n", old_seg_text)
3822        };
3823        let new_seg_input = if new_seg_text.is_empty() {
3824            String::new()
3825        } else {
3826            format!("{}\n", new_seg_text)
3827        };
3828        append_segment_diff(&mut ops, &old_seg_input, &new_seg_input);
3829    }
3830
3831    // Now format as unified diff with hunks
3832    let mut output = String::new();
3833    if old_path == "/dev/null" {
3834        output.push_str("--- /dev/null\n");
3835    } else {
3836        output.push_str("--- ");
3837        output.push_str(&format_diff_path_with_prefix(
3838            "a/",
3839            old_path,
3840            quote_path_fully,
3841        ));
3842        output.push('\n');
3843    }
3844    if new_path == "/dev/null" {
3845        output.push_str("+++ /dev/null\n");
3846    } else {
3847        output.push_str("+++ ");
3848        output.push_str(&format_diff_path_with_prefix(
3849            "b/",
3850            new_path,
3851            quote_path_fully,
3852        ));
3853        output.push('\n');
3854    }
3855
3856    // Group ops into hunks with context
3857    let total_ops = ops.len();
3858    if total_ops == 0 {
3859        return output;
3860    }
3861
3862    // Find ranges of changes
3863    let mut hunks: Vec<(usize, usize)> = Vec::new(); // (start, end) indices into ops
3864    let mut i = 0;
3865    while i < total_ops {
3866        if ops[i].tag != ' ' {
3867            let start = i.saturating_sub(context_lines);
3868            let mut end = i;
3869            // Extend to include consecutive changes and their context
3870            while end < total_ops {
3871                if ops[end].tag != ' ' {
3872                    end += 1;
3873                    continue;
3874                }
3875                // Check if there's another change within context_lines
3876                let mut next_change = end;
3877                while next_change < total_ops && ops[next_change].tag == ' ' {
3878                    next_change += 1;
3879                }
3880                if next_change < total_ops && next_change - end <= context_lines * 2 {
3881                    end = next_change + 1;
3882                } else {
3883                    end = (end + context_lines).min(total_ops);
3884                    break;
3885                }
3886            }
3887            // Merge with previous hunk if overlapping
3888            if let Some(last) = hunks.last_mut() {
3889                if start <= last.1 {
3890                    last.1 = end;
3891                } else {
3892                    hunks.push((start, end));
3893                }
3894            } else {
3895                hunks.push((start, end));
3896            }
3897            i = end;
3898        } else {
3899            i += 1;
3900        }
3901    }
3902
3903    // Output each hunk
3904    for (start, end) in hunks {
3905        // Count old/new lines in this hunk
3906        let mut old_start = 1usize;
3907        let mut new_start = 1usize;
3908        // Calculate line numbers by counting ops before this hunk
3909        for op in &ops[..start] {
3910            match op.tag {
3911                ' ' => {
3912                    old_start += 1;
3913                    new_start += 1;
3914                }
3915                '-' => {
3916                    old_start += 1;
3917                }
3918                '+' => {
3919                    new_start += 1;
3920                }
3921                _ => {}
3922            }
3923        }
3924        let mut old_count = 0usize;
3925        let mut new_count = 0usize;
3926        for op in &ops[start..end] {
3927            match op.tag {
3928                ' ' => {
3929                    old_count += 1;
3930                    new_count += 1;
3931                }
3932                '-' => {
3933                    old_count += 1;
3934                }
3935                '+' => {
3936                    new_count += 1;
3937                }
3938                _ => {}
3939            }
3940        }
3941
3942        output.push_str(&format!(
3943            "@@ -{},{} +{},{} @@\n",
3944            old_start, old_count, new_start, new_count
3945        ));
3946        for op in &ops[start..end] {
3947            output.push(op.tag);
3948            output.push_str(&op.line);
3949            output.push('\n');
3950        }
3951    }
3952
3953    output
3954}
3955
3956/// Extract function context for a hunk header.
3957///
3958/// Given a hunk header like `@@ -8,7 +8,7 @@`, find the last line
3959/// before line 8 in the old content that looks like a function header
3960/// (starts with a non-whitespace character, like Git's default).
3961fn extract_function_context(
3962    header: &str,
3963    old_lines: &[&str],
3964    funcname_matcher: Option<&FuncnameMatcher>,
3965) -> Option<String> {
3966    // Parse the old start line number from "@@ -<start>,<count> ..."
3967    let at_pos = header.find("-")?;
3968    let rest = &header[at_pos + 1..];
3969    let comma_or_space = rest.find([',', ' '])?;
3970    let start_str = &rest[..comma_or_space];
3971    let start_line: usize = start_str.parse().ok()?;
3972
3973    // Parse the old line count; "@@ -<start>,<count> ..." (no comma means count 1).
3974    // Only look for the comma inside the old-range token itself — searching the
3975    // whole remainder would pick up the comma from the new side (e.g. "+0,0").
3976    let old_token_end = rest.find([' ', '\t']).unwrap_or(rest.len());
3977    let old_token = &rest[..old_token_end];
3978    let old_count: usize = if let Some(comma) = old_token.find(',') {
3979        old_token[comma + 1..].parse().unwrap_or(1)
3980    } else {
3981        1
3982    };
3983
3984    if start_line == 0 {
3985        return None;
3986    }
3987
3988    // Look backwards for a line that matches the funcname pattern. start_line is
3989    // 1-indexed. For a normal hunk the first changed pre-image line is
3990    // old_lines[start_line-1], so we search lines strictly before it
3991    // (old_lines[0..start_line-1]). For a pure insertion (old count 0) the
3992    // content is inserted *after* old line start_line, so Git's function search
3993    // begins at that line itself: search old_lines[0..start_line].
3994    let search_end = if old_count == 0 {
3995        start_line.min(old_lines.len())
3996    } else {
3997        if start_line <= 1 {
3998            return None;
3999        }
4000        (start_line - 1).min(old_lines.len())
4001    };
4002    let truncate = |text: &str| {
4003        if text.len() > 80 {
4004            let mut end = 80;
4005            while end > 0 && !text.is_char_boundary(end) {
4006                end -= 1;
4007            }
4008            text[..end].to_owned()
4009        } else {
4010            text.to_owned()
4011        }
4012    };
4013
4014    for i in (0..search_end).rev() {
4015        let line = old_lines[i];
4016        if line.is_empty() {
4017            continue;
4018        }
4019        if let Some(matcher) = funcname_matcher {
4020            if let Some(matched) = matcher.match_line(line) {
4021                return Some(truncate(&matched));
4022            }
4023            continue;
4024        }
4025
4026        let first = line.as_bytes()[0];
4027        if first.is_ascii_alphabetic() || first == b'_' || first == b'$' {
4028            return Some(truncate(line.trim_end_matches(char::is_whitespace)));
4029        }
4030    }
4031    None
4032}
4033
4034/// Generate diff stat output (file name + insertions/deletions).
4035///
4036/// Returns a single line like: ` file.txt | 5 ++---`
4037pub fn format_stat_line(
4038    path: &str,
4039    insertions: usize,
4040    deletions: usize,
4041    max_path_len: usize,
4042) -> String {
4043    format_stat_line_width(path, insertions, deletions, max_path_len, 0)
4044}
4045
/// Formats one diffstat line: ` <path> | <total> <bar>`.
///
/// `path` is left-aligned and padded to `max_path_len`. The total (`insertions +
/// deletions`) is right-aligned in `count_width` columns, or in exactly as many columns
/// as it has digits when `count_width` is 0. The bar is one `+` per insertion followed
/// by one `-` per deletion, each capped at 50 characters; when there are no changes the
/// bar and its separating space are omitted.
pub fn format_stat_line_width(
    path: &str,
    insertions: usize,
    deletions: usize,
    max_path_len: usize,
    count_width: usize,
) -> String {
    let total = insertions + deletions;
    let digits = if count_width == 0 {
        total.to_string().len()
    } else {
        count_width
    };

    let mut line = format!(" {path:<max_path_len$} | {total:>digits$}");

    let plus_len = insertions.min(50);
    let minus_len = deletions.min(50);
    if plus_len + minus_len > 0 {
        line.push(' ');
        line.push_str(&"+".repeat(plus_len));
        line.push_str(&"-".repeat(minus_len));
    }
    line
}
4081
/// Normalise one line like Git's `-b` / `--ignore-space-change`.
///
/// Every run of whitespace that is followed by a non-whitespace character becomes a
/// single ASCII space (so leading indentation collapses to one space); trailing
/// whitespace is dropped entirely.
#[must_use]
pub fn normalize_ignore_space_change_line(line: &str) -> String {
    let mut normalized = String::with_capacity(line.len());
    // Set while inside a whitespace run; the space is only emitted once real text follows.
    let mut pending_gap = false;
    for ch in line.chars() {
        if ch.is_whitespace() {
            pending_gap = true;
            continue;
        }
        if pending_gap {
            normalized.push(' ');
            pending_gap = false;
        }
        normalized.push(ch);
    }
    normalized
}
4103
4104/// Normalise text like Git's `-b` / `--ignore-space-change`: on each line, collapse runs of
4105/// whitespace to a single ASCII space and trim trailing spaces.
4106///
4107/// Line breaks are preserved by splitting on [`str::lines`] and rejoining with `\n` (same approach
4108/// as the porcelain `diff` whitespace handling in `grit`).
4109#[must_use]
4110pub fn normalize_ignore_space_change(content: &str) -> String {
4111    content
4112        .lines()
4113        .map(normalize_ignore_space_change_line)
4114        .collect::<Vec<_>>()
4115        .join("\n")
4116}
4117
4118/// Count insertions and deletions between two strings.
4119///
4120/// Returns `(insertions, deletions)`.
4121pub fn count_changes(old_content: &str, new_content: &str) -> (usize, usize) {
4122    count_changes_with_algorithm(old_content, new_content, similar::Algorithm::Myers, false)
4123}
4124
4125/// Count insertions and deletions using the given line-diff algorithm.
4126///
4127/// Git's `--stat` / `--numstat` follow the configured diff algorithm; this mirrors that by
4128/// running [`similar::TextDiff`] with an explicit [`similar::Algorithm`].
4129#[must_use]
4130pub fn count_changes_with_algorithm(
4131    old_content: &str,
4132    new_content: &str,
4133    algorithm: similar::Algorithm,
4134    use_git_histogram: bool,
4135) -> (usize, usize) {
4136    if use_git_histogram {
4137        use imara_diff::{Algorithm, Diff, InternedInput};
4138        let input = InternedInput::new(old_content, new_content);
4139        let mut d = Diff::compute(Algorithm::Histogram, &input);
4140        d.postprocess_lines(&input);
4141        return (d.count_additions() as usize, d.count_removals() as usize);
4142    }
4143
4144    use similar::{ChangeTag, TextDiff};
4145
4146    let diff = TextDiff::configure()
4147        .algorithm(algorithm)
4148        .diff_lines(old_content, new_content);
4149    let mut ins = 0;
4150    let mut del = 0;
4151
4152    for change in diff.iter_all_changes() {
4153        match change.tag() {
4154            ChangeTag::Insert => ins += 1,
4155            ChangeTag::Delete => del += 1,
4156            ChangeTag::Equal => {}
4157        }
4158    }
4159
4160    (ins, del)
4161}
4162
/// Number of lines in `data` as counted for diffstat / `--numstat` (Git's `count_lines()` in
/// `diff.c`).
///
/// Every `\n` terminates one line. If the buffer does not end in `\n`, the unterminated tail
/// counts as one more line. An empty buffer has zero lines.
#[must_use]
pub fn count_git_lines(data: &[u8]) -> usize {
    let Some(&last_byte) = data.last() else {
        return 0;
    };
    let terminated = data.iter().filter(|&&byte| byte == b'\n').count();
    if last_byte == b'\n' {
        terminated
    } else {
        // Final line lacks its newline but still counts.
        terminated + 1
    }
}
4187
/// Internal maximum diff score used by Git rename/break heuristics (`MAX_SCORE` in `diffcore.h`).
pub const GIT_DIFF_MAX_SCORE: u64 = 60_000;
/// Module-private alias of [`GIT_DIFF_MAX_SCORE`] used by the break/merge score arithmetic.
const DIFF_MAX_SCORE: u64 = GIT_DIFF_MAX_SCORE;
/// Size floor, in bytes, for break scoring: when the larger blob of a pair is smaller than this,
/// the rewrite heuristics in this module bail out without computing a score.
const DIFF_MINIMUM_BREAK_SIZE: usize = 400;
/// Break threshold used when the caller does not supply one (half of [`GIT_DIFF_MAX_SCORE`]).
const DIFF_DEFAULT_BREAK_SCORE: u64 = 30_000;
/// Default break threshold (`DEFAULT_BREAK_SCORE` in `diffcore.h`), internal 0–[`GIT_DIFF_MAX_SCORE`] scale.
pub const GIT_DIFF_DEFAULT_BREAK_SCORE: u64 = DIFF_DEFAULT_BREAK_SCORE;
/// Default merge threshold after a break (`DEFAULT_MERGE_SCORE` in `diffcore.h`): pairs broken for
/// rename/copy but not consumed are merged back when deletion-weight is below this (60% by default).
pub const GIT_DIFF_DEFAULT_MERGE_SCORE_AFTER_BREAK: u64 = 36_000;
/// Modulus applied to span hashes before they are stored in the span table.
// NOTE(review): presumably mirrors `HASHBASE` in Git's `diffcore-delta.c` — confirm.
const DIFF_HASHBASE: u32 = 107_927;
4199
/// One open-addressing slot of a [`SpanHashTop`].
///
/// A slot with `cnt == 0` is empty; `hashval` is only meaningful for occupied slots.
#[derive(Clone, Copy, Default)]
struct SpanSlot {
    /// Hash identifying the span's content.
    hashval: u32,
    /// Accumulated byte count of all spans with this hash (`0` marks an empty slot).
    cnt: u32,
}

/// Power-of-two sized, linearly probed hash table mapping span hashes to byte counts.
///
/// The table grows (doubles) as soon as the free-slot budget computed by [`initial_free`] is
/// exhausted, so probing always finds an empty slot.
struct SpanHashTop {
    /// log2 of the current table size.
    alloc_log2: u8,
    /// Insertions still allowed before the table must grow; negative means "grow now".
    free_slots: i32,
    /// Slot storage; its length is always `1 << alloc_log2`.
    data: Vec<SpanSlot>,
}

impl SpanHashTop {
    /// Create an empty table with `1 << initial_log2` slots.
    fn new(initial_log2: u8) -> Self {
        let cap = 1usize << initial_log2;
        Self {
            alloc_log2: initial_log2,
            free_slots: initial_free(initial_log2),
            data: vec![SpanSlot::default(); cap],
        }
    }

    /// Number of slots (occupied or not) in the table.
    fn len(&self) -> usize {
        1usize << self.alloc_log2
    }

    /// Record `cnt` bytes for the span hash `hashval`.
    ///
    /// If the hash is already present its count is increased (saturating); otherwise a new slot
    /// is claimed. Callers are expected to pass `cnt > 0`, since a zero count is
    /// indistinguishable from an empty slot.
    fn add_span(&mut self, hashval: u32, cnt: u32) {
        let lim = self.len();
        let mut bucket = (hashval as usize) & (lim - 1);
        loop {
            let slot = &mut self.data[bucket];
            if slot.cnt == 0 {
                slot.hashval = hashval;
                slot.cnt = cnt;
                self.free_slots -= 1;
                if self.free_slots < 0 {
                    // The new entry is already stored and is carried over by `rehash`.
                    // Probing again here would find it and add `cnt` a second time.
                    self.rehash();
                }
                return;
            }
            if slot.hashval == hashval {
                slot.cnt = slot.cnt.saturating_add(cnt);
                return;
            }
            // Linear probing with wrap-around.
            bucket += 1;
            if bucket >= lim {
                bucket = 0;
            }
        }
    }

    /// Double the table size and re-insert every occupied slot.
    fn rehash(&mut self) {
        let old = std::mem::take(&mut self.data);
        self.alloc_log2 = self.alloc_log2.saturating_add(1);
        self.free_slots = initial_free(self.alloc_log2);
        self.data = vec![SpanSlot::default(); self.len()];
        for slot in old.into_iter().filter(|slot| slot.cnt != 0) {
            self.add_span_after_rehash(slot.hashval, slot.cnt);
        }
    }

    /// Re-insert an entry while rebuilding the table.
    ///
    /// Insertion during a rebuild follows exactly the same rules as a regular insertion, so this
    /// simply forwards to [`Self::add_span`].
    fn add_span_after_rehash(&mut self, hashval: u32, cnt: u32) {
        self.add_span(hashval, cnt);
    }

    /// Sort the slots so that occupied ones come first in ascending `hashval` order and all
    /// empty slots are grouped at the end.
    ///
    /// The sort key is a proper total order: two empty slots compare equal. Each hash occupies
    /// at most one slot, so an unstable sort cannot reorder anything observable.
    fn sort_by_hashval(&mut self) {
        self.data
            .sort_unstable_by_key(|slot| (slot.cnt == 0, slot.hashval));
    }
}

/// Number of insertions a table of `1 << sz_log2` slots accepts before it must grow:
/// `(1 << sz_log2) * (sz_log2 - 3) / sz_log2`, never negative.
///
/// The arithmetic is done in `i64` so the intermediate product cannot overflow `i32` for large
/// tables; `sz_log2 == 0` yields `0` instead of dividing by zero.
fn initial_free(sz_log2: u8) -> i32 {
    if sz_log2 == 0 {
        return 0;
    }
    let sz = i64::from(sz_log2);
    let free = ((1i64 << sz_log2) * (sz - 3) / sz).max(0);
    i32::try_from(free).unwrap_or(i32::MAX)
}
4317
4318fn hash_blob_spans(buf: &[u8], is_text: bool) -> SpanHashTop {
4319    let mut hash = SpanHashTop::new(9);
4320    let mut n = 0u32;
4321    let mut accum1: u32 = 0;
4322    let mut accum2: u32 = 0;
4323    let mut i = 0usize;
4324    while i < buf.len() {
4325        let c = buf[i] as u32;
4326        let old_1 = accum1;
4327        i += 1;
4328
4329        if is_text && c == b'\r' as u32 && i < buf.len() && buf[i] == b'\n' {
4330            continue;
4331        }
4332
4333        accum1 = accum1.wrapping_shl(7) ^ accum2.wrapping_shr(25);
4334        accum2 = accum2.wrapping_shl(7) ^ old_1.wrapping_shr(25);
4335        accum1 = accum1.wrapping_add(c);
4336        n += 1;
4337        if n < 64 && c != b'\n' as u32 {
4338            continue;
4339        }
4340        let hashval = (accum1.wrapping_add(accum2.wrapping_mul(0x61))) % DIFF_HASHBASE;
4341        hash.add_span(hashval, n);
4342        n = 0;
4343        accum1 = 0;
4344        accum2 = 0;
4345    }
4346    if n > 0 {
4347        let hashval = (accum1.wrapping_add(accum2.wrapping_mul(0x61))) % DIFF_HASHBASE;
4348        hash.add_span(hashval, n);
4349    }
4350    hash.sort_by_hashval();
4351    hash
4352}
4353
4354/// Approximate copied vs added material between two blobs (Git `diffcore_count_changes`).
4355///
4356/// Returns `(copied_bytes_from_src, literal_added_bytes_in_dst)` matching Git's
4357/// `diffcore_count_changes` semantics (used for `--dirstat=changes` damage).
4358#[must_use]
4359pub fn diffcore_count_changes(old: &[u8], new: &[u8]) -> (u64, u64) {
4360    let src_is_text = !crate::merge_file::is_binary(old);
4361    let dst_is_text = !crate::merge_file::is_binary(new);
4362    let src_count = hash_blob_spans(old, src_is_text);
4363    let dst_count = hash_blob_spans(new, dst_is_text);
4364    let mut sc: u64 = 0;
4365    let mut la: u64 = 0;
4366    let mut si = 0usize;
4367    let mut di = 0usize;
4368    let src_len = src_count.len();
4369    let dst_len = dst_count.len();
4370    loop {
4371        if si >= src_len || src_count.data[si].cnt == 0 {
4372            break;
4373        }
4374        let s_hash = src_count.data[si].hashval;
4375        let s_cnt = u64::from(src_count.data[si].cnt);
4376        while di < dst_len && dst_count.data[di].cnt != 0 && dst_count.data[di].hashval < s_hash {
4377            la += u64::from(dst_count.data[di].cnt);
4378            di += 1;
4379        }
4380        let mut dst_cnt = 0u64;
4381        if di < dst_len && dst_count.data[di].cnt != 0 && dst_count.data[di].hashval == s_hash {
4382            dst_cnt = u64::from(dst_count.data[di].cnt);
4383            di += 1;
4384        }
4385        if s_cnt < dst_cnt {
4386            la += dst_cnt - s_cnt;
4387            sc += s_cnt;
4388        } else {
4389            sc += dst_cnt;
4390        }
4391        si += 1;
4392    }
4393    while di < dst_len && dst_count.data[di].cnt != 0 {
4394        la += u64::from(dst_count.data[di].cnt);
4395        di += 1;
4396    }
4397    (sc, la)
4398}
4399
/// Whether this modified blob pair should use Git's "complete rewrite" diffstat path when
/// `--break-rewrites` is in effect (`should_break` in `diffcore-break.c`).
///
/// Evaluates the break heuristic with the default threshold
/// [`GIT_DIFF_DEFAULT_BREAK_SCORE`]; use [`should_break_rewrite_pair`] to supply a custom one.
#[must_use]
pub fn should_break_rewrite_for_stat(old: &[u8], new: &[u8]) -> bool {
    should_break_rewrite_inner(old, new, DIFF_DEFAULT_BREAK_SCORE)
}
4406
/// Whether an in-place blob edit should be split into delete+create for rename/copy (`should_break`
/// in `diffcore-break.c`). `break_score` is on the internal 0–[`GIT_DIFF_MAX_SCORE`] scale (default
/// [`GIT_DIFF_DEFAULT_BREAK_SCORE`]).
///
/// `old` and `new` are the blob contents before and after the edit.
#[must_use]
pub fn should_break_rewrite_pair(old: &[u8], new: &[u8], break_score: u64) -> bool {
    should_break_rewrite_inner(old, new, break_score)
}
4414
4415/// Parse a single Git `parse_rename_score` token (`50`, `50%`, decimal forms) into internal
4416/// 0–[`GIT_DIFF_MAX_SCORE`] units.
4417pub fn parse_diff_rename_score_token(arg: &str) -> Option<u64> {
4418    let mut num: u64 = 0;
4419    let mut scale: u64 = 1;
4420    let mut dot = false;
4421    let mut saw_digit = false;
4422    for ch in arg.chars() {
4423        if !dot && ch == '.' {
4424            scale = 1;
4425            dot = true;
4426            continue;
4427        }
4428        if ch == '%' {
4429            scale = if dot { scale.saturating_mul(100) } else { 100 };
4430            break;
4431        }
4432        if ch.is_ascii_digit() {
4433            saw_digit = true;
4434            if scale < 100_000 {
4435                scale = scale.saturating_mul(10);
4436                num = num.saturating_mul(10) + u64::from(ch as u8 - b'0');
4437            }
4438        } else {
4439            break;
4440        }
4441    }
4442    if !saw_digit {
4443        return None;
4444    }
4445    Some(if num >= scale {
4446        GIT_DIFF_MAX_SCORE
4447    } else {
4448        GIT_DIFF_MAX_SCORE * num / scale
4449    })
4450}
4451
4452/// Git `merge_score` from `diffcore-break.c` when a pair is considered broken: how much of the
4453/// source blob was removed (0–[`DIFF_MAX_SCORE`] scale). Used for `dissimilarity index` metadata.
4454#[must_use]
4455pub fn rewrite_merge_score(old: &[u8], new: &[u8]) -> Option<u64> {
4456    if old.is_empty() {
4457        return None;
4458    }
4459    let max_size = old.len().max(new.len());
4460    if max_size < DIFF_MINIMUM_BREAK_SIZE {
4461        return None;
4462    }
4463    let (src_copied, _) = diffcore_count_changes(old, new);
4464    let src_copied = src_copied.min(old.len() as u64);
4465    let src_removed = (old.len() as u64).saturating_sub(src_copied);
4466    Some(src_removed * DIFF_MAX_SCORE / old.len() as u64)
4467}
4468
4469/// Percentage shown in `dissimilarity index N%` for a rewrite (`similarity_index` in Git's diff.c).
4470#[must_use]
4471pub fn rewrite_dissimilarity_index_percent(old: &[u8], new: &[u8]) -> Option<u32> {
4472    let score = rewrite_merge_score(old, new)?;
4473    Some((score * 100 / DIFF_MAX_SCORE).min(100) as u32)
4474}
4475
4476fn should_break_rewrite_inner(src: &[u8], dst: &[u8], break_score: u64) -> bool {
4477    if src.is_empty() {
4478        return false;
4479    }
4480    let max_size = src.len().max(dst.len());
4481    if max_size < DIFF_MINIMUM_BREAK_SIZE {
4482        return false;
4483    }
4484    let (src_copied, literal_added) = diffcore_count_changes(src, dst);
4485    let src_copied = src_copied.min(src.len() as u64);
4486    let mut literal_added = literal_added;
4487    let dst_len = dst.len() as u64;
4488    if src_copied < dst_len && literal_added + src_copied > dst_len {
4489        literal_added = dst_len.saturating_sub(src_copied);
4490    }
4491    let src_removed = (src.len() as u64).saturating_sub(src_copied);
4492    let merge_score = src_removed * DIFF_MAX_SCORE / src.len() as u64;
4493    if merge_score > break_score {
4494        return true;
4495    }
4496    let delta_size = src_removed.saturating_add(literal_added);
4497    if delta_size * DIFF_MAX_SCORE / (max_size as u64) < break_score {
4498        return false;
4499    }
4500    let s = src.len() as u64;
4501    if (s * break_score < src_removed * DIFF_MAX_SCORE)
4502        && (literal_added * 20 < src_removed)
4503        && (literal_added * 20 < src_copied)
4504    {
4505        return false;
4506    }
4507    true
4508}
4509
4510// ── Helpers ─────────────────────────────────────────────────────────
4511
/// One non-tree entry produced by [`flatten_tree`] while walking a tree recursively.
struct FlatEntry {
    /// Path of the entry: the directory names leading to it and its own name, joined with `/`.
    path: String,
    /// Mode copied from the tree entry.
    mode: u32,
    /// Object ID copied from the tree entry.
    oid: ObjectId,
}
4518
4519fn flatten_tree(odb: &Odb, tree_oid: &ObjectId, prefix: &str) -> Result<Vec<FlatEntry>> {
4520    let entries = read_tree(odb, tree_oid)?;
4521    let mut result = Vec::new();
4522
4523    for entry in entries {
4524        let name_str = String::from_utf8_lossy(&entry.name);
4525        let path = format_path(prefix, &name_str);
4526        if is_tree_mode(entry.mode) {
4527            let nested = flatten_tree(odb, &entry.oid, &path)?;
4528            result.extend(nested);
4529        } else {
4530            result.push(FlatEntry {
4531                path,
4532                mode: entry.mode,
4533                oid: entry.oid,
4534            });
4535        }
4536    }
4537
4538    Ok(result)
4539}
4540
4541/// Paths present in `HEAD`'s tree with mode and blob/commit OID (for status porcelain v2).
4542pub fn head_path_states(
4543    odb: &Odb,
4544    head_tree: Option<&ObjectId>,
4545) -> Result<std::collections::BTreeMap<String, (u32, ObjectId)>> {
4546    let mut m = std::collections::BTreeMap::new();
4547    let Some(t) = head_tree else {
4548        return Ok(m);
4549    };
4550    for fe in flatten_tree(odb, t, "")? {
4551        m.insert(fe.path, (fe.mode, fe.oid));
4552    }
4553    Ok(m)
4554}
4555
/// `true` exactly when `mode` is the tree (directory) mode `040000`.
fn is_tree_mode(mode: u32) -> bool {
    matches!(mode, 0o040000)
}
4560
/// Join `prefix` and `name` with `/`; an empty `prefix` yields `name` unchanged.
fn format_path(prefix: &str, name: &str) -> String {
    match prefix {
        "" => name.to_owned(),
        dir => format!("{dir}/{name}"),
    }
}
4569
/// Render `mode` in octal, left-padded with zeros to at least six digits (e.g. `100644`).
pub fn format_mode(mode: u32) -> String {
    format!("{:06o}", mode)
}
4574
/// Read the HEAD commit OID from a submodule checkout directory.
///
/// Public entry point that forwards to the module-private `read_submodule_head`, which in turn
/// resolves the OID via [`read_submodule_head_oid`].
///
/// Returns `None` if the path is missing, not a submodule checkout, or has no resolvable HEAD.
#[must_use]
pub fn read_submodule_head_for_checkout(sub_dir: &Path) -> Option<ObjectId> {
    read_submodule_head(sub_dir)
}
4582
4583/// First line of a commit's message for `git diff --submodule=log` output.
4584///
4585/// Honors `encoding` in the commit object (Latin-1 vs UTF-8) using the same
4586/// rules as Git's submodule summary.
4587#[must_use]
4588pub fn submodule_commit_subject_line(c: &CommitData) -> String {
4589    let enc = c.encoding.as_deref().unwrap_or("UTF-8");
4590    let is_latin1 = enc.eq_ignore_ascii_case("ISO8859-1")
4591        || enc.eq_ignore_ascii_case("ISO-8859-1")
4592        || enc.eq_ignore_ascii_case("LATIN1")
4593        || enc.eq_ignore_ascii_case("ISO-8859-15");
4594    if let Some(raw) = c.raw_message.as_deref() {
4595        let line = raw.split(|b| *b == b'\n').next().unwrap_or(raw);
4596        if is_latin1 {
4597            return line
4598                .iter()
4599                .map(|&b| b as char)
4600                .collect::<String>()
4601                .trim()
4602                .to_owned();
4603        }
4604        return String::from_utf8_lossy(line).trim().to_string();
4605    }
4606    c.message.lines().next().unwrap_or("").trim().to_owned()
4607}
4608
/// Whether `sub_dir` is merely the placeholder left by `git apply --index` before
/// `git submodule update`: a directory with no entries, or a path that does not exist.
///
/// Any other failure to list the directory is treated as "not a placeholder".
fn submodule_worktree_is_unpopulated_placeholder(sub_dir: &Path) -> bool {
    let mut listing = match fs::read_dir(sub_dir) {
        Ok(listing) => listing,
        Err(err) => return err.kind() == std::io::ErrorKind::NotFound,
    };
    listing.next().is_none()
}
4618
/// Module-private alias for [`read_submodule_head_oid`]: resolves the HEAD commit OID of the
/// submodule checked out at `sub_dir`, or `None` when it cannot be resolved.
fn read_submodule_head(sub_dir: &Path) -> Option<ObjectId> {
    read_submodule_head_oid(sub_dir)
}
4622
/// Locate the git directory belonging to the submodule work tree at `sub_dir`.
///
/// * `sub_dir/.git` is a directory — that directory is returned.
/// * `sub_dir/.git` is a file — the first line starting with `gitdir: ` names the target; a
///   relative target is joined onto `sub_dir` (without normalising `..`), an absolute one is
///   returned as is.
///
/// Returns `None` when `.git` is absent, unreadable, or a file without a `gitdir: ` line.
#[must_use]
pub fn submodule_embedded_git_dir(sub_dir: &Path) -> Option<PathBuf> {
    let dot_git = sub_dir.join(".git");
    if !dot_git.is_file() {
        return dot_git.is_dir().then_some(dot_git);
    }

    let text = fs::read_to_string(&dot_git).ok()?;
    let target = text
        .lines()
        .find_map(|line| line.strip_prefix("gitdir: "))?
        .trim();
    let target = Path::new(target);
    let resolved = if target.is_absolute() {
        target.to_path_buf()
    } else {
        sub_dir.join(target)
    };
    Some(resolved)
}
4644
/// Find the nearest enclosing Git work tree of `sub_dir`, looking only at its ancestors.
///
/// Starting with the parent of `sub_dir`, the first directory that contains a `.git` entry
/// wins. The result is `(work_tree, git_dir)`, where `git_dir` is either the `.git` directory
/// itself or, for a gitfile, the target of its `gitdir: ` line (relative targets are joined
/// onto the work tree).
///
/// Returns `None` when no ancestor has a `.git` entry, or when the nearest `.git` file cannot
/// be read or lacks a `gitdir: ` line (the search does not continue further up in that case).
fn find_superproject_git(sub_dir: &Path) -> Option<(PathBuf, PathBuf)> {
    // `ancestors()` starts with `sub_dir` itself, which is deliberately skipped.
    for work_tree in sub_dir.ancestors().skip(1) {
        let dot_git = work_tree.join(".git");
        if !dot_git.exists() {
            continue;
        }

        let git_dir = if dot_git.is_file() {
            let text = fs::read_to_string(&dot_git).ok()?;
            let target = text
                .lines()
                .find_map(|line| line.strip_prefix("gitdir: "))?
                .trim();
            if Path::new(target).is_absolute() {
                PathBuf::from(target)
            } else {
                work_tree.join(target)
            }
        } else {
            dot_git
        };
        return Some((work_tree.to_path_buf(), git_dir));
    }
    None
}
4670
4671/// Read the HEAD commit OID from a submodule working tree directory.
4672///
4673/// Handles both embedded `.git` directories and `gitdir:` gitfiles pointing at
4674/// `.git/modules/...` (or other locations). Returns `None` if the path is not
4675/// a checkout or has no resolvable HEAD.
4676pub fn read_submodule_head_oid(sub_dir: &Path) -> Option<ObjectId> {
4677    // Submodule `.git` may be a gitfile pointing at `.git/modules/<name>` in another superproject
4678    // after `cp -R`. Prefer the current superproject's module dir when present.
4679    let mut git_dir = submodule_embedded_git_dir(sub_dir)?;
4680    if let Some((super_wt, super_git_dir)) = find_superproject_git(sub_dir) {
4681        let rel = sub_dir.strip_prefix(&super_wt).ok()?;
4682        let rel_str = rel.to_string_lossy().replace('\\', "/");
4683        let local_mod = super_git_dir
4684            .join("modules")
4685            .join(rel_str.trim_start_matches('/'));
4686        if local_mod.join("HEAD").exists() {
4687            let sg = super_git_dir.canonicalize().unwrap_or(super_git_dir);
4688            let cur = git_dir.canonicalize().unwrap_or_else(|_| git_dir.clone());
4689            if !cur.starts_with(&sg) {
4690                git_dir = local_mod;
4691            }
4692        }
4693    }
4694    let head_content = fs::read_to_string(git_dir.join("HEAD")).ok()?;
4695    let head_trimmed = head_content.trim();
4696    if head_trimmed.starts_with("ref: ") {
4697        // Use the full ref resolver so packed-refs and worktrees match Git. If `HEAD` is a stale
4698        // symref (e.g. still `refs/heads/master` while only `main` exists), fall back like
4699        // `resolve_gitlink_ref` / `git add` on embedded repos (`t6437-submodule-merge`).
4700        match crate::refs::resolve_ref(&git_dir, "HEAD") {
4701            Ok(oid) => Some(oid),
4702            Err(_) => {
4703                let mut found = None;
4704                for branch in ["main", "master"] {
4705                    let p = git_dir.join("refs/heads").join(branch);
4706                    if let Ok(s) = fs::read_to_string(&p) {
4707                        if let Ok(o) = ObjectId::from_hex(s.trim()) {
4708                            found = Some(o);
4709                            break;
4710                        }
4711                    }
4712                }
4713                found
4714            }
4715        }
4716    } else {
4717        ObjectId::from_hex(head_trimmed).ok()
4718    }
4719}
4720
4721/// True when a checked-out submodule at `rel_path` has modified or untracked content relative to
4722/// the gitlink `recorded_oid` stored in the superproject (used for `git diff <tree>` parity).
4723fn submodule_has_dirty_worktree_for_super_diff(
4724    super_worktree: &Path,
4725    rel_path: &str,
4726    recorded_oid: &ObjectId,
4727) -> bool {
4728    let flags = submodule_porcelain_flags(super_worktree, rel_path, *recorded_oid);
4729    flags.modified || flags.untracked
4730}
4731
/// Submodule dirty bits aligned with Git's `DIRTY_SUBMODULE_*` / porcelain v2 `S???` token.
///
/// The derived `Default` has every flag cleared; [`submodule_porcelain_flags`] returns that
/// value when the submodule cannot be inspected.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct SubmodulePorcelainFlags {
    /// Submodule checkout HEAD differs from the gitlink OID recorded in the parent index.
    pub new_commits: bool,
    /// The submodule has its own staged or unstaged changes (`DIRTY_SUBMODULE_MODIFIED`).
    pub modified: bool,
    /// The submodule work tree contains paths not in its index (`DIRTY_SUBMODULE_UNTRACKED`).
    pub untracked: bool,
}
4742
4743/// Inspect a checked-out submodule at `rel_path` (relative to `super_worktree`) and return
4744/// flags used for `git status --porcelain=v2` submodule tokens.
4745///
4746/// `recorded_oid` is the gitlink OID stored in the **parent** index (stage 0). When the
4747/// submodule is not checked out or cannot be opened, returns [`Default::default()`].
4748pub fn submodule_porcelain_flags(
4749    super_worktree: &Path,
4750    rel_path: &str,
4751    recorded_oid: ObjectId,
4752) -> SubmodulePorcelainFlags {
4753    let sub_dir = super_worktree.join(rel_path);
4754    let Some(sub_git_dir) = submodule_embedded_git_dir(&sub_dir) else {
4755        return SubmodulePorcelainFlags::default();
4756    };
4757    let Some(sub_head) = read_submodule_head_oid(&sub_dir) else {
4758        return SubmodulePorcelainFlags::default();
4759    };
4760
4761    let new_commits = sub_head != recorded_oid;
4762
4763    let index_path = sub_git_dir.join("index");
4764    let sub_index = match crate::index::Index::load(&index_path) {
4765        Ok(ix) => ix,
4766        Err(_) => {
4767            return SubmodulePorcelainFlags {
4768                new_commits,
4769                ..Default::default()
4770            }
4771        }
4772    };
4773
4774    let tracked: std::collections::BTreeSet<String> = sub_index
4775        .entries
4776        .iter()
4777        .filter(|e| e.stage() == 0)
4778        .map(|e| String::from_utf8_lossy(&e.path).into_owned())
4779        .collect();
4780    let untracked = submodule_dir_has_untracked_inner(&sub_dir, &sub_dir, &tracked, &sub_index);
4781
4782    let objects_dir = sub_git_dir.join("objects");
4783    let odb = Odb::new(&objects_dir);
4784
4785    let sub_head_tree = (|| -> Option<ObjectId> {
4786        let h = fs::read_to_string(sub_git_dir.join("HEAD")).ok()?;
4787        let h_str = h.trim();
4788        let commit_oid = if let Some(r) = h_str.strip_prefix("ref: ") {
4789            let oid_hex = fs::read_to_string(sub_git_dir.join(r)).ok()?;
4790            ObjectId::from_hex(oid_hex.trim()).ok()?
4791        } else {
4792            ObjectId::from_hex(h_str).ok()?
4793        };
4794        let obj = odb.read(&commit_oid).ok()?;
4795        let commit = parse_commit(&obj.data).ok()?;
4796        Some(commit.tree)
4797    })();
4798
4799    let staged_dirty = sub_head_tree
4800        .as_ref()
4801        .map(|t| diff_index_to_tree(&odb, &sub_index, Some(t), false).map(|v| !v.is_empty()))
4802        .unwrap_or(Ok(false));
4803    let staged_dirty = staged_dirty.unwrap_or(false);
4804
4805    let unstaged_dirty = diff_index_to_worktree(&odb, &sub_index, &sub_dir, false, true)
4806        .map(|v| !v.is_empty())
4807        .unwrap_or(false);
4808
4809    let mut modified = staged_dirty || unstaged_dirty;
4810
4811    // Nested submodule has its own index: OR `modified` from immediate gitlink children so a
4812    // dirty nested checkout (e.g. staged `file` under `sub1/sub2`) marks the parent gitlink as
4813    // modified in the superproject (t7506). Do **not** OR `untracked` — untracked-only inside a
4814    // nested submodule must stay `S..U` on the parent, not `S.U` / `S.M.`.
4815    for e in &sub_index.entries {
4816        if e.stage() != 0 || e.mode != 0o160000 {
4817            continue;
4818        }
4819        let child = String::from_utf8_lossy(&e.path).into_owned();
4820        let full_rel = if rel_path.is_empty() {
4821            child
4822        } else {
4823            format!("{rel_path}/{child}")
4824        };
4825        let nested = submodule_porcelain_flags(super_worktree, &full_rel, e.oid);
4826        modified |= nested.modified;
4827    }
4828
4829    SubmodulePorcelainFlags {
4830        new_commits,
4831        modified,
4832        untracked,
4833    }
4834}
4835
4836fn submodule_dir_has_untracked_inner(
4837    dir: &Path,
4838    root: &Path,
4839    tracked: &std::collections::BTreeSet<String>,
4840    owning_index: &Index,
4841) -> bool {
4842    let entries = match fs::read_dir(dir) {
4843        Ok(e) => e,
4844        Err(_) => return false,
4845    };
4846    let mut sorted: Vec<_> = entries.filter_map(|e| e.ok()).collect();
4847    sorted.sort_by_key(|e| e.file_name());
4848
4849    for entry in sorted {
4850        let name = entry.file_name().to_string_lossy().to_string();
4851        if name == ".git" {
4852            continue;
4853        }
4854        let path = entry.path();
4855        let rel = path
4856            .strip_prefix(root)
4857            .map(|p| p.to_string_lossy().to_string())
4858            .unwrap_or_else(|_| name.clone());
4859
4860        let is_dir = entry.file_type().map(|ft| ft.is_dir()).unwrap_or(false);
4861        if is_dir {
4862            let is_gitlink = owning_index
4863                .get(rel.as_bytes(), 0)
4864                .is_some_and(|e| e.mode == 0o160000);
4865            if is_gitlink {
4866                let Some(nested_git) = submodule_embedded_git_dir(&path) else {
4867                    continue;
4868                };
4869                let nested_index_path = nested_git.join("index");
4870                let Ok(nested_ix) = crate::index::Index::load(&nested_index_path) else {
4871                    continue;
4872                };
4873                let nested_tracked: std::collections::BTreeSet<String> = nested_ix
4874                    .entries
4875                    .iter()
4876                    .filter(|e| e.stage() == 0)
4877                    .map(|e| String::from_utf8_lossy(&e.path).into_owned())
4878                    .collect();
4879                if submodule_dir_has_untracked_inner(&path, &path, &nested_tracked, &nested_ix) {
4880                    return true;
4881                }
4882            } else if submodule_dir_has_untracked_inner(&path, root, tracked, owning_index) {
4883                return true;
4884            }
4885        } else if !tracked.contains(&rel) {
4886            return true;
4887        }
4888    }
4889    false
4890}