Skip to main content

grit_lib/
diff.rs

1//! Diff machinery — compare trees, index entries, and working tree files.
2//!
3//! # Overview
4//!
5//! This module provides the core diffing infrastructure shared by `diff`,
6//! `diff-index`, `status`, `log`, `show`, `commit`, and `merge`.
7//!
8//! ## Levels of comparison
9//!
10//! 1. **Tree-to-tree** — compare two tree objects (e.g. for `log`/`show`).
11//! 2. **Tree-to-index** — compare a tree (usually HEAD) against the index
12//!    (staged changes, used by `diff --cached` and `status`).
13//! 3. **Index-to-worktree** — compare index against the working directory
14//!    (unstaged changes, used by `diff` and `status`).
15//!
16//! ## Content diff
17//!
18//! Line-level diffing uses the `similar` crate (Myers, patience, minimal) and,
19//! for Git's `histogram` algorithm, `imara-diff` for output compatible with upstream Git.
20//! Output formats: unified patch, raw (`:old-mode new-mode ...`), stat,
21//! numstat.
22
23use std::fs;
24#[cfg(unix)]
25use std::os::unix::fs::MetadataExt;
26use std::path::{Path, PathBuf};
27
28use crate::config::ConfigSet;
29use crate::diff_indent_heuristic;
30use crate::error::{Error, Result};
31use crate::index::{Index, IndexEntry};
32use crate::objects::{parse_commit, parse_tree, CommitData, ObjectId, ObjectKind, TreeEntry};
33use crate::odb::Odb;
34use crate::userdiff::FuncnameMatcher;
35
36/// Splits imara-diff unified body (concatenated hunks) into per-hunk slices for post-processing.
37fn imara_unified_hunk_slices(body: &str) -> Vec<&str> {
38    let mut starts: Vec<usize> = Vec::new();
39    if body.starts_with("@@") {
40        starts.push(0);
41    }
42    for (idx, _) in body.match_indices("\n@@ ") {
43        starts.push(idx + 1);
44    }
45    starts.push(body.len());
46    starts.windows(2).map(|w| &body[w[0]..w[1]]).collect()
47}
48
49fn histogram_unified_body_raw(
50    old_content: &str,
51    new_content: &str,
52    context_lines: usize,
53    inter_hunk_context: usize,
54) -> String {
55    use imara_diff::{Algorithm, Diff, Hunk, InternedInput};
56    use std::fmt::Write as _;
57
58    let input = InternedInput::new(old_content, new_content);
59    let mut diff = Diff::compute(Algorithm::Histogram, &input);
60    diff.postprocess_lines(&input);
61
62    // Assemble hunks ourselves: imara's `UnifiedDiff` printer starts the first
63    // hunk's context at line 0 whenever the first change is within
64    // `2 * context_len` of the file start, emitting more leading context than
65    // its own header claims (t4061), and its gap threshold cannot express
66    // Git's odd `2 * U + inter_hunk_context` fuse limits (t4032).
67    let hunks: Vec<Hunk> = diff.hunks().collect();
68    if hunks.is_empty() {
69        return String::new();
70    }
71
72    let ctx = context_lines.min(u32::MAX as usize) as u32;
73    let max_gap = (2usize.saturating_mul(context_lines))
74        .saturating_add(inter_hunk_context)
75        .min(u32::MAX as usize) as u32;
76    let before_len = input.before.len() as u32;
77    let after_len = input.after.len() as u32;
78
79    // Fuse hunks whose unchanged gap is at most `max_gap` (Git xdl_get_hunk).
80    let mut groups: Vec<&[Hunk]> = Vec::new();
81    let mut group_start = 0usize;
82    for i in 1..hunks.len() {
83        if hunks[i].before.start - hunks[i - 1].before.end > max_gap {
84            groups.push(&hunks[group_start..i]);
85            group_start = i;
86        }
87    }
88    groups.push(&hunks[group_start..]);
89
90    fn push_line(out: &mut String, prefix: char, text: &str) {
91        out.push(prefix);
92        out.push_str(text);
93        if !text.ends_with('\n') {
94            out.push('\n');
95        }
96    }
97
98    // Git hunk header range: 1-based start (the preceding line when the range
99    // is empty) with the `,count` part omitted when the count is exactly 1.
100    fn fmt_side(start: u32, count: u32) -> String {
101        let shown_start = if count == 0 { start } else { start + 1 };
102        if count == 1 {
103            format!("{shown_start}")
104        } else {
105            format!("{shown_start},{count}")
106        }
107    }
108
109    let mut out = String::new();
110    for group in groups {
111        let first = &group[0];
112        let last = &group[group.len() - 1];
113        let b_start = first.before.start.saturating_sub(ctx);
114        let a_start = first.after.start.saturating_sub(ctx);
115        let b_end = (last.before.end.saturating_add(ctx)).min(before_len);
116        let a_end = (last.after.end.saturating_add(ctx)).min(after_len);
117
118        let _ = writeln!(
119            out,
120            "@@ -{} +{} @@",
121            fmt_side(b_start, b_end - b_start),
122            fmt_side(a_start, a_end - a_start)
123        );
124
125        let mut pos = b_start;
126        for hunk in group {
127            for &token in &input.before[pos as usize..hunk.before.start as usize] {
128                push_line(&mut out, ' ', input.interner[token]);
129            }
130            for &token in &input.before[hunk.before.start as usize..hunk.before.end as usize] {
131                push_line(&mut out, '-', input.interner[token]);
132            }
133            for &token in &input.after[hunk.after.start as usize..hunk.after.end as usize] {
134                push_line(&mut out, '+', input.interner[token]);
135            }
136            pos = hunk.before.end;
137        }
138        for &token in &input.before[pos as usize..b_end as usize] {
139            push_line(&mut out, ' ', input.interner[token]);
140        }
141    }
142
143    out
144}
145
146/// Build the unified-diff body (no `---`/`+++` header) using Git's histogram
147/// algorithm, but applying `--ignore-blank-lines` / `-I` semantics through a
148/// faithful port of Git's xdiff change-record machinery
149/// (`xdl_mark_ignorable_lines` + `xdl_get_hunk` + `xdl_emit_diff`).
150///
151/// `is_ignorable_change` is called with the slices of removed and added lines
152/// for one change record (each item is the line text without its trailing
153/// newline); it returns `true` when the whole change is ignorable (Git's
154/// `xch->ignore`). Ignorable changes that are far from substantive changes are
155/// dropped, and hunk boundaries are computed with Git's `max_ignorable`
156/// distance rules so the line numbers and surviving context match Git exactly.
157///
158/// Returns the body only; an empty string means no substantive change survives.
159/// Optional `--function-context` (`-W`) hunk expansion is applied *after*
160/// ignorable change-record pruning so the boundaries/funcname header match Git.
161#[allow(clippy::too_many_arguments)]
162pub(crate) fn histogram_unified_body_ignore_fc<F>(
163    old_content: &str,
164    new_content: &str,
165    context_lines: usize,
166    inter_hunk_context: usize,
167    function_context: bool,
168    funcname_matcher: Option<&FuncnameMatcher>,
169    is_ignorable_change: F,
170) -> String
171where
172    F: Fn(&[&str], &[&str]) -> bool,
173{
174    use imara_diff::{Algorithm, Diff, Hunk, InternedInput};
175    use std::fmt::Write as _;
176
177    let input = InternedInput::new(old_content, new_content);
178    let mut diff = Diff::compute(Algorithm::Histogram, &input);
179    diff.postprocess_lines(&input);
180
181    let raw_hunks: Vec<Hunk> = diff.hunks().collect();
182    if raw_hunks.is_empty() {
183        return String::new();
184    }
185
186    let before_len = input.before.len() as i64;
187    let after_len = input.after.len() as i64;
188    let ctxlen = context_lines as i64;
189    let interhunk = inter_hunk_context as i64;
190    // Git's xdl_get_hunk fuse limit: 2*ctxlen + interhunkctxlen.
191    let max_common = ctxlen.saturating_add(ctxlen).saturating_add(interhunk);
192    let max_ignorable = ctxlen;
193
194    // A change record mirrors Git's xdchange_t.
195    struct Change {
196        i1: i64,
197        chg1: i64,
198        i2: i64,
199        chg2: i64,
200        ignore: bool,
201    }
202
203    let mut changes: Vec<Change> = Vec::with_capacity(raw_hunks.len());
204    for h in &raw_hunks {
205        let i1 = h.before.start as i64;
206        let chg1 = (h.before.end - h.before.start) as i64;
207        let i2 = h.after.start as i64;
208        let chg2 = (h.after.end - h.after.start) as i64;
209        let removed: Vec<&str> = input.before[h.before.start as usize..h.before.end as usize]
210            .iter()
211            .map(|&t| input.interner[t])
212            .collect();
213        let added: Vec<&str> = input.after[h.after.start as usize..h.after.end as usize]
214            .iter()
215            .map(|&t| input.interner[t])
216            .collect();
217        let ignore = is_ignorable_change(&removed, &added);
218        changes.push(Change {
219            i1,
220            chg1,
221            i2,
222            chg2,
223            ignore,
224        });
225    }
226
227    // Helper to fetch a line of either side (token interner over lines).
228    let before_line = |idx: i64| -> &str { input.interner[input.before[idx as usize]] };
229    let after_line = |idx: i64| -> &str { input.interner[input.after[idx as usize]] };
230
231    // Port of xdl_get_hunk: starting at change index `start`, return
232    // `(new_start, last)` — the possibly-advanced first change index (leading
233    // ignorable changes dropped) and the index of the last change to include in
234    // this hunk — or `None` when the leading ignorable changes are all dropped
235    // and nothing remains.
236    let get_hunk = |start: usize| -> Option<(usize, usize)> {
237        // Remove ignorable changes that are too far before other changes.
238        // Faithful port of xdl_get_hunk's leading loop: walk every leading
239        // ignorable change; whenever its successor is absent or far enough away
240        // (>= max_ignorable), advance the hunk start past it. `scr` is the new
241        // start index, or `changes.len()` when everything is dropped.
242        let mut scr = start;
243        let mut xchp = start;
244        while xchp < changes.len() && changes[xchp].ignore {
245            let next = xchp + 1;
246            if next >= changes.len()
247                || changes[next].i1 - (changes[xchp].i1 + changes[xchp].chg1) >= max_ignorable
248            {
249                scr = next;
250            }
251            xchp = next;
252        }
253        if scr >= changes.len() {
254            return None;
255        }
256
257        let mut lxch = scr;
258        let mut ignored: i64 = 0;
259        let mut xchp = scr;
260        let mut idx = scr + 1;
261        while idx < changes.len() {
262            let xch = &changes[idx];
263            let prev = &changes[xchp];
264            let distance = xch.i1 - (prev.i1 + prev.chg1);
265            if distance > max_common {
266                break;
267            }
268            if distance < max_ignorable && (!xch.ignore || lxch == xchp) {
269                lxch = idx;
270                ignored = 0;
271            } else if distance < max_ignorable && xch.ignore {
272                ignored += xch.chg2;
273            } else if lxch != xchp
274                && xch.i1 + ignored - (changes[lxch].i1 + changes[lxch].chg1) > max_common
275            {
276                break;
277            } else if !xch.ignore {
278                lxch = idx;
279                ignored = 0;
280            } else {
281                ignored += xch.chg2;
282            }
283            xchp = idx;
284            idx += 1;
285        }
286        Some((scr, lxch))
287    };
288
289    fn push_line(out: &mut String, prefix: char, text: &str) {
290        out.push(prefix);
291        out.push_str(text);
292        if !text.ends_with('\n') {
293            out.push('\n');
294        }
295    }
296
297    // Git hunk header range (xdl_emit_hunk_hdr): `,count` part omitted when 1.
298    fn fmt_side(start: i64, count: i64) -> String {
299        if count == 1 {
300            format!("{start}")
301        } else {
302            format!("{start},{count}")
303        }
304    }
305
306    let mut out = String::new();
307    let mut cursor = 0usize;
308    while cursor < changes.len() {
309        let Some((xch_idx, xche_idx)) = get_hunk(cursor) else {
310            break;
311        };
312        let xch = &changes[xch_idx];
313        let xche = &changes[xche_idx];
314
315        // pre-context
316        let mut s1 = (xch.i1 - ctxlen).max(0);
317        let mut s2 = (xch.i2 - ctxlen).max(0);
318        // post-context (clamped so it does not run past either side's EOF).
319        let mut lctx = ctxlen;
320        lctx = lctx.min(before_len - (xche.i1 + xche.chg1));
321        lctx = lctx.min(after_len - (xche.i2 + xche.chg2));
322        let mut e1 = xche.i1 + xche.chg1 + lctx;
323        let mut e2 = xche.i2 + xche.chg2 + lctx;
324
325        // `--function-context`: expand the hunk up to the enclosing function's
326        // header and down to the line before the next function (Git's
327        // XDL_EMIT_FUNCCONTEXT in xemit.c), using the already ignore-pruned
328        // change boundaries.
329        if function_context {
330            // Upward: find the function line at or before xch.i1, then climb to
331            // the start of that function (past non-empty non-func lines).
332            let mut fs1 = xch.i1;
333            while fs1 >= 0 && !is_func_line(before_line(fs1), funcname_matcher) {
334                fs1 -= 1;
335            }
336            while fs1 > 0
337                && before_line(fs1 - 1).trim().is_empty() == false
338                && !is_func_line(before_line(fs1 - 1), funcname_matcher)
339            {
340                fs1 -= 1;
341            }
342            if fs1 < 0 {
343                fs1 = 0;
344            }
345            if fs1 < s1 {
346                s2 = (s2 - (s1 - fs1)).max(0);
347                s1 = fs1;
348            }
349            // Downward: find the next function line after the hunk end, climb up
350            // past trailing empty lines, and extend e1/e2 to it.
351            let end1 = xche.i1 + xche.chg1;
352            let mut fe1 = end1;
353            while fe1 < before_len && !is_func_line(before_line(fe1), funcname_matcher) {
354                fe1 += 1;
355            }
356            while fe1 > 0 && before_line(fe1 - 1).trim().is_empty() {
357                fe1 -= 1;
358            }
359            if fe1 > before_len {
360                fe1 = before_len;
361            }
362            if fe1 > e1 {
363                e2 = (e2 + (fe1 - e1)).min(after_len);
364                e1 = fe1;
365            }
366        }
367
368        let _ = writeln!(
369            out,
370            "@@ -{} +{} @@",
371            fmt_side(s1 + 1, e1 - s1),
372            fmt_side(s2 + 1, e2 - s2)
373        );
374
375        // Emit pre-context from the new side.
376        let mut s2c = s2;
377        while s2c < xch.i2 {
378            push_line(&mut out, ' ', after_line(s2c));
379            s2c += 1;
380        }
381
382        // Emit each change record and the context between merged records.
383        let mut s1c = xch.i1;
384        let mut s2c = xch.i2;
385        let mut k = xch_idx;
386        loop {
387            let c = &changes[k];
388            // Context between previous and current change atom.
389            while s1c < c.i1 && s2c < c.i2 {
390                push_line(&mut out, ' ', after_line(s2c));
391                s1c += 1;
392                s2c += 1;
393            }
394            // Removed lines from old side.
395            let mut r = c.i1;
396            while r < c.i1 + c.chg1 {
397                push_line(&mut out, '-', before_line(r));
398                r += 1;
399            }
400            // Added lines from new side.
401            let mut a = c.i2;
402            while a < c.i2 + c.chg2 {
403                push_line(&mut out, '+', after_line(a));
404                a += 1;
405            }
406            if k == xche_idx {
407                break;
408            }
409            s1c = c.i1 + c.chg1;
410            s2c = c.i2 + c.chg2;
411            k += 1;
412        }
413
414        // Post-context.
415        let mut s2p = xche.i2 + xche.chg2;
416        while s2p < e2 {
417            push_line(&mut out, ' ', after_line(s2p));
418            s2p += 1;
419        }
420
421        cursor = xche_idx + 1;
422    }
423
424    out
425}
426
427/// Unified diff hunks for Git's histogram algorithm (no `---` / `+++` lines).
428///
429/// Used by `--no-index` when whitespace normalization is off so the patch matches upstream Git.
430#[must_use]
431pub fn unified_diff_histogram_hunks_only(
432    old_content: &str,
433    new_content: &str,
434    context_lines: usize,
435    inter_hunk_context: usize,
436) -> String {
437    histogram_unified_body_raw(old_content, new_content, context_lines, inter_hunk_context)
438}
439
440/// Full unified diff (`---` / `+++` / hunks) using Git's histogram algorithm.
441#[must_use]
442pub fn unified_diff_histogram_with_prefix_and_funcname(
443    old_content: &str,
444    new_content: &str,
445    old_path: &str,
446    new_path: &str,
447    context_lines: usize,
448    inter_hunk_context: usize,
449    src_prefix: &str,
450    dst_prefix: &str,
451    funcname_matcher: Option<&FuncnameMatcher>,
452    quote_path_fully: bool,
453) -> String {
454    use crate::quote_path::format_diff_path_with_prefix;
455
456    let body =
457        histogram_unified_body_raw(old_content, new_content, context_lines, inter_hunk_context);
458
459    let mut output = String::new();
460    if old_path == "/dev/null" {
461        output.push_str("--- /dev/null\n");
462    } else if src_prefix.is_empty() {
463        output.push_str(&format!("--- {old_path}\n"));
464    } else {
465        output.push_str("--- ");
466        output.push_str(&format_diff_path_with_prefix(
467            src_prefix,
468            old_path,
469            quote_path_fully,
470        ));
471        output.push('\n');
472    }
473    if new_path == "/dev/null" {
474        output.push_str("+++ /dev/null\n");
475    } else if dst_prefix.is_empty() {
476        output.push_str(&format!("+++ {new_path}\n"));
477    } else {
478        output.push_str("+++ ");
479        output.push_str(&format_diff_path_with_prefix(
480            dst_prefix,
481            new_path,
482            quote_path_fully,
483        ));
484        output.push('\n');
485    }
486
487    let old_lines: Vec<&str> = old_content.lines().collect();
488    for hunk_str in imara_unified_hunk_slices(&body) {
489        if hunk_str.is_empty() {
490            continue;
491        }
492        if let Some(first_newline) = hunk_str.find('\n') {
493            let header_line = &hunk_str[..first_newline];
494            let rest = &hunk_str[first_newline..];
495            if let Some(func_ctx) =
496                extract_function_context(header_line, &old_lines, funcname_matcher)
497            {
498                output.push_str(header_line);
499                output.push(' ');
500                output.push_str(&func_ctx);
501                output.push_str(rest);
502            } else {
503                output.push_str(hunk_str);
504            }
505        } else {
506            output.push_str(hunk_str);
507        }
508    }
509
510    output
511}
512
513/// Full unified diff (`---` / `+++` / hunks) using Git's histogram algorithm,
514/// applying `--ignore-blank-lines` / `-I` change-record suppression
515/// ([`histogram_unified_body_ignore`]) and then attaching function-name hunk
516/// headers exactly like [`unified_diff_histogram_with_prefix_and_funcname`].
517///
518/// `is_ignorable_change` is given the removed/added line slices (without
519/// trailing newlines) of one change record and returns whether the whole change
520/// is ignorable. Returns an empty string when no substantive change survives so
521/// the caller can hide the file entry.
522#[allow(clippy::too_many_arguments)]
523pub fn unified_diff_histogram_ignore_with_prefix_and_funcname<F>(
524    old_content: &str,
525    new_content: &str,
526    old_path: &str,
527    new_path: &str,
528    context_lines: usize,
529    inter_hunk_context: usize,
530    src_prefix: &str,
531    dst_prefix: &str,
532    funcname_matcher: Option<&FuncnameMatcher>,
533    quote_path_fully: bool,
534    function_context: bool,
535    is_ignorable_change: F,
536) -> String
537where
538    F: Fn(&[&str], &[&str]) -> bool,
539{
540    use crate::quote_path::format_diff_path_with_prefix;
541
542    let body = histogram_unified_body_ignore_fc(
543        old_content,
544        new_content,
545        context_lines,
546        inter_hunk_context,
547        function_context,
548        funcname_matcher,
549        is_ignorable_change,
550    );
551    if body.is_empty() {
552        return String::new();
553    }
554
555    let mut output = String::new();
556    if old_path == "/dev/null" {
557        output.push_str("--- /dev/null\n");
558    } else if src_prefix.is_empty() {
559        output.push_str(&format!("--- {old_path}\n"));
560    } else {
561        output.push_str("--- ");
562        output.push_str(&format_diff_path_with_prefix(
563            src_prefix,
564            old_path,
565            quote_path_fully,
566        ));
567        output.push('\n');
568    }
569    if new_path == "/dev/null" {
570        output.push_str("+++ /dev/null\n");
571    } else if dst_prefix.is_empty() {
572        output.push_str(&format!("+++ {new_path}\n"));
573    } else {
574        output.push_str("+++ ");
575        output.push_str(&format_diff_path_with_prefix(
576            dst_prefix,
577            new_path,
578            quote_path_fully,
579        ));
580        output.push('\n');
581    }
582
583    let old_lines: Vec<&str> = old_content.lines().collect();
584    for hunk_str in imara_unified_hunk_slices(&body) {
585        if hunk_str.is_empty() {
586            continue;
587        }
588        if let Some(first_newline) = hunk_str.find('\n') {
589            let header_line = &hunk_str[..first_newline];
590            let rest = &hunk_str[first_newline..];
591            if let Some(func_ctx) =
592                extract_function_context(header_line, &old_lines, funcname_matcher)
593            {
594                output.push_str(header_line);
595                output.push(' ');
596                output.push_str(&func_ctx);
597                output.push_str(rest);
598            } else {
599                output.push_str(hunk_str);
600            }
601        } else {
602            output.push_str(hunk_str);
603        }
604    }
605
606    output
607}
608
609/// `diff.indentHeuristic` from config (Git defaults to true when unset).
610#[must_use]
611pub fn indent_heuristic_from_config(config: &ConfigSet) -> bool {
612    match config.get_bool("diff.indentHeuristic") {
613        Some(Ok(b)) => b,
614        Some(Err(_)) | None => true,
615    }
616}
617
618/// Resolve indent heuristic: `--no-indent-heuristic` and `--indent-heuristic` override config.
619#[must_use]
620pub fn resolve_indent_heuristic(
621    config: &ConfigSet,
622    cli_indent_heuristic: bool,
623    cli_no_indent_heuristic: bool,
624) -> bool {
625    if cli_no_indent_heuristic {
626        false
627    } else if cli_indent_heuristic {
628        true
629    } else {
630        indent_heuristic_from_config(config)
631    }
632}
633
634/// Parse `--indent-heuristic` / `--no-indent-heuristic` from a plumbing argv slice (last occurrence wins).
635#[must_use]
636pub fn parse_indent_heuristic_cli_flags(argv: &[String]) -> (bool, bool) {
637    let mut indent_heuristic = false;
638    let mut no_indent_heuristic = false;
639    for a in argv {
640        match a.as_str() {
641            "--indent-heuristic" => {
642                indent_heuristic = true;
643                no_indent_heuristic = false;
644            }
645            "--no-indent-heuristic" => {
646                no_indent_heuristic = true;
647                indent_heuristic = false;
648            }
649            _ => {}
650        }
651    }
652    (indent_heuristic, no_indent_heuristic)
653}
654
655// ---------------------------------------------------------------------------
656// Faithful port of Git's xdiff Myers engine (xdiff/xdiffi.c + xprepare.c),
657// restricted to what the word-diff path needs (no rename/ignore-whitespace
658// flags). Git's word diff runs the *default* xdiff Myers over the per-word
659// token streams; matching Git's exact change-record selection requires
660// reproducing its preprocessing (record classification + `xdl_cleanup_records`
661// + `xdl_trim_ends`) and its divide-and-conquer split, because `imara-diff`
662// (and `similar`) pick different — but equally minimal — alignments for inputs
663// with many repeated tokens (e.g. the `{`/`}`/`,` in the `bibtex` driver).
664mod git_xdiff {
665    const XDL_MAX_COST_MIN: i64 = 256;
666    const XDL_HEUR_MIN_COST: i64 = 256;
667    const XDL_SNAKE_CNT: i64 = 20;
668    const XDL_K_HEUR: i64 = 4;
669    const XDL_LINE_MAX: i64 = i64::MAX;
670    const XDL_KPDIS_RUN: i64 = 4;
671    const XDL_MAX_EQLIMIT: i64 = 1024;
672    const XDL_SIMSCAN_WINDOW: i64 = 100;
673
674    // record-classification actions (xprepare.c)
675    const DISCARD: u8 = 0;
676    const KEEP: u8 = 1;
677    const INVESTIGATE: u8 = 2;
678
679    /// Classical integer square root approximation using shifts (`xdl_bogosqrt`).
680    fn bogosqrt(mut n: i64) -> i64 {
681        let mut i: i64 = 1;
682        while n > 0 {
683            i <<= 1;
684            n >>= 2;
685        }
686        i
687    }
688
689    struct XdAlgoEnv {
690        mxcost: i64,
691        snake_cnt: i64,
692        heur_min: i64,
693    }
694
695    struct XdpSplit {
696        i1: i64,
697        i2: i64,
698        min_lo: bool,
699        min_hi: bool,
700    }
701
702    /// A single side of the diff: token ids plus the `changed` flags and the
703    /// `reference_index` mapping from effective record -> original record.
704    struct XdFile<'a> {
705        ids: &'a [u32],
706        changed: Vec<bool>,
707        reference_index: Vec<usize>,
708        dstart: i64,
709        dend: i64,
710        nreff: i64,
711    }
712
713    /// Hash/id lookup over the *effective* records: index into `reference_index`.
714    #[inline]
715    fn get_id(xdf: &XdFile<'_>, idx: i64) -> u32 {
716        xdf.ids[xdf.reference_index[idx as usize]]
717    }
718
719    /// Faithful port of `xdl_split` (the middle-snake finder). `kvdf`/`kvdb` are
720    /// the forward/backward k-vectors, offset so index 0 maps to diagonal 0.
721    #[allow(clippy::too_many_arguments)]
722    fn xdl_split(
723        xdf1: &XdFile<'_>,
724        off1: i64,
725        lim1: i64,
726        xdf2: &XdFile<'_>,
727        off2: i64,
728        lim2: i64,
729        kvdf: &mut [i64],
730        kvdb: &mut [i64],
731        koff: i64,
732        need_min: bool,
733        spl: &mut XdpSplit,
734        xenv: &XdAlgoEnv,
735    ) -> i64 {
736        let dmin = off1 - lim2;
737        let dmax = lim1 - off2;
738        let fmid = off1 - off2;
739        let bmid = lim1 - lim2;
740        let odd = ((fmid - bmid) & 1) != 0;
741        let mut fmin = fmid;
742        let mut fmax = fmid;
743        let mut bmin = bmid;
744        let mut bmax = bmid;
745
746        let kf = |k: i64| -> usize { (k + koff) as usize };
747
748        kvdf[kf(fmid)] = off1;
749        kvdb[kf(bmid)] = lim1;
750
751        let mut ec: i64 = 1;
752        loop {
753            let mut got_snake = false;
754
755            if fmin > dmin {
756                fmin -= 1;
757                kvdf[kf(fmin - 1)] = -1;
758            } else {
759                fmin += 1;
760            }
761            if fmax < dmax {
762                fmax += 1;
763                kvdf[kf(fmax + 1)] = -1;
764            } else {
765                fmax -= 1;
766            }
767
768            let mut d = fmax;
769            while d >= fmin {
770                let mut i1 = if kvdf[kf(d - 1)] >= kvdf[kf(d + 1)] {
771                    kvdf[kf(d - 1)] + 1
772                } else {
773                    kvdf[kf(d + 1)]
774                };
775                let prev1 = i1;
776                let mut i2 = i1 - d;
777                while i1 < lim1 && i2 < lim2 && get_id(xdf1, i1) == get_id(xdf2, i2) {
778                    i1 += 1;
779                    i2 += 1;
780                }
781                if i1 - prev1 > xenv.snake_cnt {
782                    got_snake = true;
783                }
784                kvdf[kf(d)] = i1;
785                if odd && bmin <= d && d <= bmax && kvdb[kf(d)] <= i1 {
786                    spl.i1 = i1;
787                    spl.i2 = i2;
788                    spl.min_lo = true;
789                    spl.min_hi = true;
790                    return ec;
791                }
792                d -= 2;
793            }
794
795            if bmin > dmin {
796                bmin -= 1;
797                kvdb[kf(bmin - 1)] = XDL_LINE_MAX;
798            } else {
799                bmin += 1;
800            }
801            if bmax < dmax {
802                bmax += 1;
803                kvdb[kf(bmax + 1)] = XDL_LINE_MAX;
804            } else {
805                bmax -= 1;
806            }
807
808            let mut d = bmax;
809            while d >= bmin {
810                let mut i1 = if kvdb[kf(d - 1)] < kvdb[kf(d + 1)] {
811                    kvdb[kf(d - 1)]
812                } else {
813                    kvdb[kf(d + 1)] - 1
814                };
815                let prev1 = i1;
816                let mut i2 = i1 - d;
817                while i1 > off1 && i2 > off2 && get_id(xdf1, i1 - 1) == get_id(xdf2, i2 - 1) {
818                    i1 -= 1;
819                    i2 -= 1;
820                }
821                if prev1 - i1 > xenv.snake_cnt {
822                    got_snake = true;
823                }
824                kvdb[kf(d)] = i1;
825                if !odd && fmin <= d && d <= fmax && i1 <= kvdf[kf(d)] {
826                    spl.i1 = i1;
827                    spl.i2 = i2;
828                    spl.min_lo = true;
829                    spl.min_hi = true;
830                    return ec;
831                }
832                d -= 2;
833            }
834
835            if need_min {
836                ec += 1;
837                continue;
838            }
839
840            if got_snake && ec > xenv.heur_min {
841                let mut best = 0i64;
842                let mut d = fmax;
843                while d >= fmin {
844                    let dd = if d > fmid { d - fmid } else { fmid - d };
845                    let i1 = kvdf[kf(d)];
846                    let i2 = i1 - d;
847                    let v = (i1 - off1) + (i2 - off2) - dd;
848                    if v > XDL_K_HEUR * ec
849                        && v > best
850                        && off1 + xenv.snake_cnt <= i1
851                        && i1 < lim1
852                        && off2 + xenv.snake_cnt <= i2
853                        && i2 < lim2
854                    {
855                        let mut k = 1i64;
856                        while get_id(xdf1, i1 - k) == get_id(xdf2, i2 - k) {
857                            if k == xenv.snake_cnt {
858                                best = v;
859                                spl.i1 = i1;
860                                spl.i2 = i2;
861                                break;
862                            }
863                            k += 1;
864                        }
865                    }
866                    d -= 2;
867                }
868                if best > 0 {
869                    spl.min_lo = true;
870                    spl.min_hi = false;
871                    return ec;
872                }
873
874                let mut best = 0i64;
875                let mut d = bmax;
876                while d >= bmin {
877                    let dd = if d > bmid { d - bmid } else { bmid - d };
878                    let i1 = kvdb[kf(d)];
879                    let i2 = i1 - d;
880                    let v = (lim1 - i1) + (lim2 - i2) - dd;
881                    if v > XDL_K_HEUR * ec
882                        && v > best
883                        && off1 < i1
884                        && i1 <= lim1 - xenv.snake_cnt
885                        && off2 < i2
886                        && i2 <= lim2 - xenv.snake_cnt
887                    {
888                        let mut k = 0i64;
889                        while get_id(xdf1, i1 + k) == get_id(xdf2, i2 + k) {
890                            if k == xenv.snake_cnt - 1 {
891                                best = v;
892                                spl.i1 = i1;
893                                spl.i2 = i2;
894                                break;
895                            }
896                            k += 1;
897                        }
898                    }
899                    d -= 2;
900                }
901                if best > 0 {
902                    spl.min_lo = false;
903                    spl.min_hi = true;
904                    return ec;
905                }
906            }
907
908            if ec >= xenv.mxcost {
909                let mut fbest = -1i64;
910                let mut fbest1 = -1i64;
911                let mut d = fmax;
912                while d >= fmin {
913                    let mut i1 = kvdf[kf(d)].min(lim1);
914                    let mut i2 = i1 - d;
915                    if lim2 < i2 {
916                        i1 = lim2 + d;
917                        i2 = lim2;
918                    }
919                    if fbest < i1 + i2 {
920                        fbest = i1 + i2;
921                        fbest1 = i1;
922                    }
923                    d -= 2;
924                }
925
926                let mut bbest = XDL_LINE_MAX;
927                let mut bbest1 = XDL_LINE_MAX;
928                let mut d = bmax;
929                while d >= bmin {
930                    let mut i1 = off1.max(kvdb[kf(d)]);
931                    let mut i2 = i1 - d;
932                    if i2 < off2 {
933                        i1 = off2 + d;
934                        i2 = off2;
935                    }
936                    if i1 + i2 < bbest {
937                        bbest = i1 + i2;
938                        bbest1 = i1;
939                    }
940                    d -= 2;
941                }
942
943                if (lim1 + lim2) - bbest < fbest - (off1 + off2) {
944                    spl.i1 = fbest1;
945                    spl.i2 = fbest - fbest1;
946                    spl.min_lo = true;
947                    spl.min_hi = false;
948                } else {
949                    spl.i1 = bbest1;
950                    spl.i2 = bbest - bbest1;
951                    spl.min_lo = false;
952                    spl.min_hi = true;
953                }
954                return ec;
955            }
956
957            ec += 1;
958        }
959    }
960
961    /// Faithful port of `xdl_recs_cmp` (divide & conquer). Marks `changed` flags.
962    #[allow(clippy::too_many_arguments)]
963    fn xdl_recs_cmp(
964        xdf1: &mut XdFile<'_>,
965        mut off1: i64,
966        mut lim1: i64,
967        xdf2: &mut XdFile<'_>,
968        mut off2: i64,
969        mut lim2: i64,
970        kvdf: &mut [i64],
971        kvdb: &mut [i64],
972        koff: i64,
973        need_min: bool,
974        xenv: &XdAlgoEnv,
975    ) {
976        while off1 < lim1 && off2 < lim2 && get_id(xdf1, off1) == get_id(xdf2, off2) {
977            off1 += 1;
978            off2 += 1;
979        }
980        while off1 < lim1 && off2 < lim2 && get_id(xdf1, lim1 - 1) == get_id(xdf2, lim2 - 1) {
981            lim1 -= 1;
982            lim2 -= 1;
983        }
984
985        if off1 == lim1 {
986            while off2 < lim2 {
987                let r = xdf2.reference_index[off2 as usize];
988                xdf2.changed[r] = true;
989                off2 += 1;
990            }
991        } else if off2 == lim2 {
992            while off1 < lim1 {
993                let r = xdf1.reference_index[off1 as usize];
994                xdf1.changed[r] = true;
995                off1 += 1;
996            }
997        } else {
998            let mut spl = XdpSplit {
999                i1: 0,
1000                i2: 0,
1001                min_lo: false,
1002                min_hi: false,
1003            };
1004            xdl_split(
1005                xdf1, off1, lim1, xdf2, off2, lim2, kvdf, kvdb, koff, need_min, &mut spl, xenv,
1006            );
1007            xdl_recs_cmp(
1008                xdf1, off1, spl.i1, xdf2, off2, spl.i2, kvdf, kvdb, koff, spl.min_lo, xenv,
1009            );
1010            xdl_recs_cmp(
1011                xdf1, spl.i1, lim1, xdf2, spl.i2, lim2, kvdf, kvdb, koff, spl.min_hi, xenv,
1012            );
1013        }
1014    }
1015
1016    /// `xdl_clean_mmatch`: decide whether a multimatch record should be discarded.
1017    fn clean_mmatch(action: &[u8], i: i64, mut s: i64, mut e: i64) -> bool {
1018        if i - s > XDL_SIMSCAN_WINDOW {
1019            s = i - XDL_SIMSCAN_WINDOW;
1020        }
1021        if e - i > XDL_SIMSCAN_WINDOW {
1022            e = i + XDL_SIMSCAN_WINDOW;
1023        }
1024
1025        let mut rdis0 = 0i64;
1026        let mut rpdis0 = 1i64;
1027        let mut r = 1i64;
1028        while i - r >= s {
1029            match action[(i - r) as usize] {
1030                DISCARD => rdis0 += 1,
1031                INVESTIGATE => rpdis0 += 1,
1032                _ => break, // KEEP
1033            }
1034            r += 1;
1035        }
1036        if rdis0 == 0 {
1037            return false;
1038        }
1039        let mut rdis1 = 0i64;
1040        let mut rpdis1 = 1i64;
1041        let mut r = 1i64;
1042        while i + r <= e {
1043            match action[(i + r) as usize] {
1044                DISCARD => rdis1 += 1,
1045                INVESTIGATE => rpdis1 += 1,
1046                _ => break, // KEEP
1047            }
1048            r += 1;
1049        }
1050        if rdis1 == 0 {
1051            return false;
1052        }
1053        rdis1 += rdis0;
1054        rpdis1 += rpdis0;
1055        rpdis1 * XDL_KPDIS_RUN < (rpdis1 + rdis1)
1056    }
1057
1058    /// Build the `changed` flags for both token streams using Git's xdiff Myers
1059    /// pipeline. `ids1`/`ids2` are interned token ids (equal id <=> equal token).
1060    /// Returns `(changed1, changed2)`, one bool per original token.
1061    pub fn changed_flags(ids1: &[u32], ids2: &[u32]) -> (Vec<bool>, Vec<bool>) {
1062        let nrec1 = ids1.len();
1063        let nrec2 = ids2.len();
1064
1065        // Per-token occurrence counts on each side (class len1/len2).
1066        use std::collections::HashMap;
1067        let mut count1: HashMap<u32, i64> = HashMap::new();
1068        let mut count2: HashMap<u32, i64> = HashMap::new();
1069        for &id in ids1 {
1070            *count1.entry(id).or_insert(0) += 1;
1071        }
1072        for &id in ids2 {
1073            *count2.entry(id).or_insert(0) += 1;
1074        }
1075
1076        let mut xdf1 = XdFile {
1077            ids: ids1,
1078            changed: vec![false; nrec1],
1079            reference_index: Vec::new(),
1080            dstart: 0,
1081            dend: nrec1 as i64 - 1,
1082            nreff: 0,
1083        };
1084        let mut xdf2 = XdFile {
1085            ids: ids2,
1086            changed: vec![false; nrec2],
1087            reference_index: Vec::new(),
1088            dstart: 0,
1089            dend: nrec2 as i64 - 1,
1090            nreff: 0,
1091        };
1092
1093        // xdl_trim_ends: trim leading/trailing matching records.
1094        let lim = nrec1.min(nrec2) as i64;
1095        let mut i = 0i64;
1096        while i < lim && ids1[i as usize] == ids2[i as usize] {
1097            i += 1;
1098        }
1099        xdf1.dstart = i;
1100        xdf2.dstart = i;
1101        let mut j = 0i64;
1102        let rem = lim - i;
1103        while j < rem && ids1[nrec1 - 1 - j as usize] == ids2[nrec2 - 1 - j as usize] {
1104            j += 1;
1105        }
1106        xdf1.dend = nrec1 as i64 - j - 1;
1107        xdf2.dend = nrec2 as i64 - j - 1;
1108
1109        // xdl_cleanup_records: classify and reduce to effective records.
1110        let mut action1 = vec![0u8; nrec1 + 1];
1111        let mut action2 = vec![0u8; nrec2 + 1];
1112
1113        let mut mlim = bogosqrt(nrec1 as i64);
1114        if mlim > XDL_MAX_EQLIMIT {
1115            mlim = XDL_MAX_EQLIMIT;
1116        }
1117        let mut idx = xdf1.dstart;
1118        while idx <= xdf1.dend {
1119            let id = ids1[idx as usize];
1120            let nm = *count2.get(&id).unwrap_or(&0);
1121            action1[idx as usize] = if nm == 0 {
1122                DISCARD
1123            } else if nm >= mlim {
1124                INVESTIGATE
1125            } else {
1126                KEEP
1127            };
1128            idx += 1;
1129        }
1130
1131        let mut mlim = bogosqrt(nrec2 as i64);
1132        if mlim > XDL_MAX_EQLIMIT {
1133            mlim = XDL_MAX_EQLIMIT;
1134        }
1135        let mut idx = xdf2.dstart;
1136        while idx <= xdf2.dend {
1137            let id = ids2[idx as usize];
1138            let nm = *count1.get(&id).unwrap_or(&0);
1139            action2[idx as usize] = if nm == 0 {
1140                DISCARD
1141            } else if nm >= mlim {
1142                INVESTIGATE
1143            } else {
1144                KEEP
1145            };
1146            idx += 1;
1147        }
1148
1149        let mut idx = xdf1.dstart;
1150        while idx <= xdf1.dend {
1151            let a = action1[idx as usize];
1152            if a == KEEP
1153                || (a == INVESTIGATE && !clean_mmatch(&action1, idx, xdf1.dstart, xdf1.dend))
1154            {
1155                xdf1.reference_index.push(idx as usize);
1156            } else {
1157                xdf1.changed[idx as usize] = true;
1158            }
1159            idx += 1;
1160        }
1161        xdf1.nreff = xdf1.reference_index.len() as i64;
1162
1163        let mut idx = xdf2.dstart;
1164        while idx <= xdf2.dend {
1165            let a = action2[idx as usize];
1166            if a == KEEP
1167                || (a == INVESTIGATE && !clean_mmatch(&action2, idx, xdf2.dstart, xdf2.dend))
1168            {
1169                xdf2.reference_index.push(idx as usize);
1170            } else {
1171                xdf2.changed[idx as usize] = true;
1172            }
1173            idx += 1;
1174        }
1175        xdf2.nreff = xdf2.reference_index.len() as i64;
1176
1177        // Allocate K vectors (xdl_do_diff). koff lets us use negative diagonals.
1178        let ndiags = xdf1.nreff + xdf2.nreff + 3;
1179        let kvd_len = (2 * ndiags + 2) as usize;
1180        let mut kvd = vec![0i64; kvd_len];
1181        // kvdf base offset = nreff2 + 1; kvdb base offset = ndiags + nreff2 + 1.
1182        let koff = xdf2.nreff + 1;
1183        let (kvdf_slice, kvdb_slice) = kvd.split_at_mut(ndiags as usize);
1184        // Both slices are indexed as [k + koff]; their length covers the diagonal range.
1185
1186        let xenv = XdAlgoEnv {
1187            mxcost: bogosqrt(ndiags).max(XDL_MAX_COST_MIN),
1188            snake_cnt: XDL_SNAKE_CNT,
1189            heur_min: XDL_HEUR_MIN_COST,
1190        };
1191
1192        let nreff1 = xdf1.nreff;
1193        let nreff2 = xdf2.nreff;
1194        xdl_recs_cmp(
1195            &mut xdf1, 0, nreff1, &mut xdf2, 0, nreff2, kvdf_slice, kvdb_slice, koff, false, &xenv,
1196        );
1197
1198        (xdf1.changed, xdf2.changed)
1199    }
1200}
1201
1202/// Convert per-token `changed` flags (Git xdiff style) into `similar::DiffOp`s.
1203fn changed_flags_to_ops(
1204    changed1: &[bool],
1205    changed2: &[bool],
1206    old_len: usize,
1207    new_len: usize,
1208) -> Vec<similar::DiffOp> {
1209    use similar::DiffOp;
1210    let mut ops: Vec<DiffOp> = Vec::new();
1211    let mut i = 0usize;
1212    let mut j = 0usize;
1213    while i < old_len || j < new_len {
1214        let del = i < old_len && changed1[i];
1215        let ins = j < new_len && changed2[j];
1216        if !del && !ins {
1217            // Equal run.
1218            let start_i = i;
1219            let start_j = j;
1220            while i < old_len && j < new_len && !changed1[i] && !changed2[j] {
1221                i += 1;
1222                j += 1;
1223            }
1224            ops.push(DiffOp::Equal {
1225                old_index: start_i,
1226                new_index: start_j,
1227                len: i - start_i,
1228            });
1229        } else {
1230            // Changed run: consecutive deletions then/and insertions.
1231            let start_i = i;
1232            let start_j = j;
1233            while i < old_len && changed1[i] {
1234                i += 1;
1235            }
1236            while j < new_len && changed2[j] {
1237                j += 1;
1238            }
1239            let dlen = i - start_i;
1240            let ilen = j - start_j;
1241            if dlen > 0 && ilen > 0 {
1242                ops.push(DiffOp::Replace {
1243                    old_index: start_i,
1244                    old_len: dlen,
1245                    new_index: start_j,
1246                    new_len: ilen,
1247                });
1248            } else if dlen > 0 {
1249                ops.push(DiffOp::Delete {
1250                    old_index: start_i,
1251                    old_len: dlen,
1252                    new_index: start_j,
1253                });
1254            } else if ilen > 0 {
1255                ops.push(DiffOp::Insert {
1256                    old_index: start_i,
1257                    new_index: start_j,
1258                    new_len: ilen,
1259                });
1260            }
1261        }
1262    }
1263    ops
1264}
1265
1266/// Diff two token streams with a faithful port of Git's xdiff Myers engine and return the
1267/// result as `similar::DiffOp`s. Used by the word-diff machinery, where matching Git's exact
1268/// record selection and tie-breaking matters: `imara-diff`/`similar` pick different — but
1269/// equally minimal — alignments for streams with many repeated tokens (e.g. the `bibtex`
1270/// driver's `{`/`}`/`,`), mismatching Git's reference output.
1271#[must_use]
1272pub fn word_diff_ops_imara(old_words: &[&str], new_words: &[&str]) -> Vec<similar::DiffOp> {
1273    use std::collections::HashMap;
1274
1275    // Intern tokens to ids (equal id <=> equal token), mirroring xdiff's record hashing.
1276    let mut interner: HashMap<&str, u32> = HashMap::new();
1277    let mut ids1: Vec<u32> = Vec::with_capacity(old_words.len());
1278    for &w in old_words {
1279        let next = interner.len() as u32;
1280        let id = *interner.entry(w).or_insert(next);
1281        ids1.push(id);
1282    }
1283    let mut ids2: Vec<u32> = Vec::with_capacity(new_words.len());
1284    for &w in new_words {
1285        let next = interner.len() as u32;
1286        let id = *interner.entry(w).or_insert(next);
1287        ids2.push(id);
1288    }
1289
1290    let (changed1, changed2) = git_xdiff::changed_flags(&ids1, &ids2);
1291    let ops = changed_flags_to_ops(&changed1, &changed2, old_words.len(), new_words.len());
1292
1293    // Slide changed runs to Git's canonical position (`xdl_change_compact`); the word
1294    // diff never enables the indent heuristic.
1295    diff_indent_heuristic::apply_change_compact_to_ops(&ops, old_words, new_words, false)
1296}
1297
1298/// Line-diff ops for string slices after Git `xdl_change_compact` (and optional indent heuristic).
1299#[must_use]
1300pub fn diff_slice_ops_compacted(
1301    old_lines: &[&str],
1302    new_lines: &[&str],
1303    algorithm: similar::Algorithm,
1304    indent_heuristic: bool,
1305) -> Vec<similar::DiffOp> {
1306    diff_indent_heuristic::diff_slice_ops_compacted(
1307        old_lines,
1308        new_lines,
1309        algorithm,
1310        indent_heuristic,
1311    )
1312}
1313
1314/// Map each line in `new_joined` to its origin in `old_joined` after Git-style compaction (for blame).
1315#[must_use]
1316pub fn map_new_to_old_lines_compacted(
1317    old_joined: &str,
1318    new_joined: &str,
1319    algorithm: similar::Algorithm,
1320    indent_heuristic: bool,
1321    new_line_count: usize,
1322) -> Vec<Option<usize>> {
1323    let ops = diff_indent_heuristic::diff_lines_ops_compacted(
1324        old_joined,
1325        new_joined,
1326        algorithm,
1327        indent_heuristic,
1328    );
1329    diff_indent_heuristic::map_new_to_old_from_ops(&ops, new_line_count)
1330}
1331
1332/// The kind of change between two sides of a diff.
1333#[derive(Debug, Clone, Copy, PartialEq, Eq)]
1334pub enum DiffStatus {
1335    /// File was added.
1336    Added,
1337    /// File was deleted.
1338    Deleted,
1339    /// File was modified (content or mode change).
1340    Modified,
1341    /// File was renamed (with optional content change).
1342    Renamed,
1343    /// File was copied.
1344    Copied,
1345    /// File type changed (e.g. regular → symlink).
1346    TypeChanged,
1347    /// Unmerged (conflict).
1348    Unmerged,
1349}
1350
1351impl DiffStatus {
1352    /// Single-character status letter used in raw diff output.
1353    #[must_use]
1354    pub fn letter(&self) -> char {
1355        match self {
1356            Self::Added => 'A',
1357            Self::Deleted => 'D',
1358            Self::Modified => 'M',
1359            Self::Renamed => 'R',
1360            Self::Copied => 'C',
1361            Self::TypeChanged => 'T',
1362            Self::Unmerged => 'U',
1363        }
1364    }
1365}
1366
1367/// A single diff entry representing one changed path.
1368#[derive(Debug, Clone, PartialEq, Eq)]
1369pub struct DiffEntry {
1370    /// The status of this change.
1371    pub status: DiffStatus,
1372    /// Path in the "old" side (None for Added).
1373    pub old_path: Option<String>,
1374    /// Path in the "new" side (None for Deleted).
1375    pub new_path: Option<String>,
1376    /// Old file mode (as octal string, e.g. "100644").
1377    pub old_mode: String,
1378    /// New file mode.
1379    pub new_mode: String,
1380    /// Old object ID (zero OID for Added).
1381    pub old_oid: ObjectId,
1382    /// New object ID (zero OID for Deleted).
1383    pub new_oid: ObjectId,
1384    /// Similarity score (0–100) for renames/copies.
1385    pub score: Option<u32>,
1386}
1387
1388impl DiffEntry {
1389    /// The primary path for display (new_path for adds, old_path for deletes).
1390    #[must_use]
1391    pub fn path(&self) -> &str {
1392        self.new_path
1393            .as_deref()
1394            .or(self.old_path.as_deref())
1395            .unwrap_or("")
1396    }
1397
1398    /// Return a human-oriented path display for this entry.
1399    ///
1400    /// For renames and copies this returns `old -> new`; for all other entry
1401    /// kinds this returns the primary path.
1402    #[must_use]
1403    pub fn display_path(&self) -> String {
1404        match self.status {
1405            DiffStatus::Renamed | DiffStatus::Copied => {
1406                let old = self.old_path.as_deref().unwrap_or("");
1407                let new = self.new_path.as_deref().unwrap_or("");
1408                if old.is_empty() || new.is_empty() {
1409                    self.path().to_owned()
1410                } else {
1411                    format!("{old} -> {new}")
1412                }
1413            }
1414            _ => self.path().to_owned(),
1415        }
1416    }
1417}
1418
1419/// The zero (null) object ID used for "no object" in diff output.
1420pub const ZERO_OID: &str = "0000000000000000000000000000000000000000";
1421
1422/// Return the zero ObjectId.
1423#[must_use]
1424pub fn zero_oid() -> ObjectId {
1425    ObjectId::from_bytes(&[0u8; 20]).unwrap_or_else(|_| {
1426        // This should never fail since we pass exactly 20 bytes
1427        panic!("internal error: failed to create zero OID");
1428    })
1429}
1430
1431/// Return the ObjectId for the empty blob object.
1432#[must_use]
1433pub fn empty_blob_oid() -> ObjectId {
1434    ObjectId::from_hex("e69de29bb2d1d6434b8b29ae775ad8c2e48c5391").unwrap_or_else(|_| {
1435        // This should never fail since the object ID literal is valid.
1436        panic!("internal error: failed to create empty blob OID");
1437    })
1438}
1439
1440// ── Tree-to-tree diff ───────────────────────────────────────────────
1441
1442/// Compare two trees and return the list of changed entries.
1443///
1444/// # Parameters
1445///
1446/// - `odb` — object database to read tree objects from.
1447/// - `old_tree_oid` — OID of the old tree (or `None` for comparison against empty).
1448/// - `new_tree_oid` — OID of the new tree (or `None` for comparison against empty).
1449/// - `prefix` — path prefix for nested tree recursion (empty string for root).
1450///
1451/// # Errors
1452///
1453/// Returns errors from object database reads.
1454pub fn diff_trees(
1455    odb: &Odb,
1456    old_tree_oid: Option<&ObjectId>,
1457    new_tree_oid: Option<&ObjectId>,
1458    prefix: &str,
1459) -> Result<Vec<DiffEntry>> {
1460    diff_trees_opts(odb, old_tree_oid, new_tree_oid, prefix, false)
1461}
1462
1463/// Like `diff_trees` but with `show_trees` flag: when true, emit entries for
1464/// tree objects themselves in addition to their recursive contents (the `-t`
1465/// flag of `diff-tree`).
1466pub fn diff_trees_show_tree_entries(
1467    odb: &Odb,
1468    old_tree_oid: Option<&ObjectId>,
1469    new_tree_oid: Option<&ObjectId>,
1470    prefix: &str,
1471) -> Result<Vec<DiffEntry>> {
1472    diff_trees_opts(odb, old_tree_oid, new_tree_oid, prefix, true)
1473}
1474
1475fn diff_trees_opts(
1476    odb: &Odb,
1477    old_tree_oid: Option<&ObjectId>,
1478    new_tree_oid: Option<&ObjectId>,
1479    prefix: &str,
1480    show_trees: bool,
1481) -> Result<Vec<DiffEntry>> {
1482    let old_entries = match old_tree_oid {
1483        Some(oid) => read_tree(odb, oid)?,
1484        None => Vec::new(),
1485    };
1486    let new_entries = match new_tree_oid {
1487        Some(oid) => read_tree(odb, oid)?,
1488        None => Vec::new(),
1489    };
1490
1491    let mut result = Vec::new();
1492    diff_tree_entries_opts(
1493        odb,
1494        &old_entries,
1495        &new_entries,
1496        prefix,
1497        show_trees,
1498        &mut result,
1499    )?;
1500    Ok(result)
1501}
1502
1503/// Read and parse a tree object from the ODB.
1504fn read_tree(odb: &Odb, oid: &ObjectId) -> Result<Vec<TreeEntry>> {
1505    let obj = odb.read(oid)?;
1506    if obj.kind != ObjectKind::Tree {
1507        return Err(Error::CorruptObject(format!(
1508            "expected tree, got {}",
1509            obj.kind.as_str()
1510        )));
1511    }
1512    parse_tree(&obj.data)
1513}
1514
1515/// Compare two sorted lists of tree entries, recursing into subtrees.
1516fn diff_tree_entries_opts(
1517    odb: &Odb,
1518    old: &[TreeEntry],
1519    new: &[TreeEntry],
1520    prefix: &str,
1521    show_trees: bool,
1522    result: &mut Vec<DiffEntry>,
1523) -> Result<()> {
1524    let mut oi = 0;
1525    let mut ni = 0;
1526
1527    while oi < old.len() || ni < new.len() {
1528        match (old.get(oi), new.get(ni)) {
1529            (Some(o), Some(n)) => {
1530                let cmp = crate::objects::tree_entry_cmp(
1531                    &o.name,
1532                    is_tree_mode(o.mode),
1533                    &n.name,
1534                    is_tree_mode(n.mode),
1535                );
1536                match cmp {
1537                    std::cmp::Ordering::Less => {
1538                        // Old entry not in new → deleted
1539                        emit_deleted_opts(odb, o, prefix, show_trees, result)?;
1540                        oi += 1;
1541                    }
1542                    std::cmp::Ordering::Greater => {
1543                        // New entry not in old → added
1544                        emit_added_opts(odb, n, prefix, show_trees, result)?;
1545                        ni += 1;
1546                    }
1547                    std::cmp::Ordering::Equal => {
1548                        // Both present — check for changes
1549                        if o.oid != n.oid || o.mode != n.mode {
1550                            let name_str = String::from_utf8_lossy(&o.name);
1551                            let path = format_path(prefix, &name_str);
1552                            if is_tree_mode(o.mode) && is_tree_mode(n.mode) {
1553                                // Both are trees
1554                                if show_trees {
1555                                    result.push(DiffEntry {
1556                                        status: DiffStatus::Modified,
1557                                        old_path: Some(path.clone()),
1558                                        new_path: Some(path.clone()),
1559                                        old_mode: format_mode(o.mode),
1560                                        new_mode: format_mode(n.mode),
1561                                        old_oid: o.oid,
1562                                        new_oid: n.oid,
1563                                        score: None,
1564                                    });
1565                                }
1566                                // Recurse
1567                                let nested = diff_trees_opts(
1568                                    odb,
1569                                    Some(&o.oid),
1570                                    Some(&n.oid),
1571                                    &path,
1572                                    show_trees,
1573                                )?;
1574                                result.extend(nested);
1575                            } else if is_tree_mode(o.mode) && !is_tree_mode(n.mode) {
1576                                // Tree → blob: delete tree contents, add blob
1577                                emit_deleted_opts(odb, o, prefix, show_trees, result)?;
1578                                emit_added_opts(odb, n, prefix, show_trees, result)?;
1579                            } else if !is_tree_mode(o.mode) && is_tree_mode(n.mode) {
1580                                // Blob → tree: delete blob, add tree contents
1581                                emit_deleted_opts(odb, o, prefix, show_trees, result)?;
1582                                emit_added_opts(odb, n, prefix, show_trees, result)?;
1583                            } else {
1584                                // Both blobs — modified.
1585                                // A mode-only change (e.g. chmod) is Modified.
1586                                // TypeChanged is only for actual type changes (blob ↔ symlink).
1587                                let old_type = o.mode & 0o170000;
1588                                let new_type = n.mode & 0o170000;
1589                                result.push(DiffEntry {
1590                                    status: if old_type != new_type {
1591                                        DiffStatus::TypeChanged
1592                                    } else {
1593                                        DiffStatus::Modified
1594                                    },
1595                                    old_path: Some(path.clone()),
1596                                    new_path: Some(path),
1597                                    old_mode: format_mode(o.mode),
1598                                    new_mode: format_mode(n.mode),
1599                                    old_oid: o.oid,
1600                                    new_oid: n.oid,
1601                                    score: None,
1602                                });
1603                            }
1604                        }
1605                        oi += 1;
1606                        ni += 1;
1607                    }
1608                }
1609            }
1610            (Some(o), None) => {
1611                emit_deleted_opts(odb, o, prefix, show_trees, result)?;
1612                oi += 1;
1613            }
1614            (None, Some(n)) => {
1615                emit_added_opts(odb, n, prefix, show_trees, result)?;
1616                ni += 1;
1617            }
1618            (None, None) => break,
1619        }
1620    }
1621
1622    Ok(())
1623}
1624
1625fn emit_deleted_opts(
1626    odb: &Odb,
1627    entry: &TreeEntry,
1628    prefix: &str,
1629    show_trees: bool,
1630    result: &mut Vec<DiffEntry>,
1631) -> Result<()> {
1632    let name_str = String::from_utf8_lossy(&entry.name);
1633    let path = format_path(prefix, &name_str);
1634    if is_tree_mode(entry.mode) {
1635        if show_trees {
1636            result.push(DiffEntry {
1637                status: DiffStatus::Deleted,
1638                old_path: Some(path.clone()),
1639                new_path: None,
1640                old_mode: format_mode(entry.mode),
1641                new_mode: "000000".to_owned(),
1642                old_oid: entry.oid,
1643                new_oid: zero_oid(),
1644                score: None,
1645            });
1646        }
1647        // Recurse into deleted tree
1648        let nested = diff_trees_opts(odb, Some(&entry.oid), None, &path, show_trees)?;
1649        result.extend(nested);
1650    } else {
1651        result.push(DiffEntry {
1652            status: DiffStatus::Deleted,
1653            old_path: Some(path.clone()),
1654            new_path: None,
1655            old_mode: format_mode(entry.mode),
1656            new_mode: "000000".to_owned(),
1657            old_oid: entry.oid,
1658            new_oid: zero_oid(),
1659            score: None,
1660        });
1661    }
1662    Ok(())
1663}
1664
1665fn emit_added_opts(
1666    odb: &Odb,
1667    entry: &TreeEntry,
1668    prefix: &str,
1669    show_trees: bool,
1670    result: &mut Vec<DiffEntry>,
1671) -> Result<()> {
1672    let name_str = String::from_utf8_lossy(&entry.name);
1673    let path = format_path(prefix, &name_str);
1674    if is_tree_mode(entry.mode) {
1675        if show_trees {
1676            result.push(DiffEntry {
1677                status: DiffStatus::Added,
1678                old_path: None,
1679                new_path: Some(path.clone()),
1680                old_mode: "000000".to_owned(),
1681                new_mode: format_mode(entry.mode),
1682                old_oid: zero_oid(),
1683                new_oid: entry.oid,
1684                score: None,
1685            });
1686        }
1687        // Recurse into added tree
1688        let nested = diff_trees_opts(odb, None, Some(&entry.oid), &path, show_trees)?;
1689        result.extend(nested);
1690    } else {
1691        result.push(DiffEntry {
1692            status: DiffStatus::Added,
1693            old_path: None,
1694            new_path: Some(path),
1695            old_mode: "000000".to_owned(),
1696            new_mode: format_mode(entry.mode),
1697            old_oid: zero_oid(),
1698            new_oid: entry.oid,
1699            score: None,
1700        });
1701    }
1702    Ok(())
1703}
1704
1705// ── Index-to-tree diff (staged changes) ─────────────────────────────
1706
1707/// Compare the index against a tree (usually HEAD's tree).
1708///
1709/// This shows "staged" changes — what would be committed.
1710///
1711/// # Parameters
1712///
1713/// - `odb` — object database.
1714/// - `index` — the current index.
1715/// - `tree_oid` — the tree to compare against (e.g. HEAD's tree), or `None`
1716///   for comparison against an empty tree (initial commit).
1717///
1718/// # Errors
1719///
1720/// Returns errors from ODB reads.
1721///
1722/// When `ignore_submodules` is true, gitlink (`160000`) paths are omitted from the diff, matching
1723/// Git's `require_clean_work_tree(..., ignore_submodules=1)` used by `git rebase` / `git pull`.
1724pub fn diff_index_to_tree(
1725    odb: &Odb,
1726    index: &Index,
1727    tree_oid: Option<&ObjectId>,
1728    ignore_submodules: bool,
1729) -> Result<Vec<DiffEntry>> {
1730    // Flatten the tree into a sorted list of (path, mode, oid)
1731    let tree_entries = match tree_oid {
1732        Some(oid) => flatten_tree(odb, oid, "")?,
1733        None => Vec::new(),
1734    };
1735
1736    // Build maps keyed by path
1737    let mut tree_map: std::collections::BTreeMap<&str, &FlatEntry> =
1738        std::collections::BTreeMap::new();
1739    for entry in &tree_entries {
1740        tree_map.insert(&entry.path, entry);
1741    }
1742
1743    let mut result = Vec::new();
1744    let mut stage0_paths = std::collections::BTreeSet::new();
1745    let mut unmerged_modes: std::collections::BTreeMap<String, (u8, u32)> =
1746        std::collections::BTreeMap::new();
1747
1748    // Check index entries against tree
1749    for ie in &index.entries {
1750        let path = String::from_utf8_lossy(&ie.path).to_string();
1751        if ie.stage() == 0 && ie.intent_to_add() {
1752            // Intent-to-add entries are not "staged" for diff-index / status
1753            // (matches Git: `git diff --cached` is empty for `-N` paths).
1754            continue;
1755        }
1756        if ie.stage() != 0 {
1757            let rank = match ie.stage() {
1758                2 => 0u8,
1759                3 => 1u8,
1760                1 => 2u8,
1761                _ => 3u8,
1762            };
1763            match unmerged_modes.get(&path) {
1764                Some((existing_rank, _)) if *existing_rank <= rank => {}
1765                _ => {
1766                    unmerged_modes.insert(path, (rank, ie.mode));
1767                }
1768            }
1769            continue;
1770        }
1771        if ignore_submodules && ie.mode == 0o160000 {
1772            let _ = tree_map.remove(path.as_str());
1773            stage0_paths.insert(path.clone());
1774            continue;
1775        }
1776        stage0_paths.insert(path.clone());
1777        match tree_map.remove(path.as_str()) {
1778            Some(te) => {
1779                // Present in both — check for differences
1780                if te.oid != ie.oid || te.mode != ie.mode {
1781                    result.push(DiffEntry {
1782                        status: DiffStatus::Modified,
1783                        old_path: Some(path.clone()),
1784                        new_path: Some(path),
1785                        old_mode: format_mode(te.mode),
1786                        new_mode: format_mode(ie.mode),
1787                        old_oid: te.oid,
1788                        new_oid: ie.oid,
1789                        score: None,
1790                    });
1791                }
1792            }
1793            None => {
1794                // In index but not tree → added
1795                result.push(DiffEntry {
1796                    status: DiffStatus::Added,
1797                    old_path: None,
1798                    new_path: Some(path),
1799                    old_mode: "000000".to_owned(),
1800                    new_mode: format_mode(ie.mode),
1801                    old_oid: zero_oid(),
1802                    new_oid: ie.oid,
1803                    score: None,
1804                });
1805            }
1806        }
1807    }
1808
1809    for (path, (_, mode)) in &unmerged_modes {
1810        if stage0_paths.contains(path) {
1811            continue;
1812        }
1813        tree_map.remove(path.as_str());
1814        result.push(DiffEntry {
1815            status: DiffStatus::Unmerged,
1816            old_path: Some(path.clone()),
1817            new_path: Some(path.clone()),
1818            old_mode: "000000".to_owned(),
1819            new_mode: format_mode(*mode),
1820            old_oid: zero_oid(),
1821            new_oid: zero_oid(),
1822            score: None,
1823        });
1824    }
1825
1826    // Remaining tree entries not in index → deleted
1827    for (path, te) in tree_map {
1828        if ignore_submodules && te.mode == 0o160000 {
1829            continue;
1830        }
1831        result.push(DiffEntry {
1832            status: DiffStatus::Deleted,
1833            old_path: Some(path.to_owned()),
1834            new_path: None,
1835            old_mode: format_mode(te.mode),
1836            new_mode: "000000".to_owned(),
1837            old_oid: te.oid,
1838            new_oid: zero_oid(),
1839            score: None,
1840        });
1841    }
1842
1843    result.sort_by(|a, b| a.path().cmp(b.path()));
1844    Ok(result)
1845}
1846
1847// ── Index-to-worktree diff (unstaged changes) ───────────────────────
1848
1849/// Compare the index against the working tree.
1850///
1851/// This shows "unstaged" changes — modifications not yet staged.
1852///
1853/// Entries with [`IndexEntry::assume_unchanged`] or [`IndexEntry::skip_worktree`] are treated as
1854/// matching the work tree without examining the filesystem (Git `CE_VALID` / skip-worktree).
1855///
1856/// # Parameters
1857///
1858/// - `odb` — object database (for hashing worktree files).
1859/// - `index` — the current index.
1860/// - `work_tree` — path to the working tree root.
1861/// - `ignore_submodule_untracked` — when true, gitlink entries are not dirty solely from untracked
1862///   files inside the submodule (matches `git status -uno`).
1863/// - `simplify_gitlinks` — when true, nested gitlink entries only compare the submodule checkout
1864///   HEAD to the recorded OID (ignore dirty work trees inside nested submodules). Used when
1865///   computing `submodule_porcelain_flags` so untracked files under a nested submodule do not set
1866///   the parent submodule's `modified` bit (Git `DIRTY_SUBMODULE_MODIFIED`; t7506).
1867///
1868/// # Errors
1869///
1870/// Returns errors from I/O or hashing.
1871pub fn diff_index_to_worktree(
1872    odb: &Odb,
1873    index: &Index,
1874    work_tree: &Path,
1875    ignore_submodule_untracked: bool,
1876    simplify_gitlinks: bool,
1877) -> Result<Vec<DiffEntry>> {
1878    diff_index_to_worktree_with_options(
1879        odb,
1880        index,
1881        work_tree,
1882        DiffIndexToWorktreeOptions {
1883            ignore_submodule_untracked,
1884            simplify_gitlinks,
1885            ..DiffIndexToWorktreeOptions::default()
1886        },
1887    )
1888}
1889
1890/// Additional inputs for [`diff_index_to_worktree_with_options`].
1891#[derive(Debug, Clone, Copy, Default)]
1892pub struct DiffIndexToWorktreeOptions {
1893    /// Optional index mtime pair `(sec, nsec)` sampled when the index was read.
1894    ///
1895    /// When provided, entries with matching stat data are still considered dirty candidates if
1896    /// their recorded mtime is "racy" (at or after this timestamp), matching Git's
1897    /// `is_racy_timestamp` behavior.
1898    pub index_mtime: Option<(u32, u32)>,
1899    /// When true, gitlink entries are not dirty solely from untracked files inside the submodule.
1900    pub ignore_submodule_untracked: bool,
1901    /// When true, nested gitlink entries only compare the submodule checkout HEAD to the recorded OID.
1902    pub simplify_gitlinks: bool,
1903    /// When true, a populated gitlink checkout whose `.git` indirection cannot resolve to a HEAD
1904    /// is returned as an error instead of a normal modified gitlink.
1905    pub error_on_broken_gitlinks: bool,
1906}
1907
1908/// Compare the index against the working tree with optional racy-timestamp context.
1909///
1910/// This variant enables a stat-trust fast path: if an entry's stat tuple matches and the mode is
1911/// unchanged, the worktree blob hash is skipped unless the entry is racy relative to the supplied
1912/// index mtime.
1913///
1914/// # Parameters
1915///
1916/// - `odb` — object database (for hashing worktree files).
1917/// - `index` — the current index.
1918/// - `work_tree` — path to the working tree root.
1919/// - `options` — optional context for racy timestamp checks.
1920///
1921/// # Errors
1922///
1923/// Returns errors from I/O or hashing.
1924pub fn diff_index_to_worktree_with_options(
1925    odb: &Odb,
1926    index: &Index,
1927    work_tree: &Path,
1928    options: DiffIndexToWorktreeOptions,
1929) -> Result<Vec<DiffEntry>> {
1930    use crate::config::ConfigSet;
1931    use crate::crlf;
1932
1933    let ignore_submodule_untracked = options.ignore_submodule_untracked;
1934    let simplify_gitlinks = options.simplify_gitlinks;
1935
1936    let git_dir = work_tree.join(".git");
1937    let config = ConfigSet::load(Some(&git_dir), true).unwrap_or_else(|_| ConfigSet::new());
1938    let conv = crlf::ConversionConfig::from_config(&config);
1939    let attrs = crlf::load_gitattributes(work_tree);
1940
1941    let mut result = Vec::new();
1942    let mut unmerged_base: std::collections::BTreeMap<String, (u8, &IndexEntry)> =
1943        std::collections::BTreeMap::new();
1944
1945    for ie in &index.entries {
1946        if ie.stage() != 0 {
1947            let path = String::from_utf8_lossy(&ie.path).to_string();
1948            let rank = match ie.stage() {
1949                2 => 0u8,
1950                3 => 1u8,
1951                1 => 2u8,
1952                _ => 3u8,
1953            };
1954            match unmerged_base.get(&path) {
1955                Some((existing_rank, _)) if *existing_rank <= rank => {}
1956                _ => {
1957                    unmerged_base.insert(path, (rank, ie));
1958                }
1959            }
1960            continue;
1961        }
1962        // Sparse checkout: paths outside the cone are not expected on disk; `assume_unchanged`
1963        // is treated as clean without reading the filesystem (wt-status.c).
1964        if ie.skip_worktree() || ie.assume_unchanged() {
1965            continue;
1966        }
1967        // Use str slice directly to avoid allocation for path joining;
1968        // only allocate String if we need it for DiffEntry output.
1969        let path_str_ref = std::str::from_utf8(&ie.path).unwrap_or("");
1970        let is_intent_to_add = ie.intent_to_add();
1971
1972        // Gitlink entries (submodules): Git's `diff-index` reports `M` when the recorded
1973        // commit differs from the submodule checkout **or** when the submodule work tree is
1974        // dirty (staged/unstaged/untracked) even if HEAD still matches the gitlink. For the
1975        // latter case the "new" OID column is the null OID (see `git diff-index` / t7506).
1976        if ie.mode == 0o160000 {
1977            let sub_dir = work_tree.join(path_str_ref);
1978            let sub_head_oid = read_submodule_head_oid(&sub_dir);
1979            // A gitlink whose worktree directory is entirely absent is a deleted submodule. Git's
1980            // `check_removed` (diff-lib.c) `lstat`s the path first: a missing directory is reported
1981            // as a removal (`D`, new mode 000000), *before* the submodule "not checked out" special
1982            // case (which only applies when the directory exists). An empty directory that exists is
1983            // a placeholder and stays unchanged. Skipped when `simplify_gitlinks` so callers that
1984            // only compare recorded HEADs keep their behaviour. (t4060 #50/#51.)
1985            if !simplify_gitlinks && sub_head_oid.is_none() && !sub_dir.exists() {
1986                let path_owned = path_str_ref.to_owned();
1987                result.push(DiffEntry {
1988                    status: DiffStatus::Deleted,
1989                    old_path: Some(path_owned.clone()),
1990                    new_path: Some(path_owned),
1991                    old_mode: format_mode(ie.mode),
1992                    new_mode: "000000".to_owned(),
1993                    old_oid: ie.oid,
1994                    new_oid: zero_oid(),
1995                    score: None,
1996                });
1997                continue;
1998            }
1999            let ref_matches = if let Some(oid) = sub_head_oid {
2000                oid == ie.oid
2001            } else {
2002                let is_placeholder = submodule_worktree_is_unpopulated_placeholder(&sub_dir);
2003                if options.error_on_broken_gitlinks
2004                    && !is_placeholder
2005                    && submodule_embedded_git_dir(&sub_dir).is_some()
2006                {
2007                    return Err(Error::ConfigError(format!(
2008                        "could not read submodule HEAD for '{path_str_ref}'"
2009                    )));
2010                }
2011                is_placeholder
2012            };
2013            if simplify_gitlinks {
2014                if !ref_matches {
2015                    let path_owned = path_str_ref.to_owned();
2016                    let new_oid = sub_head_oid.unwrap_or_else(zero_oid);
2017                    result.push(DiffEntry {
2018                        status: DiffStatus::Modified,
2019                        old_path: Some(path_owned.clone()),
2020                        new_path: Some(path_owned),
2021                        old_mode: format_mode(ie.mode),
2022                        new_mode: format_mode(ie.mode),
2023                        old_oid: ie.oid,
2024                        new_oid,
2025                        score: None,
2026                    });
2027                }
2028                continue;
2029            }
2030            // A populated submodule whose HEAD points at a commit object that is missing from its
2031            // own object store is corrupt. Git's `is_submodule_modified` shells `git status` into
2032            // the submodule, which fails, and Git aborts the surrounding status/diff. Mirror that:
2033            // a broken submodule is a hard error rather than a silently-clean gitlink (t5526 #38).
2034            if sub_head_oid.is_some() && submodule_head_object_broken(&sub_dir) {
2035                return Err(Error::ConfigError(format!(
2036                    "'git status --porcelain=2' failed in submodule {path_str_ref}"
2037                )));
2038            }
2039            let mut flags = submodule_porcelain_flags(work_tree, path_str_ref, ie.oid);
2040            if ignore_submodule_untracked {
2041                flags.untracked = false;
2042            }
2043            let inner_dirty = flags.modified || flags.untracked;
2044            if !ref_matches || inner_dirty {
2045                let path_owned = path_str_ref.to_owned();
2046                let new_oid = if !ref_matches {
2047                    sub_head_oid.unwrap_or_else(zero_oid)
2048                } else {
2049                    zero_oid()
2050                };
2051                result.push(DiffEntry {
2052                    status: DiffStatus::Modified,
2053                    old_path: Some(path_owned.clone()),
2054                    new_path: Some(path_owned),
2055                    old_mode: format_mode(ie.mode),
2056                    new_mode: format_mode(ie.mode),
2057                    old_oid: ie.oid,
2058                    new_oid,
2059                    score: None,
2060                });
2061            }
2062            continue;
2063        }
2064
2065        let file_path = work_tree.join(path_str_ref);
2066
2067        if is_intent_to_add {
2068            match fs::symlink_metadata(&file_path) {
2069                Ok(meta) => {
2070                    let file_attrs = crlf::get_file_attrs(&attrs, path_str_ref, false, &config);
2071                    let worktree_oid = hash_worktree_file(
2072                        odb,
2073                        &file_path,
2074                        &meta,
2075                        &conv,
2076                        &file_attrs,
2077                        path_str_ref,
2078                        None,
2079                    )?;
2080                    let worktree_mode = mode_from_metadata(&meta);
2081                    result.push(DiffEntry {
2082                        status: DiffStatus::Added,
2083                        old_path: None,
2084                        new_path: Some(path_str_ref.to_owned()),
2085                        old_mode: "000000".to_owned(),
2086                        new_mode: format_mode(worktree_mode),
2087                        // `ita_invisible_in_index`: null OID on the index side for patch output
2088                        // (`index 0000000..`, t2203); index entry still stores the empty blob.
2089                        old_oid: zero_oid(),
2090                        new_oid: worktree_oid,
2091                        score: None,
2092                    });
2093                }
2094                Err(e)
2095                    if e.kind() == std::io::ErrorKind::NotFound
2096                        || e.raw_os_error() == Some(20) /* ENOTDIR */ =>
2097                {
2098                    result.push(DiffEntry {
2099                        status: DiffStatus::Deleted,
2100                        old_path: Some(path_str_ref.to_owned()),
2101                        new_path: None,
2102                        old_mode: format_mode(ie.mode),
2103                        new_mode: "000000".to_owned(),
2104                        old_oid: ie.oid,
2105                        new_oid: zero_oid(),
2106                        score: None,
2107                    });
2108                }
2109                Err(e) => return Err(Error::Io(e)),
2110            }
2111            continue;
2112        }
2113
2114        // If any parent component of the path is a symlink, the file is effectively
2115        // deleted from the working tree (a symlink replaced a directory).
2116        if has_symlink_in_path(work_tree, path_str_ref) {
2117            result.push(DiffEntry {
2118                status: DiffStatus::Deleted,
2119                old_path: Some(path_str_ref.to_owned()),
2120                new_path: None,
2121                old_mode: format_mode(ie.mode),
2122                new_mode: "000000".to_owned(),
2123                old_oid: ie.oid,
2124                new_oid: zero_oid(),
2125                score: None,
2126            });
2127            continue;
2128        }
2129
2130        match fs::symlink_metadata(&file_path) {
2131            Ok(meta) if meta.is_dir() => {
2132                // A directory exists where the index expects a file. A populated submodule
2133                // checkout (`.git` present) is a blob→gitlink typechange with the submodule HEAD on
2134                // the new side (raw output re-zeros it); otherwise the indexed file is effectively
2135                // deleted. See t4041/t4060 #13.
2136                if file_path.join(".git").exists() {
2137                    let head = read_submodule_head_oid(&file_path).unwrap_or_else(zero_oid);
2138                    let path_owned = path_str_ref.to_owned();
2139                    result.push(DiffEntry {
2140                        status: DiffStatus::TypeChanged,
2141                        old_path: Some(path_owned.clone()),
2142                        new_path: Some(path_owned),
2143                        old_mode: format_mode(ie.mode),
2144                        new_mode: format_mode(0o160000),
2145                        old_oid: ie.oid,
2146                        new_oid: head,
2147                        score: None,
2148                    });
2149                    continue;
2150                }
2151                result.push(DiffEntry {
2152                    status: DiffStatus::Deleted,
2153                    old_path: Some(path_str_ref.to_owned()),
2154                    new_path: None,
2155                    old_mode: format_mode(ie.mode),
2156                    new_mode: String::new(),
2157                    old_oid: ie.oid,
2158                    new_oid: zero_oid(),
2159                    score: None,
2160                });
2161            }
2162            Ok(meta) => {
2163                let worktree_mode = mode_from_metadata(&meta);
2164                let stat_same = stat_matches(ie, &meta);
2165                // Mode-only change: stat still matches the index entry but executable bit differs.
2166                if stat_same && worktree_mode != ie.mode {
2167                    let path_owned = path_str_ref.to_owned();
2168                    result.push(DiffEntry {
2169                        status: DiffStatus::Modified,
2170                        old_path: Some(path_owned.clone()),
2171                        new_path: Some(path_owned),
2172                        old_mode: format_mode(ie.mode),
2173                        new_mode: format_mode(worktree_mode),
2174                        old_oid: ie.oid,
2175                        new_oid: ie.oid,
2176                        score: None,
2177                    });
2178                    continue;
2179                }
2180
2181                // Fast path: unchanged stat + unchanged mode + non-racy timestamp means this entry
2182                // is clean without re-hashing blob data.
2183                if stat_same && worktree_mode == ie.mode && !entry_is_racy(ie, options.index_mtime) {
2184                    continue;
2185                }
2186
2187                // Hash the worktree blob for uncertain/racy entries.
2188                let file_attrs = crlf::get_file_attrs(&attrs, path_str_ref, false, &config);
2189                let worktree_oid = hash_worktree_file(
2190                    odb,
2191                    &file_path,
2192                    &meta,
2193                    &conv,
2194                    &file_attrs,
2195                    path_str_ref,
2196                    Some(ie),
2197                )?;
2198
2199                // If clean conversion disagrees with the index but raw bytes match the
2200                // blob (e.g. mixed line endings committed with autocrlf off), Git reports
2201                // no diff (t0020: touch + git diff --exit-code).
2202                let mut eff_oid = worktree_oid;
2203                if eff_oid != ie.oid {
2204                    if let Ok(raw) = fs::read(&file_path) {
2205                        let raw_oid = Odb::hash_object_data(ObjectKind::Blob, &raw);
2206                        if raw_oid == ie.oid {
2207                            eff_oid = ie.oid;
2208                        }
2209                    }
2210                }
2211
2212                if eff_oid != ie.oid || worktree_mode != ie.mode {
2213                    let path_owned = path_str_ref.to_owned();
2214                    result.push(DiffEntry {
2215                        status: DiffStatus::Modified,
2216                        old_path: Some(path_owned.clone()),
2217                        new_path: Some(path_owned),
2218                        old_mode: format_mode(ie.mode),
2219                        new_mode: format_mode(worktree_mode),
2220                        old_oid: ie.oid,
2221                        new_oid: eff_oid,
2222                    score: None,
2223                    });
2224                }
2225            }
2226            Err(e) if e.kind() == std::io::ErrorKind::NotFound
2227                || e.raw_os_error() == Some(20) /* ENOTDIR */ => {
2228                // File deleted from working tree (or parent replaced by a file)
2229                result.push(DiffEntry {
2230                    status: DiffStatus::Deleted,
2231                    old_path: Some(path_str_ref.to_owned()),
2232                    new_path: None,
2233                    old_mode: format_mode(ie.mode),
2234                    new_mode: "000000".to_owned(),
2235                    old_oid: ie.oid,
2236                    new_oid: zero_oid(),
2237                    score: None,
2238                });
2239            }
2240            Err(e) => return Err(Error::Io(e)),
2241        }
2242    }
2243
2244    for (path, (_, base_entry)) in unmerged_base {
2245        let file_path = work_tree.join(&path);
2246        let wt_meta = match fs::symlink_metadata(&file_path) {
2247            Ok(meta) => Some(meta),
2248            Err(e)
2249                if e.kind() == std::io::ErrorKind::NotFound
2250                    || e.raw_os_error() == Some(20) /* ENOTDIR */ =>
2251            {
2252                None
2253            }
2254            Err(e) => return Err(Error::Io(e)),
2255        };
2256
2257        let new_mode = wt_meta.as_ref().map_or_else(
2258            || "000000".to_owned(),
2259            |meta| format_mode(mode_from_metadata(meta)),
2260        );
2261        result.push(DiffEntry {
2262            status: DiffStatus::Unmerged,
2263            old_path: Some(path.clone()),
2264            new_path: Some(path.clone()),
2265            old_mode: "000000".to_owned(),
2266            new_mode,
2267            old_oid: zero_oid(),
2268            new_oid: zero_oid(),
2269            score: None,
2270        });
2271
2272        if let Some(meta) = wt_meta {
2273            let file_attrs = crlf::get_file_attrs(&attrs, &path, false, &config);
2274            let wt_oid = hash_worktree_file(
2275                odb,
2276                &file_path,
2277                &meta,
2278                &conv,
2279                &file_attrs,
2280                &path,
2281                Some(base_entry),
2282            )?;
2283            let wt_mode = mode_from_metadata(&meta);
2284            if wt_oid != base_entry.oid || wt_mode != base_entry.mode {
2285                result.push(DiffEntry {
2286                    status: DiffStatus::Modified,
2287                    old_path: Some(path.clone()),
2288                    new_path: Some(path),
2289                    old_mode: format_mode(base_entry.mode),
2290                    new_mode: format_mode(wt_mode),
2291                    old_oid: base_entry.oid,
2292                    new_oid: wt_oid,
2293                    score: None,
2294                });
2295            }
2296        }
2297    }
2298
2299    Ok(result)
2300}
2301
2302fn entry_is_racy(ie: &IndexEntry, index_mtime: Option<(u32, u32)>) -> bool {
2303    let Some((index_mtime_sec, index_mtime_nsec)) = index_mtime else {
2304        return false;
2305    };
2306    if index_mtime_sec == 0 {
2307        return false;
2308    }
2309    index_mtime_sec < ie.mtime_sec
2310        || (index_mtime_sec == ie.mtime_sec && index_mtime_nsec <= ie.mtime_nsec)
2311}
2312
2313/// Quick stat check: does the index entry's cached stat data match the file?
2314/// Returns true when the file at `ie`'s path differs from the index entry (mode or blob).
2315///
2316/// Used by commands such as `git mv` to detect "dirty" paths under sparse checkout.
2317/// Symlinks and submodules are compared in a Git-compatible way.
2318///
2319/// `ignore_submodule_untracked` mirrors [`diff_index_to_worktree`]'s same flag for gitlinks.
2320pub fn worktree_differs_from_index_entry(
2321    odb: &Odb,
2322    work_tree: &Path,
2323    ie: &IndexEntry,
2324    ignore_submodule_untracked: bool,
2325) -> Result<bool> {
2326    use crate::config::ConfigSet;
2327    use crate::crlf;
2328
2329    let path_str_ref = std::str::from_utf8(&ie.path).unwrap_or("");
2330    let file_path = work_tree.join(path_str_ref);
2331
2332    if ie.mode == 0o160000 {
2333        let sub_head_oid = read_submodule_head(&file_path);
2334        let ref_matches = match sub_head_oid {
2335            Some(oid) => oid == ie.oid,
2336            None => submodule_worktree_is_unpopulated_placeholder(&file_path),
2337        };
2338        let mut flags = submodule_porcelain_flags(work_tree, path_str_ref, ie.oid);
2339        if ignore_submodule_untracked {
2340            flags.untracked = false;
2341        }
2342        return Ok(!ref_matches || flags.modified || flags.untracked);
2343    }
2344
2345    let meta = match fs::symlink_metadata(&file_path) {
2346        Ok(m) => m,
2347        Err(e)
2348            if e.kind() == std::io::ErrorKind::NotFound
2349                || e.raw_os_error() == Some(20) /* ENOTDIR */ =>
2350        {
2351            return Ok(true);
2352        }
2353        Err(e) => return Err(Error::Io(e)),
2354    };
2355
2356    if meta.is_dir() {
2357        return Ok(true);
2358    }
2359
2360    let worktree_mode = mode_from_metadata(&meta);
2361    if worktree_mode != ie.mode {
2362        return Ok(true);
2363    }
2364
2365    let git_dir = work_tree.join(".git");
2366    let config = ConfigSet::load(Some(&git_dir), true).unwrap_or_else(|_| ConfigSet::new());
2367    let conv = crlf::ConversionConfig::from_config(&config);
2368    let attrs = crlf::load_gitattributes(work_tree);
2369    let file_attrs = crlf::get_file_attrs(&attrs, path_str_ref, false, &config);
2370    let worktree_oid = hash_worktree_file(
2371        odb,
2372        &file_path,
2373        &meta,
2374        &conv,
2375        &file_attrs,
2376        path_str_ref,
2377        Some(ie),
2378    )?;
2379
2380    let mut eff_oid = worktree_oid;
2381    if eff_oid != ie.oid {
2382        if let Ok(raw) = fs::read(&file_path) {
2383            let raw_oid = Odb::hash_object_data(ObjectKind::Blob, &raw);
2384            if raw_oid == ie.oid {
2385                eff_oid = ie.oid;
2386            }
2387        }
2388    }
2389
2390    Ok(eff_oid != ie.oid)
2391}
2392
2393pub fn stat_matches(ie: &IndexEntry, meta: &fs::Metadata) -> bool {
2394    // Compare size
2395    if meta.len() as u32 != ie.size {
2396        return false;
2397    }
2398    #[cfg(unix)]
2399    {
2400        use std::os::unix::fs::MetadataExt;
2401        // Compare mtime (seconds + nanoseconds)
2402        if meta.mtime() as u32 != ie.mtime_sec {
2403            return false;
2404        }
2405        if meta.mtime_nsec() as u32 != ie.mtime_nsec {
2406            return false;
2407        }
2408        // Compare ctime (seconds + nanoseconds)
2409        if meta.ctime() as u32 != ie.ctime_sec {
2410            return false;
2411        }
2412        if meta.ctime_nsec() as u32 != ie.ctime_nsec {
2413            return false;
2414        }
2415        // Compare inode and device
2416        if meta.ino() as u32 != ie.ino {
2417            return false;
2418        }
2419        if meta.dev() as u32 != ie.dev {
2420            return false;
2421        }
2422    }
2423    #[cfg(not(unix))]
2424    {
2425        use std::time::UNIX_EPOCH;
2426        if let Ok(mtime) = meta.modified() {
2427            if let Ok(dur) = mtime.duration_since(UNIX_EPOCH) {
2428                if dur.as_secs() as u32 != ie.mtime_sec {
2429                    return false;
2430                }
2431                if dur.subsec_nanos() != ie.mtime_nsec {
2432                    return false;
2433                }
2434            }
2435        }
2436    }
2437    true
2438}
2439
2440/// Refresh cached stat data for stage-0 file/symlink entries whose worktree content still matches
2441/// the recorded OID but whose on-disk stat went stale.
2442///
2443/// This mirrors Git's `refresh_index` / `refresh_cache_ent`: an entry is only marked clean (stat
2444/// adopted from the worktree) after its content is re-verified against the index OID. A genuinely
2445/// modified entry keeps its stale stat so `diff-files` / `status` continue to report it. Operations
2446/// that rewrite the worktree (`status`, `reset --mixed`, `stash`) call this before writing the
2447/// index so a subsequent `git diff-files` sees refreshed entries as clean.
2448///
2449/// Gitlinks, sparse (`skip_worktree`), `assume_unchanged` and intent-to-add entries are skipped.
2450/// The blob comparison is a raw-content hash, so a CRLF-smudged match is conservatively missed
2451/// (the entry simply stays stat-dirty and is re-hashed next time — never the reverse).
2452///
2453/// `index_mtime` is the on-disk index file's `(mtime_sec, mtime_nsec)` (see
2454/// `entry_is_racy` / Git `is_racy_timestamp`); pass `None` when unknown — racy detection is
2455/// then skipped, which is conservative for tree-built indexes whose zeroed stat never matches.
2456///
2457/// Returns `true` when at least one entry was refreshed or invalidated, so callers can write
2458/// the index opportunistically (Git only persists a refresh that changed something).
2459pub fn refresh_index_stat_content_verified(
2460    index: &mut Index,
2461    work_tree: &Path,
2462    index_mtime: Option<(u32, u32)>,
2463) -> bool {
2464    use crate::index::{MODE_EXECUTABLE, MODE_REGULAR, MODE_SYMLINK};
2465    let mut changed = false;
2466    for ie in &mut index.entries {
2467        if ie.stage() != 0 || ie.skip_worktree() || ie.assume_unchanged() || ie.intent_to_add() {
2468            continue;
2469        }
2470        if ie.mode != MODE_REGULAR && ie.mode != MODE_EXECUTABLE && ie.mode != MODE_SYMLINK {
2471            continue;
2472        }
2473        let Ok(path) = std::str::from_utf8(&ie.path) else {
2474            continue;
2475        };
2476        let abs = work_tree.join(path);
2477        let Ok(meta) = fs::symlink_metadata(&abs) else {
2478            continue;
2479        };
2480        if stat_matches(ie, &meta) {
2481            // Git `ie_match_stat`: a clean stat is trusted without reading the file unless the
2482            // entry is racy (written within the index's own mtime). Only then re-verify content;
2483            // stat can be refreshed from the work tree without matching the indexed blob (e.g.
2484            // after merge stat refresh while local edits remain) — invalidate so diff/status
2485            // re-hash.
2486            if entry_is_racy(ie, index_mtime)
2487                && !worktree_content_matches_index_oid(ie, &abs, &meta)
2488            {
2489                invalidate_index_stat_cache(ie);
2490                changed = true;
2491            }
2492            continue;
2493        }
2494        if !worktree_content_matches_index_oid(ie, &abs, &meta) {
2495            continue;
2496        }
2497        let refreshed = crate::index::entry_from_metadata(&meta, &ie.path, ie.oid, ie.mode);
2498        ie.ctime_sec = refreshed.ctime_sec;
2499        ie.ctime_nsec = refreshed.ctime_nsec;
2500        ie.mtime_sec = refreshed.mtime_sec;
2501        ie.mtime_nsec = refreshed.mtime_nsec;
2502        ie.dev = refreshed.dev;
2503        ie.ino = refreshed.ino;
2504        ie.uid = refreshed.uid;
2505        ie.gid = refreshed.gid;
2506        ie.size = refreshed.size;
2507        changed = true;
2508    }
2509    changed
2510}
2511
2512/// Whether the work tree blob at `abs` matches the index entry OID (raw bytes, no CRLF smudge).
2513fn worktree_content_matches_index_oid(ie: &IndexEntry, abs: &Path, meta: &fs::Metadata) -> bool {
2514    use crate::index::{MODE_EXECUTABLE, MODE_REGULAR, MODE_SYMLINK};
2515    if ie.mode == MODE_SYMLINK {
2516        if !meta.file_type().is_symlink() {
2517            return false;
2518        }
2519        use std::os::unix::ffi::OsStrExt as _;
2520        fs::read_link(abs)
2521            .map(|t| Odb::hash_object_data(ObjectKind::Blob, t.as_os_str().as_bytes()) == ie.oid)
2522            .unwrap_or(false)
2523    } else if ie.mode == MODE_REGULAR || ie.mode == MODE_EXECUTABLE {
2524        if !meta.file_type().is_file() {
2525            return false;
2526        }
2527        fs::read(abs)
2528            .map(|bytes| Odb::hash_object_data(ObjectKind::Blob, &bytes) == ie.oid)
2529            .unwrap_or(false)
2530    } else {
2531        false
2532    }
2533}
2534
2535/// Clear cached stat fields so the next diff/status pass re-reads the work tree.
2536fn invalidate_index_stat_cache(ie: &mut IndexEntry) {
2537    ie.ctime_sec = 0;
2538    ie.ctime_nsec = 0;
2539    ie.mtime_sec = 0;
2540    ie.mtime_nsec = 0;
2541    ie.dev = 0;
2542    ie.ino = 0;
2543    ie.size = 0;
2544}
2545
2546/// Hash a working tree file as a blob to get its OID.
2547/// Check if any parent component of `rel_path` (relative to `work_tree`) is a symlink.
2548fn has_symlink_in_path(work_tree: &Path, rel_path: &str) -> bool {
2549    let mut check = work_tree.to_path_buf();
2550    let components: Vec<&str> = rel_path.split('/').collect();
2551    // Check all components except the last one (which is the file itself)
2552    for component in &components[..components.len().saturating_sub(1)] {
2553        check.push(component);
2554        match fs::symlink_metadata(&check) {
2555            Ok(meta) if meta.file_type().is_symlink() => return true,
2556            _ => {}
2557        }
2558    }
2559    false
2560}
2561
2562pub fn hash_worktree_file(
2563    odb: &Odb,
2564    path: &Path,
2565    meta: &fs::Metadata,
2566    conv: &crate::crlf::ConversionConfig,
2567    file_attrs: &crate::crlf::FileAttrs,
2568    rel_path: &str,
2569    index_entry: Option<&IndexEntry>,
2570) -> Result<ObjectId> {
2571    let prior_blob: Option<Vec<u8>> = index_entry
2572        .filter(|e| e.oid != zero_oid())
2573        .and_then(|e| odb.read(&e.oid).ok().map(|o| o.data));
2574    let data = if meta.file_type().is_symlink() {
2575        // For symlinks, hash the target path
2576        let target = fs::read_link(path)?;
2577        target.to_string_lossy().into_owned().into_bytes()
2578    } else if meta.is_dir() {
2579        // `read()` on a directory fails with EISDIR; unmerged paths may leave an empty
2580        // placeholder directory (e.g. t4027 combined submodule conflict).
2581        Vec::new()
2582    } else {
2583        let raw = fs::read(path)?;
2584        // Apply clean conversion (CRLF→LF) so hash matches index blob.
2585        // Do not run safecrlf here: diff/commit use this for hashing and must not print warnings.
2586        let opts = crate::crlf::ConvertToGitOpts {
2587            index_blob: prior_blob.as_deref(),
2588            renormalize: false,
2589            check_safecrlf: false,
2590        };
2591        crate::crlf::convert_to_git_with_opts(&raw, rel_path, conv, file_attrs, opts).unwrap_or(raw)
2592    };
2593
2594    Ok(Odb::hash_object_data(ObjectKind::Blob, &data))
2595}
2596
2597/// Derive a Git file mode from filesystem metadata.
2598pub fn mode_from_metadata(meta: &fs::Metadata) -> u32 {
2599    if meta.file_type().is_symlink() {
2600        0o120000
2601    } else {
2602        #[cfg(unix)]
2603        {
2604            if meta.mode() & 0o111 != 0 {
2605                return 0o100755;
2606            }
2607        }
2608        0o100644
2609    }
2610}
2611
2612/// Compare a tree against the working tree.
2613///
2614/// Shows changes from `tree_oid` to the current working directory state.
2615/// Files tracked in the index but not in the tree are shown as Added.
2616/// Files in the tree but missing from the working tree are shown as Deleted.
2617///
2618/// # Parameters
2619///
2620/// - `odb` — object database.
2621/// - `tree_oid` — the tree to compare against (`None` for empty tree).
2622/// - `work_tree` — path to the working tree root.
2623/// - `index` — current index (used to discover new tracked files not in tree).
2624///
2625/// # Errors
2626///
2627/// Returns errors from ODB reads or I/O.
2628pub fn diff_tree_to_worktree(
2629    odb: &Odb,
2630    tree_oid: Option<&ObjectId>,
2631    work_tree: &Path,
2632    index: &Index,
2633) -> Result<Vec<DiffEntry>> {
2634    use crate::config::ConfigSet;
2635    use crate::crlf;
2636
2637    let git_dir = work_tree.join(".git");
2638    let config = ConfigSet::load(Some(&git_dir), true).unwrap_or_else(|_| ConfigSet::new());
2639    let conv = crlf::ConversionConfig::from_config(&config);
2640    let attrs = crlf::load_gitattributes(work_tree);
2641
2642    // Flatten the tree into a BTreeMap keyed by path
2643    let tree_flat = match tree_oid {
2644        Some(oid) => flatten_tree(odb, oid, "")?,
2645        None => Vec::new(),
2646    };
2647    let tree_map: std::collections::BTreeMap<String, &FlatEntry> =
2648        tree_flat.iter().map(|e| (e.path.clone(), e)).collect();
2649
2650    // Build index lookup: path → &IndexEntry (stage 0 only)
2651    let mut index_entries: std::collections::BTreeMap<&[u8], &IndexEntry> =
2652        std::collections::BTreeMap::new();
2653    let mut index_paths: std::collections::BTreeSet<String> = std::collections::BTreeSet::new();
2654    let mut stage0_paths: std::collections::BTreeSet<Vec<u8>> = std::collections::BTreeSet::new();
2655    for ie in &index.entries {
2656        if ie.stage() != 0 {
2657            continue;
2658        }
2659        let path = String::from_utf8_lossy(&ie.path).to_string();
2660        index_entries.insert(&ie.path, ie);
2661        index_paths.insert(path);
2662        stage0_paths.insert(ie.path.clone());
2663    }
2664
2665    // Paths with only unmerged stages (1–3) and no stage 0 — `git diff <rev>` must still list them
2666    // so combined `diff --cc` conflict hunks can be emitted (`t4108-apply-threeway`).
2667    let mut unmerged_only_paths: std::collections::BTreeSet<String> =
2668        std::collections::BTreeSet::new();
2669    for ie in &index.entries {
2670        if !(1..=3).contains(&ie.stage()) {
2671            continue;
2672        }
2673        if stage0_paths.contains(&ie.path) {
2674            continue;
2675        }
2676        unmerged_only_paths.insert(String::from_utf8_lossy(&ie.path).into_owned());
2677    }
2678
2679    // Union of tree paths + index paths
2680    let mut all_paths: std::collections::BTreeSet<String> = std::collections::BTreeSet::new();
2681    all_paths.extend(tree_map.keys().cloned());
2682    all_paths.extend(index_paths.iter().cloned());
2683    all_paths.extend(unmerged_only_paths.iter().cloned());
2684
2685    let mut result = Vec::new();
2686
2687    for path in &all_paths {
2688        if index_entries
2689            .get(path.as_bytes())
2690            .is_some_and(|ie| ie.skip_worktree())
2691        {
2692            // Sparse checkout: `git diff <rev>` does not report tree↔worktree drift for
2693            // skip-worktree paths (they are outside the sparse cone). Matches t7012 stash flow.
2694            continue;
2695        }
2696
2697        let tree_entry = tree_map.get(path.as_str());
2698
2699        // Gitlink entries (submodules) — compare HEAD commit, not file content.
2700        let is_gitlink = tree_entry.is_some_and(|te| te.mode == 0o160000)
2701            || index_entries
2702                .get(path.as_bytes())
2703                .is_some_and(|ie| ie.mode == 0o160000);
2704        if is_gitlink {
2705            let sub_dir = work_tree.join(path);
2706            let index_gitlink_oid = index_entries
2707                .get(path.as_bytes())
2708                .filter(|ie| ie.mode == 0o160000)
2709                .map(|ie| ie.oid);
2710            match (tree_entry, index_gitlink_oid) {
2711                (Some(te), _) => {
2712                    let sub_head = read_submodule_head_oid(&sub_dir);
2713                    // A gitlink whose worktree directory no longer exists is a deleted submodule:
2714                    // Git's `diff-lib.c` reports it as a deletion (status `D`, new mode 000000) and
2715                    // `--submodule` renders the `(submodule deleted)` summary (t4041 #46).
2716                    if sub_head.is_none() && !sub_dir.exists() {
2717                        result.push(DiffEntry {
2718                            status: DiffStatus::Deleted,
2719                            old_path: Some(path.clone()),
2720                            new_path: Some(path.clone()),
2721                            old_mode: format_mode(te.mode),
2722                            new_mode: "000000".to_string(),
2723                            old_oid: te.oid,
2724                            new_oid: zero_oid(),
2725                            score: None,
2726                        });
2727                        continue;
2728                    }
2729                    let index_matches_tree = index_gitlink_oid.is_some_and(|oid| oid == te.oid);
2730                    let head_differs = sub_head.as_ref() != Some(&te.oid);
2731                    let dirty_while_aligned = index_matches_tree
2732                        && !head_differs
2733                        && submodule_has_dirty_worktree_for_super_diff(work_tree, path, &te.oid);
2734                    if head_differs || dirty_while_aligned {
2735                        // Raw `git diff <tree>` lines use a null OID on the worktree side when the
2736                        // checked-out submodule HEAD differs from the tree's gitlink; patch output
2737                        // still resolves the real commit from the submodule directory.
2738                        let new_oid = if head_differs { zero_oid() } else { te.oid };
2739                        result.push(DiffEntry {
2740                            status: DiffStatus::Modified,
2741                            old_path: Some(path.clone()),
2742                            new_path: Some(path.clone()),
2743                            old_mode: format_mode(te.mode),
2744                            new_mode: format_mode(te.mode),
2745                            old_oid: te.oid,
2746                            new_oid,
2747                            score: None,
2748                        });
2749                    }
2750                }
2751                (None, Some(idx_oid)) => {
2752                    // Gitlink staged in the index but absent from the tree: a new submodule. Git
2753                    // reports it as an addition (status `A`) with the index gitlink on the new side,
2754                    // so `--submodule` renders `(new submodule)` (t4041 #46). The patch renderer
2755                    // resolves the real commit from the index OID / submodule HEAD.
2756                    let new_oid = read_submodule_head_oid(&sub_dir).unwrap_or(idx_oid);
2757                    result.push(DiffEntry {
2758                        status: DiffStatus::Added,
2759                        old_path: Some(path.clone()),
2760                        new_path: Some(path.clone()),
2761                        old_mode: "000000".to_string(),
2762                        new_mode: format_mode(0o160000),
2763                        old_oid: zero_oid(),
2764                        new_oid,
2765                        score: None,
2766                    });
2767                }
2768                (None, None) => {}
2769            }
2770            continue;
2771        }
2772
2773        let file_path = work_tree.join(path);
2774
2775        let wt_meta = match fs::symlink_metadata(&file_path) {
2776            Ok(m) => Some(m),
2777            Err(e) if e.kind() == std::io::ErrorKind::NotFound => None,
2778            Err(e) => return Err(Error::Io(e)),
2779        };
2780
2781        if unmerged_only_paths.contains(path) {
2782            if let (Some(te), Some(meta)) = (tree_entry, wt_meta.as_ref()) {
2783                let file_attrs = crlf::get_file_attrs(&attrs, path, false, &config);
2784                let wt_oid =
2785                    hash_worktree_file(odb, &file_path, meta, &conv, &file_attrs, path, None)?;
2786                let wt_mode = mode_from_metadata(meta);
2787                if wt_oid != te.oid || wt_mode != te.mode {
2788                    result.push(DiffEntry {
2789                        status: DiffStatus::Modified,
2790                        old_path: Some(path.clone()),
2791                        new_path: Some(path.clone()),
2792                        old_mode: format_mode(te.mode),
2793                        new_mode: format_mode(wt_mode),
2794                        old_oid: te.oid,
2795                        new_oid: wt_oid,
2796                        score: None,
2797                    });
2798                }
2799            }
2800            continue;
2801        }
2802
2803        match (tree_entry, wt_meta) {
2804            (Some(te), Some(ref meta)) => {
2805                let wt_mode = mode_from_metadata(meta);
2806                let Some(ie) = index_entries.get(path.as_bytes()) else {
2807                    continue;
2808                };
2809
2810                let index_matches_tree = ie.oid == te.oid && ie.mode == te.mode;
2811
2812                // Fully clean: index matches `HEAD`, worktree matches index, stat cache fresh.
2813                if index_matches_tree && wt_mode == te.mode && stat_matches(ie, meta) {
2814                    continue;
2815                }
2816
2817                let file_attrs = crlf::get_file_attrs(&attrs, path, false, &config);
2818                let idx_ent = index_entries.get(path.as_bytes()).copied();
2819
2820                // Staged mode (same blob as `HEAD`, different mode recorded in the index).
2821                if ie.oid == te.oid && ie.mode != te.mode {
2822                    result.push(DiffEntry {
2823                        status: DiffStatus::Modified,
2824                        old_path: Some(path.clone()),
2825                        new_path: Some(path.clone()),
2826                        old_mode: format_mode(te.mode),
2827                        new_mode: format_mode(ie.mode),
2828                        old_oid: te.oid,
2829                        new_oid: te.oid,
2830                        score: None,
2831                    });
2832                    continue;
2833                }
2834
2835                // Index still matches `HEAD`: only unstaged worktree drift (content and/or
2836                // worktree-only exec bit when `update-index` was not run — t4049 harness).
2837                if index_matches_tree {
2838                    let wt_oid = hash_worktree_file(
2839                        odb,
2840                        &file_path,
2841                        meta,
2842                        &conv,
2843                        &file_attrs,
2844                        path,
2845                        idx_ent,
2846                    )?;
2847                    let mut eff_oid = wt_oid;
2848                    if eff_oid != te.oid {
2849                        if let Ok(raw) = fs::read(&file_path) {
2850                            let raw_oid = Odb::hash_object_data(ObjectKind::Blob, &raw);
2851                            if raw_oid == te.oid {
2852                                eff_oid = te.oid;
2853                            }
2854                        }
2855                    }
2856                    if eff_oid != te.oid {
2857                        result.push(DiffEntry {
2858                            status: DiffStatus::Modified,
2859                            old_path: Some(path.clone()),
2860                            new_path: Some(path.clone()),
2861                            old_mode: format_mode(te.mode),
2862                            new_mode: format_mode(wt_mode),
2863                            old_oid: te.oid,
2864                            new_oid: eff_oid,
2865                            score: None,
2866                        });
2867                    } else if wt_mode != te.mode {
2868                        result.push(DiffEntry {
2869                            status: DiffStatus::Modified,
2870                            old_path: Some(path.clone()),
2871                            new_path: Some(path.clone()),
2872                            old_mode: format_mode(te.mode),
2873                            new_mode: format_mode(wt_mode),
2874                            old_oid: te.oid,
2875                            new_oid: te.oid,
2876                            score: None,
2877                        });
2878                    }
2879                    continue;
2880                }
2881
2882                // Staged content (and possibly mode): `git diff <rev>` is tree vs working tree.
2883                let wt_oid =
2884                    hash_worktree_file(odb, &file_path, meta, &conv, &file_attrs, path, idx_ent)?;
2885                let mut eff_oid = wt_oid;
2886                if eff_oid != te.oid {
2887                    if let Ok(raw) = fs::read(&file_path) {
2888                        let raw_oid = Odb::hash_object_data(ObjectKind::Blob, &raw);
2889                        if raw_oid == te.oid {
2890                            eff_oid = te.oid;
2891                        }
2892                    }
2893                }
2894                if eff_oid != te.oid || wt_mode != te.mode {
2895                    result.push(DiffEntry {
2896                        status: DiffStatus::Modified,
2897                        old_path: Some(path.clone()),
2898                        new_path: Some(path.clone()),
2899                        old_mode: format_mode(te.mode),
2900                        new_mode: format_mode(wt_mode),
2901                        old_oid: te.oid,
2902                        new_oid: eff_oid,
2903                        score: None,
2904                    });
2905                }
2906            }
2907            (Some(te), None) => {
2908                // In tree but missing from worktree
2909                result.push(DiffEntry {
2910                    status: DiffStatus::Deleted,
2911                    old_path: Some(path.clone()),
2912                    new_path: None,
2913                    old_mode: format_mode(te.mode),
2914                    new_mode: "000000".to_owned(),
2915                    old_oid: te.oid,
2916                    new_oid: zero_oid(),
2917                    score: None,
2918                });
2919            }
2920            (None, Some(ref meta)) => {
2921                // In index but not in tree, and exists in worktree
2922                let file_attrs = crlf::get_file_attrs(&attrs, path, false, &config);
2923                let wt_oid = hash_worktree_file(
2924                    odb,
2925                    &file_path,
2926                    meta,
2927                    &conv,
2928                    &file_attrs,
2929                    path,
2930                    index_entries.get(path.as_bytes()).copied(),
2931                )?;
2932                let wt_mode = mode_from_metadata(meta);
2933                result.push(DiffEntry {
2934                    status: DiffStatus::Added,
2935                    old_path: None,
2936                    new_path: Some(path.clone()),
2937                    old_mode: "000000".to_owned(),
2938                    new_mode: format_mode(wt_mode),
2939                    old_oid: zero_oid(),
2940                    new_oid: wt_oid,
2941                    score: None,
2942                });
2943            }
2944            (None, None) => {
2945                // Tracked in index but neither in tree nor worktree — skip
2946            }
2947        }
2948    }
2949
2950    result.sort_by(|a, b| a.path().cmp(b.path()));
2951    Ok(result)
2952}
2953
2954// ── Rename detection ────────────────────────────────────────────────
2955
2956fn read_added_entry_bytes(
2957    odb: &Odb,
2958    entry: &DiffEntry,
2959    work_root: Option<&Path>,
2960) -> Option<Vec<u8>> {
2961    if entry.new_oid != zero_oid() {
2962        return odb.read(&entry.new_oid).ok().map(|obj| obj.data);
2963    }
2964    let path = entry.new_path.as_deref()?;
2965    let root = work_root?;
2966    fs::read(root.join(path)).ok()
2967}
2968
2969fn modified_as_copy_from_sources(
2970    odb: &Odb,
2971    work_root: Option<&Path>,
2972    e: &DiffEntry,
2973    threshold: u32,
2974    sources: &[(String, ObjectId, bool)],
2975    source_contents: &[Option<Vec<u8>>],
2976    source_tree_entries: &[(String, String, ObjectId)],
2977) -> Option<DiffEntry> {
2978    fn regular_file_mode(mode: &str) -> bool {
2979        mode == "100644" || mode == "100755"
2980    }
2981
2982    if e.status != DiffStatus::Modified || !regular_file_mode(&e.new_mode) {
2983        return None;
2984    }
2985    let new_data = read_added_entry_bytes(odb, e, work_root)?;
2986    let new_oid_eff = if e.new_oid != zero_oid() {
2987        e.new_oid
2988    } else {
2989        Odb::hash_object_data(ObjectKind::Blob, &new_data)
2990    };
2991
2992    let mut best: Option<(usize, u32)> = None;
2993    for (si, (src_path, src_oid, is_deleted)) in sources.iter().enumerate() {
2994        if *is_deleted {
2995            continue;
2996        }
2997        if e.new_path.as_deref() == Some(src_path.as_str()) {
2998            continue;
2999        }
3000        let src_mode_str = source_tree_entries
3001            .iter()
3002            .find(|(p, _, _)| p == src_path)
3003            .map(|(_, m, _)| m.as_str())
3004            .unwrap_or("100644");
3005        if !regular_file_mode(src_mode_str) {
3006            continue;
3007        }
3008
3009        let score = if *src_oid == new_oid_eff {
3010            100
3011        } else {
3012            match (&source_contents[si], Some(new_data.as_slice())) {
3013                (Some(old_data), Some(nd)) => compute_similarity(old_data, nd),
3014                _ => 0,
3015            }
3016        };
3017        if score >= threshold {
3018            let replace = match best {
3019                None => true,
3020                Some((_, s)) => score > s,
3021            };
3022            if replace {
3023                best = Some((si, score));
3024            }
3025        }
3026    }
3027
3028    let (si, score) = best?;
3029    let (src_path, src_oid, _) = &sources[si];
3030    let src_mode = source_tree_entries
3031        .iter()
3032        .find(|(p, _, _)| p == src_path)
3033        .map(|(_, m, _)| m.clone())
3034        .unwrap_or_else(|| e.old_mode.clone());
3035
3036    Some(DiffEntry {
3037        status: DiffStatus::Copied,
3038        old_path: Some(src_path.clone()),
3039        new_path: e.new_path.clone(),
3040        old_mode: src_mode,
3041        new_mode: e.new_mode.clone(),
3042        old_oid: *src_oid,
3043        new_oid: e.new_oid,
3044        score: Some(score),
3045    })
3046}
3047
3048/// Detect renames by pairing Deleted and Added entries with similar content.
3049///
3050/// `threshold` is the minimum similarity percentage (0–100) for a pair to
3051/// be considered a rename (Git's default is 50%).  The function reads blob
3052/// content from the ODB to compute a line-level similarity score.
3053///
3054/// Exact-OID matches are always 100% similar regardless of content.
3055///
3056/// When `work_root` is set, added entries whose `new_oid` is the zero placeholder (as in
3057/// uncached `diff-index` when the work tree diverged from the index) load content from disk
3058/// under that root instead of the object database.
3059pub fn detect_renames(
3060    odb: &Odb,
3061    work_root: Option<&Path>,
3062    entries: Vec<DiffEntry>,
3063    threshold: u32,
3064) -> Vec<DiffEntry> {
3065    // Split entries into deleted, added, and others.
3066    let mut deleted: Vec<DiffEntry> = Vec::new();
3067    let mut added: Vec<DiffEntry> = Vec::new();
3068    let mut others: Vec<DiffEntry> = Vec::new();
3069
3070    for entry in entries {
3071        match entry.status {
3072            DiffStatus::Deleted => deleted.push(entry),
3073            DiffStatus::Added => added.push(entry),
3074            _ => others.push(entry),
3075        }
3076    }
3077
3078    if deleted.is_empty() || added.is_empty() {
3079        // Nothing to pair — return original order.
3080        let mut result = others;
3081        result.extend(deleted);
3082        result.extend(added);
3083        result.sort_by(|a, b| a.path().cmp(b.path()));
3084        return result;
3085    }
3086
3087    // Read content for all deleted blobs.
3088    let deleted_contents: Vec<Option<Vec<u8>>> = deleted
3089        .iter()
3090        .map(|d| odb.read(&d.old_oid).ok().map(|obj| obj.data))
3091        .collect();
3092
3093    // Read content for all added blobs.
3094    let added_contents: Vec<Option<Vec<u8>>> = added
3095        .iter()
3096        .map(|a| read_added_entry_bytes(odb, a, work_root))
3097        .collect();
3098
3099    // Build a matrix of similarity scores and find the best pairings.
3100    // We use a greedy approach: pick the highest-scoring pair first.
3101    let mut scores: Vec<(u32, usize, usize)> = Vec::new();
3102
3103    fn is_regularish_mode(mode: &str) -> bool {
3104        mode == "100644" || mode == "100755"
3105    }
3106
3107    fn same_path_same_blob(del: &DiffEntry, add: &DiffEntry) -> bool {
3108        del.old_path == add.new_path && del.old_oid == add.new_oid && del.old_mode == add.new_mode
3109    }
3110
3111    for (di, del) in deleted.iter().enumerate() {
3112        for (ai, add) in added.iter().enumerate() {
3113            // Exact OID match → 100%
3114            if del.old_oid == add.new_oid {
3115                scores.push((100, di, ai));
3116                continue;
3117            }
3118
3119            // Do not use line similarity across file types (e.g. regular ↔ symlink); Git keeps these
3120            // as separate changes (`t4008-diff-break-rewrite` #7).
3121            if !is_regularish_mode(&del.old_mode) || !is_regularish_mode(&add.new_mode) {
3122                continue;
3123            }
3124
3125            let score = match (&deleted_contents[di], &added_contents[ai]) {
3126                (Some(old_data), Some(new_data)) => compute_similarity(old_data, new_data),
3127                _ => 0,
3128            };
3129
3130            if score >= threshold {
3131                scores.push((score, di, ai));
3132            }
3133        }
3134    }
3135
3136    // Sort: prefer real path-changing pairs before same-path no-op pairs, then
3137    // same-basename pairs, then by score descending.
3138    // This matches Git's behavior where basename matches are checked first.
3139    scores.sort_by(|a, b| {
3140        let a_noop = same_path_same_blob(&deleted[a.1], &added[a.2]);
3141        let b_noop = same_path_same_blob(&deleted[b.1], &added[b.2]);
3142        let a_same = same_basename(&deleted[a.1], &added[a.2]);
3143        let b_same = same_basename(&deleted[b.1], &added[b.2]);
3144        a_noop
3145            .cmp(&b_noop)
3146            .then_with(|| b_same.cmp(&a_same))
3147            .then_with(|| b.0.cmp(&a.0))
3148    });
3149
3150    let mut used_deleted = vec![false; deleted.len()];
3151    let mut used_added = vec![false; added.len()];
3152    let mut renames: Vec<DiffEntry> = Vec::new();
3153
3154    for (score, di, ai) in &scores {
3155        if used_deleted[*di] || used_added[*ai] {
3156            continue;
3157        }
3158        used_deleted[*di] = true;
3159        used_added[*ai] = true;
3160
3161        let del = &deleted[*di];
3162        let add = &added[*ai];
3163
3164        // A "rename" whose source and destination are the same path with the
3165        // same blob is not a change at all (this arises with pathological
3166        // duplicate tree entries, t4058). Git pairs and then drops it, leaving
3167        // no diff entry; mirror that by skipping emission.
3168        if same_path_same_blob(del, add) {
3169            continue;
3170        }
3171
3172        renames.push(DiffEntry {
3173            status: DiffStatus::Renamed,
3174            old_path: del.old_path.clone(),
3175            new_path: add.new_path.clone(),
3176            old_mode: del.old_mode.clone(),
3177            new_mode: add.new_mode.clone(),
3178            old_oid: del.old_oid,
3179            new_oid: add.new_oid,
3180            score: Some(*score),
3181        });
3182    }
3183
3184    // Collect unmatched entries.
3185    let mut result = others;
3186    result.extend(renames);
3187    for (i, entry) in deleted.into_iter().enumerate() {
3188        if !used_deleted[i] {
3189            result.push(entry);
3190        }
3191    }
3192    for (i, entry) in added.into_iter().enumerate() {
3193        if !used_added[i] {
3194            result.push(entry);
3195        }
3196    }
3197
3198    result.sort_by(|a, b| a.path().cmp(b.path()));
3199    result
3200}
3201
3202/// Detect copies among diff entries.
3203///
3204/// This first runs rename detection (pairing Deleted+Added), then for any
3205/// remaining Added entries, looks for copy sources.
3206///
3207/// - `find_copies_harder` = false: only Modified entries are copy source candidates.
3208/// - `find_copies_harder` = true: also examine unmodified files from `source_tree_entries`.
3209///
3210/// `source_tree_entries` should be a list of (path, mode, oid) from the source tree;
3211/// used when `find_copies_harder` is true to consider unmodified files as copy sources.
3212pub fn detect_copies(
3213    odb: &Odb,
3214    work_root: Option<&Path>,
3215    entries: Vec<DiffEntry>,
3216    threshold: u32,
3217    find_copies_harder: bool,
3218    source_tree_entries: &[(String, String, ObjectId)],
3219) -> Vec<DiffEntry> {
3220    use std::collections::{HashMap, HashSet};
3221
3222    // Separate entries by status.
3223    let mut deleted: Vec<DiffEntry> = Vec::new();
3224    let mut added: Vec<DiffEntry> = Vec::new();
3225    let mut others: Vec<DiffEntry> = Vec::new();
3226
3227    for entry in entries {
3228        match entry.status {
3229            DiffStatus::Deleted => deleted.push(entry),
3230            DiffStatus::Added => added.push(entry),
3231            _ => others.push(entry),
3232        }
3233    }
3234
3235    // Build source candidates: deleted files, modified files, and optionally tree entries.
3236    // Track which sources are from deleted files (can become renames).
3237    let mut sources: Vec<(String, ObjectId, bool)> = Vec::new(); // (path, oid, is_deleted)
3238    let mut deleted_source_idx: HashMap<String, usize> = HashMap::new();
3239
3240    for entry in &deleted {
3241        if let Some(ref path) = entry.old_path {
3242            deleted_source_idx.insert(path.clone(), sources.len());
3243            sources.push((path.clone(), entry.old_oid, true));
3244        }
3245    }
3246
3247    // Modified and type-changed files are candidates for `-C` (e.g. symlink rewrite leaves the
3248    // old blob available as a copy source for another path; see `t4008-diff-break-rewrite`).
3249    for entry in &others {
3250        if matches!(entry.status, DiffStatus::Modified | DiffStatus::TypeChanged) {
3251            if let Some(ref old_path) = entry.old_path {
3252                if !sources.iter().any(|(p, _, _)| p == old_path) {
3253                    sources.push((old_path.clone(), entry.old_oid, false));
3254                }
3255            }
3256        }
3257    }
3258
3259    // With find_copies_harder, add all source tree entries.
3260    if find_copies_harder {
3261        for (path, _mode, oid) in source_tree_entries {
3262            if !sources.iter().any(|(p, _, _)| p == path) {
3263                sources.push((path.clone(), *oid, false));
3264            }
3265        }
3266    }
3267
3268    if sources.is_empty() {
3269        let mut result = others;
3270        result.extend(deleted);
3271        result.extend(added);
3272        result.sort_by(|a, b| a.path().cmp(b.path()));
3273        return result;
3274    }
3275
3276    // Read content for sources.
3277    let source_contents: Vec<Option<Vec<u8>>> = sources
3278        .iter()
3279        .map(|(_, oid, _)| odb.read(oid).ok().map(|obj| obj.data))
3280        .collect();
3281
3282    let mut result_entries: Vec<DiffEntry> = Vec::new();
3283    let mut renamed_deleted: HashSet<usize> = HashSet::new();
3284    let mut used_added2 = vec![false; added.len()];
3285
3286    if !added.is_empty() {
3287        // Read content for added blobs.
3288        let added_contents: Vec<Option<Vec<u8>>> = added
3289            .iter()
3290            .map(|a| read_added_entry_bytes(odb, a, work_root))
3291            .collect();
3292
3293        // Build score matrix: (score, source_idx, added_idx)
3294        let mut scores: Vec<(u32, usize, usize)> = Vec::new();
3295        for (si, (src_path, src_oid, _)) in sources.iter().enumerate() {
3296            for (ai, add) in added.iter().enumerate() {
3297                // Never pair a path with itself as copy source (matches Git; avoids
3298                // arbitrary tie-breaking when several sources share the same blob).
3299                if add.new_path.as_deref() == Some(src_path.as_str()) {
3300                    continue;
3301                }
3302                let add_oid = if add.new_oid != zero_oid() {
3303                    add.new_oid
3304                } else if let Some(ref data) = added_contents[ai] {
3305                    Odb::hash_object_data(ObjectKind::Blob, data)
3306                } else {
3307                    zero_oid()
3308                };
3309                if *src_oid == add_oid {
3310                    scores.push((100, si, ai));
3311                    continue;
3312                }
3313                let score = match (&source_contents[si], &added_contents[ai]) {
3314                    (Some(old_data), Some(new_data)) => compute_similarity(old_data, new_data),
3315                    _ => 0,
3316                };
3317                if score >= threshold {
3318                    scores.push((score, si, ai));
3319                }
3320            }
3321        }
3322
3323        // Sort by score descending.
3324        scores.sort_by(|a, b| b.0.cmp(&a.0));
3325
3326        // Build source->added mappings, each added file assigned to best source.
3327        let mut used_added = vec![false; added.len()];
3328        let mut source_to_added: HashMap<usize, Vec<(usize, u32)>> = HashMap::new();
3329        for &(score, si, ai) in &scores {
3330            if used_added[ai] {
3331                continue;
3332            }
3333            used_added[ai] = true;
3334            source_to_added.entry(si).or_default().push((ai, score));
3335        }
3336
3337        // For each deleted source, pick one assignment as Rename, rest as Copy.
3338        for (&si, assignments_for_src) in &source_to_added {
3339            let (_, _, is_deleted) = &sources[si];
3340            if *is_deleted && !assignments_for_src.is_empty() {
3341                // Pick the last one (by path) as the rename target.
3342                // Git tends to pick the rename as the last alphabetically.
3343                let rename_ai = assignments_for_src
3344                    .iter()
3345                    .max_by_key(|(ai, _score)| added[*ai].path().to_string())
3346                    .map(|(ai, _)| *ai);
3347
3348                for &(ai, score) in assignments_for_src {
3349                    let (ref src_path, _, _) = sources[si];
3350                    let add = &added[ai];
3351                    let src_mode = source_tree_entries
3352                        .iter()
3353                        .find(|(p, _, _)| p == src_path)
3354                        .map(|(_, m, _)| m.clone())
3355                        .unwrap_or_else(|| add.old_mode.clone());
3356
3357                    let is_rename = Some(ai) == rename_ai;
3358                    result_entries.push(DiffEntry {
3359                        status: if is_rename {
3360                            DiffStatus::Renamed
3361                        } else {
3362                            DiffStatus::Copied
3363                        },
3364                        old_path: Some(src_path.clone()),
3365                        new_path: add.new_path.clone(),
3366                        old_mode: src_mode,
3367                        new_mode: add.new_mode.clone(),
3368                        old_oid: sources[si].1,
3369                        new_oid: add.new_oid,
3370                        score: Some(score),
3371                    });
3372                    used_added2[ai] = true;
3373                }
3374                renamed_deleted.insert(si);
3375            } else {
3376                // Non-deleted source: all assignments are copies.
3377                for &(ai, score) in assignments_for_src {
3378                    let (ref src_path, _, _) = sources[si];
3379                    let add = &added[ai];
3380                    let src_mode = source_tree_entries
3381                        .iter()
3382                        .find(|(p, _, _)| p == src_path)
3383                        .map(|(_, m, _)| m.clone())
3384                        .unwrap_or_else(|| add.old_mode.clone());
3385
3386                    result_entries.push(DiffEntry {
3387                        status: DiffStatus::Copied,
3388                        old_path: Some(src_path.clone()),
3389                        new_path: add.new_path.clone(),
3390                        old_mode: src_mode,
3391                        new_mode: add.new_mode.clone(),
3392                        old_oid: sources[si].1,
3393                        new_oid: add.new_oid,
3394                        score: Some(score),
3395                    });
3396                    used_added2[ai] = true;
3397                }
3398            }
3399        }
3400    }
3401
3402    // Keep deleted entries that weren't consumed by a rename.
3403    for entry in deleted.into_iter() {
3404        if let Some(ref path) = entry.old_path {
3405            if let Some(&si) = deleted_source_idx.get(path) {
3406                if renamed_deleted.contains(&si) {
3407                    // This deletion was consumed by a rename; skip it.
3408                    continue;
3409                }
3410            }
3411        }
3412        result_entries.push(entry);
3413    }
3414
3415    let mut result = others;
3416    result.extend(result_entries);
3417    // Keep unmatched added entries.
3418    for (i, entry) in added.into_iter().enumerate() {
3419        if !used_added2[i] {
3420            result.push(entry);
3421        }
3422    }
3423
3424    let mut final_result = Vec::with_capacity(result.len());
3425    for e in result {
3426        if let Some(c) = modified_as_copy_from_sources(
3427            odb,
3428            work_root,
3429            &e,
3430            threshold,
3431            &sources,
3432            &source_contents,
3433            source_tree_entries,
3434        ) {
3435            final_result.push(c);
3436        } else {
3437            final_result.push(e);
3438        }
3439    }
3440
3441    final_result.sort_by(|a, b| a.path().cmp(b.path()));
3442    final_result
3443}
3444
3445/// Apply Git-style rename and optional copy detection for index↔worktree diffs.
3446///
3447/// When `copies` is true (Git `diff.renames` / `status.renames` set to `copy`/`copies`),
3448/// runs copy detection, which also pairs deleted sources with one rename and any additional
3449/// destinations as copies.
3450///
3451/// # Errors
3452///
3453/// Propagates errors from reading the `head_tree` object from `odb`.
3454pub fn status_apply_rename_copy_detection(
3455    odb: &Odb,
3456    unstaged_raw: Vec<DiffEntry>,
3457    threshold: u32,
3458    copies: bool,
3459    head_tree: Option<&ObjectId>,
3460) -> Result<Vec<DiffEntry>> {
3461    if !copies {
3462        return Ok(detect_renames(odb, None, unstaged_raw, threshold));
3463    }
3464    let source_tree_entries: Vec<(String, String, ObjectId)> = match head_tree {
3465        Some(oid) => flatten_tree(odb, oid, "")?
3466            .into_iter()
3467            .map(|e| (e.path, format_mode(e.mode), e.oid))
3468            .collect(),
3469        None => Vec::new(),
3470    };
3471    Ok(detect_copies(
3472        odb,
3473        None,
3474        unstaged_raw,
3475        threshold,
3476        false,
3477        &source_tree_entries,
3478    ))
3479}
3480
3481/// Format a rename pair using Git's compact path format.
3482///
3483/// Examples:
3484/// - `a/b/c` → `c/b/a` → `a/b/c => c/b/a`
3485/// - `c/b/a` → `c/d/e` → `c/{b/a => d/e}`
3486/// - `c/d/e` → `d/e` → `{c/d => d}/e`
3487/// - `d/e` → `d/f/e` → `d/{ => f}/e`
3488pub fn format_rename_path(old: &str, new: &str) -> String {
3489    let ob = old.as_bytes();
3490    let nb = new.as_bytes();
3491
3492    // Find common prefix length, snapped to '/' boundary.
3493    let pfx = {
3494        let mut last_sep = 0usize;
3495        let min_len = ob.len().min(nb.len());
3496        for i in 0..min_len {
3497            if ob[i] != nb[i] {
3498                break;
3499            }
3500            if ob[i] == b'/' {
3501                last_sep = i + 1;
3502            }
3503        }
3504        last_sep
3505    };
3506
3507    // Find common suffix length, snapped to '/' boundary.
3508    let mut sfx = {
3509        let mut last_sep = 0usize;
3510        let min_len = ob.len().min(nb.len());
3511        for i in 0..min_len {
3512            let oi = ob.len() - 1 - i;
3513            let ni = nb.len() - 1 - i;
3514            if ob[oi] != nb[ni] {
3515                break;
3516            }
3517            if ob[oi] == b'/' {
3518                last_sep = i + 1;
3519            }
3520        }
3521        last_sep
3522    };
3523
3524    // Suffix starts at this position in each string.
3525    let mut sfx_at_old = ob.len() - sfx;
3526    let mut sfx_at_new = nb.len() - sfx;
3527
3528    // If prefix and suffix overlap in both strings (both middles empty),
3529    // reduce the suffix so that at least the longer string has a non-empty middle.
3530    while pfx > sfx_at_old && pfx > sfx_at_new && sfx > 0 {
3531        // Reduce suffix by snapping to the next smaller '/' boundary.
3532        let suffix_bytes = &ob[sfx_at_old..];
3533        let mut new_sfx = 0;
3534        // Find the next '/' after sfx_at_old (i.e., reduce suffix).
3535        for (i, &b) in suffix_bytes.iter().enumerate().skip(1) {
3536            if b == b'/' {
3537                new_sfx = sfx - i;
3538                break;
3539            }
3540        }
3541        if new_sfx == 0 || new_sfx >= sfx {
3542            sfx_at_old = ob.len();
3543            sfx_at_new = nb.len();
3544            break;
3545        }
3546        sfx = new_sfx;
3547        sfx_at_old = ob.len() - sfx;
3548        sfx_at_new = nb.len() - sfx;
3549    }
3550
3551    // When prefix and suffix overlap in the shorter string, they share
3552    // the '/' boundary character. In the output format, the shared '/'
3553    // appears in both positions (e.g. "d/{ => f}/e" for d/e → d/f/e).
3554    // Compute the middle parts. When prefix and suffix overlap in a
3555    // string, the middle for that string is empty. The shared '/' shows
3556    // in both prefix (trailing) and suffix (leading) positions.
3557    let prefix = &old[..pfx];
3558    let suffix = &old[sfx_at_old..];
3559    let old_mid = if pfx <= sfx_at_old {
3560        &old[pfx..sfx_at_old]
3561    } else {
3562        ""
3563    };
3564    let new_mid = if pfx <= sfx_at_new {
3565        &new[pfx..sfx_at_new]
3566    } else {
3567        ""
3568    };
3569
3570    if prefix.is_empty() && suffix.is_empty() {
3571        return format!("{old} => {new}");
3572    }
3573
3574    format!("{prefix}{{{old_mid} => {new_mid}}}{suffix}")
3575}
3576
3577/// Check if two entries share the same filename (basename).
3578fn same_basename(del: &DiffEntry, add: &DiffEntry) -> bool {
3579    let old = del.old_path.as_deref().unwrap_or("");
3580    let new = add.new_path.as_deref().unwrap_or("");
3581    let old_base = old.rsplit('/').next().unwrap_or(old);
3582    let new_base = new.rsplit('/').next().unwrap_or(new);
3583    old_base == new_base && !old_base.is_empty()
3584}
3585
3586/// Compute a similarity percentage (0–100) between two byte slices.
3587///
3588/// Uses Git's approach: count the bytes that are "shared" (appear in
3589/// equal lines), then compute `score = shared_bytes * 2 * 100 / (src_size + dst_size)`.
3590fn compute_similarity(old: &[u8], new: &[u8]) -> u32 {
3591    // Normalize CRLF → LF before comparing so that files differing
3592    // only in line endings are detected as renames.
3593    let old_norm = crate::crlf::crlf_to_lf(old);
3594    let new_norm = crate::crlf::crlf_to_lf(new);
3595
3596    let src_size = old_norm.len();
3597    let dst_size = new_norm.len();
3598
3599    if src_size == 0 && dst_size == 0 {
3600        return 100;
3601    }
3602    let total = src_size + dst_size;
3603    if total == 0 {
3604        return 100;
3605    }
3606
3607    // Use line-level diff to find shared content, then count bytes.
3608    use similar::{ChangeTag, TextDiff};
3609    let old_str = String::from_utf8_lossy(&old_norm);
3610    let new_str = String::from_utf8_lossy(&new_norm);
3611    let diff = TextDiff::from_lines(&old_str as &str, &new_str as &str);
3612
3613    let mut shared_bytes = 0usize;
3614    for change in diff.iter_all_changes() {
3615        if change.tag() == ChangeTag::Equal {
3616            // Count bytes in the matching line (including newline).
3617            shared_bytes += change.value().len();
3618        }
3619    }
3620
3621    // Git: score = copied * MAX_SCORE / max(src_size, dst_size)
3622    // We normalize to 0-100.
3623    let max_size = src_size.max(dst_size);
3624
3625    ((shared_bytes * 100) / max_size).min(100) as u32
3626}
3627
3628/// Compute rename/copy similarity percentage (0–100) between two byte slices.
3629///
3630/// This uses the same scoring logic as internal rename detection.
3631#[must_use]
3632pub fn rename_similarity_score(old: &[u8], new: &[u8]) -> u32 {
3633    compute_similarity(old, new)
3634}
3635
3636// ── Output formatting ───────────────────────────────────────────────
3637
3638/// Format a diff entry in Git's raw diff format.
3639///
3640/// Example: `:100644 100644 abc1234... def5678... M\tfile.txt`
3641pub fn format_raw(entry: &DiffEntry) -> String {
3642    let path = match entry.status {
3643        DiffStatus::Renamed | DiffStatus::Copied => {
3644            format!(
3645                "{}\t{}",
3646                entry.old_path.as_deref().unwrap_or(""),
3647                entry.new_path.as_deref().unwrap_or("")
3648            )
3649        }
3650        _ => entry.path().to_owned(),
3651    };
3652
3653    let status_str = match (entry.status, entry.score) {
3654        (DiffStatus::Renamed, Some(s)) => format!("R{:03}", s),
3655        (DiffStatus::Copied, Some(s)) => format!("C{:03}", s),
3656        _ => entry.status.letter().to_string(),
3657    };
3658
3659    format!(
3660        ":{} {} {} {} {}\t{}",
3661        entry.old_mode, entry.new_mode, entry.old_oid, entry.new_oid, status_str, path
3662    )
3663}
3664
3665/// Format a diff entry with abbreviated OIDs.
3666pub fn format_raw_abbrev(entry: &DiffEntry, abbrev_len: usize) -> String {
3667    let ellipsis = if std::env::var("GIT_PRINT_SHA1_ELLIPSIS").ok().as_deref() == Some("yes") {
3668        "..."
3669    } else {
3670        ""
3671    };
3672    let old_hex = format!("{}", entry.old_oid);
3673    let new_hex = format!("{}", entry.new_oid);
3674    let old_abbrev = &old_hex[..abbrev_len.min(old_hex.len())];
3675    let new_abbrev = &new_hex[..abbrev_len.min(new_hex.len())];
3676
3677    // Renames/copies carry a similarity score and a `<old>\t<new>` path pair.
3678    let path = match entry.status {
3679        DiffStatus::Renamed | DiffStatus::Copied => format!(
3680            "{}\t{}",
3681            entry.old_path.as_deref().unwrap_or(""),
3682            entry.new_path.as_deref().unwrap_or("")
3683        ),
3684        _ => entry.path().to_owned(),
3685    };
3686    let status_str = match (entry.status, entry.score) {
3687        (DiffStatus::Renamed, Some(s)) => format!("R{s:03}"),
3688        (DiffStatus::Copied, Some(s)) => format!("C{s:03}"),
3689        _ => entry.status.letter().to_string(),
3690    };
3691
3692    format!(
3693        ":{} {} {}{} {}{} {}\t{}",
3694        entry.old_mode,
3695        entry.new_mode,
3696        old_abbrev,
3697        ellipsis,
3698        new_abbrev,
3699        ellipsis,
3700        status_str,
3701        path
3702    )
3703}
3704
3705/// Generate a unified diff patch for two blobs.
3706///
3707/// # Parameters
3708///
3709/// - `old_content` — the old file content (empty for added files).
3710/// - `new_content` — the new file content (empty for deleted files).
3711/// - `old_path` — display path for the old side.
3712/// - `new_path` — display path for the new side.
3713/// - `context_lines` — number of context lines around changes (default: 3).
3714/// - Inter-hunk context defaults to `0` (see [`unified_diff_with_prefix`]).
3715///
3716/// # Returns
3717///
3718/// The unified diff as a string.
3719pub fn unified_diff(
3720    old_content: &str,
3721    new_content: &str,
3722    old_path: &str,
3723    new_path: &str,
3724    context_lines: usize,
3725    indent_heuristic: bool,
3726    quote_path_fully: bool,
3727) -> String {
3728    unified_diff_with_prefix(
3729        old_content,
3730        new_content,
3731        old_path,
3732        new_path,
3733        context_lines,
3734        0,
3735        "a/",
3736        "b/",
3737        indent_heuristic,
3738        quote_path_fully,
3739    )
3740}
3741
3742/// Same as `unified_diff` but with configurable source/destination prefixes.
3743///
3744/// `inter_hunk_context` is Git's `--inter-hunk-context`: adjacent hunks merge when
3745/// the unchanged gap between them is at most `2 * context_lines + inter_hunk_context` lines.
3746#[allow(clippy::too_many_arguments)] // Mirrors Git-style unified diff parameters.
3747pub fn unified_diff_with_prefix(
3748    old_content: &str,
3749    new_content: &str,
3750    old_path: &str,
3751    new_path: &str,
3752    context_lines: usize,
3753    inter_hunk_context: usize,
3754    src_prefix: &str,
3755    dst_prefix: &str,
3756    indent_heuristic: bool,
3757    quote_path_fully: bool,
3758) -> String {
3759    unified_diff_with_prefix_and_funcname(
3760        old_content,
3761        new_content,
3762        old_path,
3763        new_path,
3764        context_lines,
3765        inter_hunk_context,
3766        src_prefix,
3767        dst_prefix,
3768        None,
3769        indent_heuristic,
3770        quote_path_fully,
3771    )
3772}
3773
3774/// Same as [`unified_diff_with_prefix`] with optional custom hunk-header
3775/// function-name matching.
3776#[allow(clippy::too_many_arguments)]
3777pub fn unified_diff_with_prefix_and_funcname(
3778    old_content: &str,
3779    new_content: &str,
3780    old_path: &str,
3781    new_path: &str,
3782    context_lines: usize,
3783    inter_hunk_context: usize,
3784    src_prefix: &str,
3785    dst_prefix: &str,
3786    funcname_matcher: Option<&FuncnameMatcher>,
3787    indent_heuristic: bool,
3788    quote_path_fully: bool,
3789) -> String {
3790    unified_diff_with_prefix_and_funcname_and_algorithm(
3791        old_content,
3792        new_content,
3793        old_path,
3794        new_path,
3795        context_lines,
3796        inter_hunk_context,
3797        src_prefix,
3798        dst_prefix,
3799        funcname_matcher,
3800        similar::Algorithm::Myers,
3801        false,
3802        false,
3803        indent_heuristic,
3804        quote_path_fully,
3805    )
3806}
3807
3808/// Same as [`unified_diff_with_prefix_and_funcname`] but allows callers to
3809/// choose the line diff algorithm used for hunk generation.
3810///
3811/// When `function_context` is true (`git diff -W`), hunks are expanded to
3812/// whole logical functions using the same rules as Git's `XDL_EMIT_FUNCCONTEXT`.
3813#[allow(clippy::too_many_arguments)]
3814pub fn unified_diff_with_prefix_and_funcname_and_algorithm(
3815    old_content: &str,
3816    new_content: &str,
3817    old_path: &str,
3818    new_path: &str,
3819    context_lines: usize,
3820    inter_hunk_context: usize,
3821    src_prefix: &str,
3822    dst_prefix: &str,
3823    funcname_matcher: Option<&FuncnameMatcher>,
3824    algorithm: similar::Algorithm,
3825    function_context: bool,
3826    use_git_histogram: bool,
3827    indent_heuristic: bool,
3828    quote_path_fully: bool,
3829) -> String {
3830    // `--function-context` (`-W`) expansion must apply regardless of the line
3831    // algorithm; the histogram body printer below does not do it, so route `-W`
3832    // through the function-context emitter first (t4015 #136).
3833    if function_context {
3834        return unified_diff_with_function_context(
3835            old_content,
3836            new_content,
3837            old_path,
3838            new_path,
3839            context_lines,
3840            inter_hunk_context,
3841            src_prefix,
3842            dst_prefix,
3843            funcname_matcher,
3844            algorithm,
3845            indent_heuristic,
3846            quote_path_fully,
3847        );
3848    }
3849
3850    if use_git_histogram {
3851        return unified_diff_histogram_with_prefix_and_funcname(
3852            old_content,
3853            new_content,
3854            old_path,
3855            new_path,
3856            context_lines,
3857            inter_hunk_context,
3858            src_prefix,
3859            dst_prefix,
3860            funcname_matcher,
3861            quote_path_fully,
3862        );
3863    }
3864
3865    use crate::quote_path::format_diff_path_with_prefix;
3866    use similar::{udiff::UnifiedDiffHunk, TextDiff};
3867
3868    let diff = TextDiff::configure()
3869        .algorithm(algorithm)
3870        .diff_lines(old_content, new_content);
3871    let compacted_ops = diff_indent_heuristic::diff_lines_ops_compacted(
3872        old_content,
3873        new_content,
3874        algorithm,
3875        indent_heuristic,
3876    );
3877
3878    let mut output = String::new();
3879    if old_path == "/dev/null" {
3880        output.push_str("--- /dev/null\n");
3881    } else if src_prefix.is_empty() {
3882        // Callers (e.g. `diff-tree`, `diff-index`) may pass a fully formatted token
3883        // (already includes `a/` and any C-style quoting).
3884        output.push_str(&format!("--- {old_path}\n"));
3885    } else {
3886        output.push_str("--- ");
3887        output.push_str(&format_diff_path_with_prefix(
3888            src_prefix,
3889            old_path,
3890            quote_path_fully,
3891        ));
3892        output.push('\n');
3893    }
3894    if new_path == "/dev/null" {
3895        output.push_str("+++ /dev/null\n");
3896    } else if dst_prefix.is_empty() {
3897        output.push_str(&format!("+++ {new_path}\n"));
3898    } else {
3899        output.push_str("+++ ");
3900        output.push_str(&format_diff_path_with_prefix(
3901            dst_prefix,
3902            new_path,
3903            quote_path_fully,
3904        ));
3905        output.push('\n');
3906    }
3907
3908    let old_lines: Vec<&str> = old_content.lines().collect();
3909
3910    // Git's xdiff merges adjacent changes while the gap between them in the old file is at most
3911    // `2 * context_lines + inter_hunk_context` (see `xdl_get_hunk` in xemit.c).
3912    // `similar::group_diff_ops` couples the split threshold and the displayed edge context to a
3913    // single radius (split at `> 2n`), which over-merges when the gap limit is odd
3914    // (t4032: `-U0 --inter-hunk-context=1` with 2 common lines must stay 2 hunks).
3915    let max_common_gap = context_lines
3916        .saturating_mul(2)
3917        .saturating_add(inter_hunk_context);
3918    let op_groups = group_diff_ops_gap(compacted_ops, context_lines, max_common_gap);
3919
3920    for ops in op_groups {
3921        if ops.is_empty() {
3922            continue;
3923        }
3924        let hunk = UnifiedDiffHunk::new(ops, &diff, true);
3925        let hunk_str = format!("{hunk}");
3926        // The similar crate outputs @@ -a,b +c,d @@\n but Git adds
3927        // function context after the closing @@. Extract the hunk header
3928        // and add function context.
3929        if let Some(first_newline) = hunk_str.find('\n') {
3930            let header_line = &hunk_str[..first_newline];
3931            let rest = &hunk_str[first_newline..];
3932
3933            // Parse the old start line from the @@ header
3934            if let Some(func_ctx) =
3935                extract_function_context(header_line, &old_lines, funcname_matcher)
3936            {
3937                output.push_str(header_line);
3938                output.push(' ');
3939                output.push_str(&func_ctx);
3940                output.push_str(rest);
3941            } else {
3942                output.push_str(&hunk_str);
3943            }
3944        } else {
3945            output.push_str(&hunk_str);
3946        }
3947    }
3948
3949    output
3950}
3951
3952/// Group diff ops into hunks like Git's `xdl_get_hunk`: two changes merge into one hunk while
3953/// the run of unchanged lines between them is at most `max_common_gap`
3954/// (`2 * context + inter_hunk_context`), and each hunk keeps at most `context` unchanged lines
3955/// at its edges. Unlike `similar::group_diff_ops`, the split threshold is decoupled from the
3956/// edge context so odd gap limits group exactly like Git.
3957fn group_diff_ops_gap(
3958    mut ops: Vec<similar::DiffOp>,
3959    context: usize,
3960    max_common_gap: usize,
3961) -> Vec<Vec<similar::DiffOp>> {
3962    use similar::DiffOp;
3963    if ops.is_empty() {
3964        return vec![];
3965    }
3966
3967    let mut pending_group = Vec::new();
3968    let mut rv = Vec::new();
3969
3970    if let Some(DiffOp::Equal {
3971        old_index,
3972        new_index,
3973        len,
3974    }) = ops.first_mut()
3975    {
3976        let offset = (*len).saturating_sub(context);
3977        *old_index += offset;
3978        *new_index += offset;
3979        *len -= offset;
3980    }
3981
3982    if let Some(DiffOp::Equal { len, .. }) = ops.last_mut() {
3983        *len -= (*len).saturating_sub(context);
3984    }
3985
3986    for op in ops.into_iter() {
3987        if let DiffOp::Equal {
3988            old_index,
3989            new_index,
3990            len,
3991        } = op
3992        {
3993            // End the current group and start a new one whenever the unchanged
3994            // run is too long to fuse the surrounding changes.
3995            if len > max_common_gap {
3996                pending_group.push(DiffOp::Equal {
3997                    old_index,
3998                    new_index,
3999                    len: context,
4000                });
4001                rv.push(pending_group);
4002                let offset = len.saturating_sub(context);
4003                pending_group = vec![DiffOp::Equal {
4004                    old_index: old_index + offset,
4005                    new_index: new_index + offset,
4006                    len: len - offset,
4007                }];
4008                continue;
4009            }
4010        }
4011        pending_group.push(op);
4012    }
4013
4014    match &pending_group[..] {
4015        &[] | &[similar::DiffOp::Equal { .. }] => {}
4016        _ => rv.push(pending_group),
4017    }
4018
4019    rv
4020}
4021
4022/// `git diff -W`: expand each hunk to include full function bodies (see Git `xemit.c`).
4023fn unified_diff_with_function_context(
4024    old_content: &str,
4025    new_content: &str,
4026    old_path: &str,
4027    new_path: &str,
4028    context_lines: usize,
4029    inter_hunk_context: usize,
4030    src_prefix: &str,
4031    dst_prefix: &str,
4032    funcname_matcher: Option<&FuncnameMatcher>,
4033    algorithm: similar::Algorithm,
4034    indent_heuristic: bool,
4035    quote_path_fully: bool,
4036) -> String {
4037    use crate::quote_path::format_diff_path_with_prefix;
4038    use similar::{udiff::UnifiedDiffHunk, TextDiff};
4039
4040    let diff = TextDiff::configure()
4041        .algorithm(algorithm)
4042        .diff_lines(old_content, new_content);
4043
4044    let old_lines: Vec<&str> = old_content.lines().collect();
4045    let new_lines: Vec<&str> = new_content.lines().collect();
4046    let n_old = old_lines.len();
4047    let n_new = new_lines.len();
4048
4049    // Group changes the way Git's xdl_get_hunk does: merge while the unchanged
4050    // gap is at most `2*context + inter_hunk_context`. `similar::group_diff_ops`
4051    // splits only at gap > `2*radius`, which over-merges changes in *different*
4052    // functions (e.g. a leading insertion and a body change) into one hunk and
4053    // then over-expands the function context (t4015 #136). Use the gap-correct
4054    // grouping (same as the non-function-context path).
4055    let max_common_gap = context_lines
4056        .saturating_mul(2)
4057        .saturating_add(inter_hunk_context);
4058    let all_ops = diff.ops().to_vec();
4059    let op_groups = group_diff_ops_gap(all_ops.clone(), context_lines, max_common_gap);
4060
4061    let mut ranges: Vec<(usize, usize, usize, usize)> = Vec::new();
4062
4063    for ops in op_groups {
4064        if ops.is_empty() {
4065            continue;
4066        }
4067        let i1_anchor = func_context_old_anchor(&ops, n_old);
4068        let i1_end = hunk_old_change_end_exclusive(&ops);
4069        let skip_preimage_pull =
4070            append_with_whole_function_added(&ops, n_old, n_new, &new_lines, funcname_matcher);
4071        let hunk = UnifiedDiffHunk::new(ops, &diff, true);
4072        let hunk_str = format!("{hunk}");
4073        let header_line = hunk_str
4074            .lines()
4075            .next()
4076            .unwrap_or("")
4077            .trim_end_matches(['\r', '\n']);
4078        let Some((base_s1, _base_e1, _base_s2, _base_e2)) =
4079            parse_unified_hunk_header_ranges(header_line)
4080        else {
4081            continue;
4082        };
4083
4084        let ctx = context_lines;
4085        let (s1, e1, s2, e2) = if skip_preimage_pull {
4086            let s = n_old.saturating_sub(ctx);
4087            let s2 = map_old_line_to_new(&all_ops, s, n_new).min(n_new);
4088            (s, n_old, s2, n_new)
4089        } else {
4090            let mut s1 = base_s1.saturating_sub(ctx);
4091            let mut s2 = map_old_line_to_new(&all_ops, s1, n_new).min(n_new);
4092
4093            let base_pre_s1 = i1_anchor.saturating_sub(ctx);
4094            if base_pre_s1 < s1 {
4095                s1 = base_pre_s1;
4096                s2 = map_old_line_to_new(&all_ops, s1, n_new).min(n_new);
4097            }
4098
4099            let fs1 = expand_func_pre_start(s1, i1_anchor, n_old, &old_lines, funcname_matcher);
4100            if fs1 < s1 {
4101                s1 = fs1;
4102                s2 = map_old_line_to_new(&all_ops, s1, n_new).min(n_new);
4103            }
4104
4105            // `i1_end` is the exclusive end of the changed region; its post-image
4106            // context is `ctx` lines (Git's `xche->i1 + xche->chg1 + lctx`). The
4107            // hunk's `base_e1` already includes the group's trailing context, so
4108            // use the change end + ctx directly to avoid double-counting context
4109            // (which over-extended the hunk and merged separate functions — t4015 #136).
4110            let mut e1 = (i1_end + ctx).min(n_old);
4111            let mut e2 = map_old_line_to_new(&all_ops, e1, n_new).min(n_new);
4112            let fe1 = expand_func_post_end(e1, i1_end, n_old, &old_lines, funcname_matcher);
4113            if fe1 > e1 {
4114                e1 = fe1;
4115                e2 = map_old_line_to_new(&all_ops, e1, n_new).min(n_new);
4116            }
4117            (s1, e1, s2, e2)
4118        };
4119
4120        ranges.push((s1, e1, s2, e2));
4121    }
4122
4123    // Merge ranges whose function-context expansion made them overlap on the old
4124    // side (Git emits a single hunk in that case).
4125    ranges.sort_by_key(|r| (r.0, r.2));
4126    let mut merged: Vec<(usize, usize, usize, usize)> = Vec::with_capacity(ranges.len());
4127    for (s1, e1, s2, e2) in ranges {
4128        if let Some(last) = merged.last_mut() {
4129            if s1 < last.1 {
4130                last.1 = last.1.max(e1);
4131                last.3 = last.3.max(e2);
4132                continue;
4133            }
4134        }
4135        merged.push((s1, e1, s2, e2));
4136    }
4137    let ranges = merged;
4138
4139    let mut output = String::new();
4140    if old_path == "/dev/null" {
4141        output.push_str("--- /dev/null\n");
4142    } else if src_prefix.is_empty() {
4143        output.push_str(&format!("--- {old_path}\n"));
4144    } else {
4145        output.push_str("--- ");
4146        output.push_str(&format_diff_path_with_prefix(
4147            src_prefix,
4148            old_path,
4149            quote_path_fully,
4150        ));
4151        output.push('\n');
4152    }
4153    if new_path == "/dev/null" {
4154        output.push_str("+++ /dev/null\n");
4155    } else if dst_prefix.is_empty() {
4156        output.push_str(&format!("+++ {new_path}\n"));
4157    } else {
4158        output.push_str("+++ ");
4159        output.push_str(&format_diff_path_with_prefix(
4160            dst_prefix,
4161            new_path,
4162            quote_path_fully,
4163        ));
4164        output.push('\n');
4165    }
4166
4167    for (s1, e1, s2, e2) in ranges {
4168        if s1 >= e1 && s2 >= e2 {
4169            continue;
4170        }
4171        let old_seg =
4172            line_slice_for_diff_with_eof_nl(&old_lines, s1, e1, old_content.ends_with('\n'));
4173        let new_seg =
4174            line_slice_for_diff_with_eof_nl(&new_lines, s2, e2, new_content.ends_with('\n'));
4175        let inner_ctx = old_seg.lines().count().max(new_seg.lines().count()).max(1);
4176        let piece = unified_diff_with_prefix_and_funcname_and_algorithm(
4177            &old_seg,
4178            &new_seg,
4179            old_path,
4180            new_path,
4181            inner_ctx,
4182            0,
4183            src_prefix,
4184            dst_prefix,
4185            funcname_matcher,
4186            algorithm,
4187            false,
4188            false,
4189            indent_heuristic,
4190            quote_path_fully,
4191        );
4192        let shifted = shift_unified_hunk_headers_to_full_file(&piece, s1, s2);
4193        let with_func =
4194            enrich_unified_hunk_headers_funcname(&shifted, &old_lines, funcname_matcher);
4195        for line in with_func.lines() {
4196            if line.starts_with("--- ") || line.starts_with("+++ ") {
4197                continue;
4198            }
4199            output.push_str(line);
4200            output.push('\n');
4201        }
4202    }
4203
4204    output
4205}
4206
4207/// `piece` is a unified diff for a slice of the file; hunk headers use 1-based
4208/// coordinates relative to that slice. Shift them by `delta_old` / `delta_new`
4209/// (0-based offsets of the slice in the full file) so the combined patch applies
4210/// to the whole file.
4211fn shift_unified_hunk_headers_to_full_file(
4212    patch: &str,
4213    delta_old: usize,
4214    delta_new: usize,
4215) -> String {
4216    if delta_old == 0 && delta_new == 0 {
4217        return patch.to_owned();
4218    }
4219    let mut out = String::with_capacity(patch.len());
4220    for line in patch.lines() {
4221        if let Some(shifted) = shift_one_unified_hunk_header(line, delta_old, delta_new) {
4222            out.push_str(&shifted);
4223        } else {
4224            out.push_str(line);
4225        }
4226        out.push('\n');
4227    }
4228    out
4229}
4230
4231fn shift_one_unified_hunk_header(line: &str, delta_old: usize, delta_new: usize) -> Option<String> {
4232    let rest = line.strip_prefix("@@ ")?;
4233    let (old_chunk, after_plus) = rest.split_once(" +")?;
4234    let old_spec = old_chunk.strip_prefix('-')?;
4235    let (new_spec, suffix) = after_plus.split_once(" @@")?;
4236    let shifted_old = shift_unified_range_spec(old_spec, delta_old)?;
4237    let shifted_new = shift_unified_range_spec(new_spec, delta_new)?;
4238    Some(format!("@@ -{shifted_old} +{shifted_new} @@{suffix}"))
4239}
4240
4241fn shift_unified_range_spec(spec: &str, delta: usize) -> Option<String> {
4242    let spec = spec.trim();
4243    if let Some((start_s, count_s)) = spec.split_once(',') {
4244        let start: usize = start_s.parse().ok()?;
4245        let count: usize = count_s.parse().ok()?;
4246        Some(format!("{},{}", start.saturating_add(delta), count))
4247    } else {
4248        let start: usize = spec.parse().ok()?;
4249        Some(format!("{}", start.saturating_add(delta)))
4250    }
4251}
4252
4253/// Re-attach `@@ ... @@ <funcname>` using full-file line indices (inner diffs use slices).
4254fn enrich_unified_hunk_headers_funcname(
4255    patch: &str,
4256    full_old_lines: &[&str],
4257    funcname_matcher: Option<&FuncnameMatcher>,
4258) -> String {
4259    let mut out = String::with_capacity(patch.len());
4260    for line in patch.lines() {
4261        if let Some(fixed) = enrich_one_hunk_header_funcname(line, full_old_lines, funcname_matcher)
4262        {
4263            out.push_str(&fixed);
4264        } else {
4265            out.push_str(line);
4266        }
4267        out.push('\n');
4268    }
4269    out
4270}
4271
4272fn enrich_one_hunk_header_funcname(
4273    line: &str,
4274    full_old_lines: &[&str],
4275    funcname_matcher: Option<&FuncnameMatcher>,
4276) -> Option<String> {
4277    let after_at = line.strip_prefix("@@ ")?;
4278    let idx = after_at.find(" @@")?;
4279    let mid = after_at[..idx].trim();
4280    let tail = after_at[idx + 3..].trim_start();
4281    let header_for_parse = format!("@@ {mid} @@");
4282    let func = extract_function_context(&header_for_parse, full_old_lines, funcname_matcher);
4283    Some(if let Some(f) = func {
4284        format!("@@ {mid} @@ {f}")
4285    } else if !tail.is_empty() {
4286        format!("@@ {mid} @@ {tail}")
4287    } else {
4288        format!("@@ {mid} @@")
4289    })
4290}
4291
4292fn line_slice_for_diff_with_eof_nl(
4293    lines: &[&str],
4294    start: usize,
4295    end: usize,
4296    full_file_ends_with_newline: bool,
4297) -> String {
4298    if start >= end {
4299        return String::new();
4300    }
4301    let mut s = lines[start..end].join("\n");
4302    let slice_is_suffix_of_file = end == lines.len();
4303    let need_trailing_nl = if slice_is_suffix_of_file {
4304        full_file_ends_with_newline
4305    } else {
4306        true
4307    };
4308    if need_trailing_nl && !s.ends_with('\n') {
4309        s.push('\n');
4310    }
4311    s
4312}
4313
4314/// Map a 0-based old line index to the corresponding 0-based new line index using the full-file
4315/// diff ops (Git aligns context across deletions/insertions).
4316fn map_old_line_to_new(ops: &[similar::DiffOp], old_line: usize, n_new: usize) -> usize {
4317    use similar::DiffOp;
4318    let mut n = 0usize;
4319    for op in ops {
4320        match *op {
4321            DiffOp::Equal {
4322                old_index,
4323                new_index,
4324                len,
4325            } => {
4326                if old_index + len <= old_line {
4327                    n = new_index + len;
4328                    continue;
4329                }
4330                if old_index < old_line {
4331                    let take = old_line - old_index;
4332                    return (new_index + take).min(n_new);
4333                }
4334                return new_index.min(n_new);
4335            }
4336            DiffOp::Delete {
4337                old_index,
4338                old_len,
4339                new_index,
4340            } => {
4341                if old_index + old_len <= old_line {
4342                    n = new_index;
4343                    continue;
4344                }
4345                if old_index < old_line {
4346                    return new_index.min(n_new);
4347                }
4348            }
4349            DiffOp::Insert {
4350                old_index,
4351                new_index,
4352                new_len,
4353            } => {
4354                if old_index < old_line {
4355                    n = new_index + new_len;
4356                    continue;
4357                }
4358                if old_index == old_line {
4359                    // `old_line` is an exclusive end or insertion point aligned with this insert
4360                    // (e.g. EOF append maps to after the inserted block).
4361                    return (new_index + new_len).min(n_new);
4362                }
4363                return new_index.min(n_new);
4364            }
4365            DiffOp::Replace {
4366                old_index,
4367                old_len,
4368                new_index,
4369                new_len,
4370            } => {
4371                if old_index + old_len <= old_line {
4372                    n = new_index + new_len;
4373                    continue;
4374                }
4375                if old_index < old_line {
4376                    let into_old = old_line - old_index;
4377                    let mapped = new_index + into_old.min(new_len);
4378                    return mapped.min(n_new);
4379                }
4380                return new_index.min(n_new);
4381            }
4382        }
4383    }
4384    n.min(n_new)
4385}
4386
4387/// Parse `@@ -old +new @@` into 0-based half-open ranges in each file.
4388fn parse_unified_hunk_header_ranges(header: &str) -> Option<(usize, usize, usize, usize)> {
4389    let rest = header.strip_prefix("@@ ")?;
4390    let (old_tok, rest2) = rest.split_once(" +")?;
4391    let old_tok = old_tok.strip_prefix('-')?;
4392    let new_tok = rest2.split_once(" @@").map(|(a, _)| a)?;
4393
4394    fn parse_side(spec: &str) -> Option<(usize, usize)> {
4395        let spec = spec.trim();
4396        let (start_one_based, count) = if let Some((a, b)) = spec.split_once(',') {
4397            (a.parse::<usize>().ok()?, b.parse::<usize>().ok()?)
4398        } else {
4399            let s = spec.parse::<usize>().ok()?;
4400            (s, 1usize)
4401        };
4402        let s0 = start_one_based.saturating_sub(1);
4403        let e0 = s0.saturating_add(count);
4404        Some((s0, e0))
4405    }
4406
4407    let (os, oe) = parse_side(old_tok)?;
4408    let (ns, ne) = parse_side(new_tok)?;
4409    Some((os, oe, ns, ne))
4410}
4411
4412/// Git `xemit.c`: when a hunk only inserts at EOF (first inserted line is `new_index == n_old`)
4413/// and the added text already contains a funcname line, do not pull extra context from the preimage.
4414fn append_with_whole_function_added(
4415    ops: &[similar::DiffOp],
4416    n_old: usize,
4417    n_new: usize,
4418    new_lines: &[&str],
4419    matcher: Option<&FuncnameMatcher>,
4420) -> bool {
4421    use similar::DiffOp;
4422    if n_old == 0 {
4423        return false;
4424    }
4425    let mut only_ins_or_eq = true;
4426    let mut min_new_ins = usize::MAX;
4427    for op in ops {
4428        match *op {
4429            DiffOp::Equal { .. } => {}
4430            DiffOp::Insert {
4431                new_index, new_len, ..
4432            } => {
4433                min_new_ins = min_new_ins.min(new_index);
4434                if new_len == 0 {
4435                    only_ins_or_eq = false;
4436                }
4437            }
4438            DiffOp::Delete { .. } | DiffOp::Replace { .. } => {
4439                only_ins_or_eq = false;
4440            }
4441        }
4442    }
4443    let mut insert_at_eof = false;
4444    for op in ops {
4445        if let DiffOp::Insert { old_index, .. } = *op {
4446            if old_index == n_old {
4447                insert_at_eof = true;
4448                break;
4449            }
4450        }
4451    }
4452    let append_at_eof = min_new_ins == n_old || insert_at_eof;
4453    if !only_ins_or_eq || !append_at_eof || min_new_ins == usize::MAX {
4454        return false;
4455    }
4456    // Git only skips preimage pull when the inserted block is clearly a new logical
4457    // function (see `xemit.c` walking `xdf2` for `is_func_rec`). A loose "any line
4458    // looks like a function" check would match `return` / `printf` and break `-W`
4459    // hunks that still need preimage context (t4051 `extended`).
4460    let mut j = min_new_ins;
4461    while j < n_new {
4462        let line = new_lines[j];
4463        if line.trim().is_empty() {
4464            j += 1;
4465            continue;
4466        }
4467        if let Some(m) = matcher {
4468            if m.match_line(line).is_some() {
4469                return true;
4470            }
4471        } else if inserted_block_starts_with_c_like_function_definition(line) {
4472            return true;
4473        }
4474        j += 1;
4475    }
4476    false
4477}
4478
4479fn inserted_block_starts_with_c_like_function_definition(line: &str) -> bool {
4480    let t = line.trim_start();
4481    let Some(open_paren) = t.find('(') else {
4482        return false;
4483    };
4484    let head = &t[..open_paren];
4485    let tokens: Vec<&str> = head.split_whitespace().collect();
4486    if tokens.len() < 2 {
4487        // `printf(...)`, `return (`, etc. — not `return_type name(`.
4488        return false;
4489    }
4490    let nameish = tokens.last().copied().unwrap_or("");
4491    let name = nameish.trim_end_matches(['*', '&']);
4492    if name.is_empty() || !name.chars().all(|c| c.is_ascii_alphanumeric() || c == '_') {
4493        return false;
4494    }
4495    let type_or_modifier = |tok: &str| {
4496        matches!(
4497            tok,
4498            "static"
4499                | "extern"
4500                | "inline"
4501                | "void"
4502                | "int"
4503                | "char"
4504                | "short"
4505                | "long"
4506                | "float"
4507                | "double"
4508                | "unsigned"
4509                | "signed"
4510                | "struct"
4511                | "enum"
4512                | "union"
4513                | "const"
4514                | "volatile"
4515                | "typedef"
4516        )
4517    };
4518    tokens[..tokens.len() - 1]
4519        .iter()
4520        .any(|tok| type_or_modifier(tok))
4521}
4522
4523fn hunk_old_change_end_exclusive(ops: &[similar::DiffOp]) -> usize {
4524    use similar::DiffOp;
4525    let mut max_o = 0usize;
4526    for op in ops {
4527        match *op {
4528            DiffOp::Delete {
4529                old_index, old_len, ..
4530            } => {
4531                max_o = max_o.max(old_index + old_len);
4532            }
4533            DiffOp::Replace {
4534                old_index, old_len, ..
4535            } => {
4536                max_o = max_o.max(old_index + old_len);
4537            }
4538            DiffOp::Insert { old_index, .. } => {
4539                // Pure insertions do not consume old lines; Git's post-context anchor is the
4540                // insertion point (`old_index`), not 0 (t4051 `extended`).
4541                max_o = max_o.max(old_index);
4542            }
4543            DiffOp::Equal { .. } => {}
4544        }
4545    }
4546    max_o
4547}
4548
4549fn func_context_old_anchor(ops: &[similar::DiffOp], n_old: usize) -> usize {
4550    use similar::DiffOp;
4551    let mut has_delete_or_replace = false;
4552    let mut min_del = usize::MAX;
4553    let mut min_ins_old = usize::MAX;
4554
4555    for op in ops {
4556        match *op {
4557            DiffOp::Delete {
4558                old_index, old_len, ..
4559            } => {
4560                has_delete_or_replace = true;
4561                min_del = min_del.min(old_index);
4562                min_del = min_del.min(old_index + old_len.saturating_sub(1));
4563            }
4564            DiffOp::Replace {
4565                old_index, old_len, ..
4566            } => {
4567                has_delete_or_replace = true;
4568                min_del = min_del.min(old_index);
4569                min_del = min_del.min(old_index + old_len.saturating_sub(1));
4570            }
4571            DiffOp::Insert { old_index, .. } => {
4572                min_ins_old = min_ins_old.min(old_index);
4573            }
4574            DiffOp::Equal { .. } => {}
4575        }
4576    }
4577
4578    let mut i1 = if has_delete_or_replace {
4579        min_del
4580    } else if min_ins_old != usize::MAX {
4581        min_ins_old
4582    } else {
4583        0
4584    };
4585
4586    let pure_insert = ops
4587        .iter()
4588        .all(|op| matches!(op, DiffOp::Insert { .. } | DiffOp::Equal { .. }))
4589        && ops.iter().any(|op| matches!(op, DiffOp::Insert { .. }));
4590
4591    if pure_insert && i1 >= n_old && n_old > 0 {
4592        i1 = n_old - 1;
4593    }
4594
4595    i1.min(n_old.saturating_sub(1))
4596}
4597
4598fn expand_func_pre_start(
4599    s1: usize,
4600    i1: usize,
4601    n_old: usize,
4602    old_lines: &[&str],
4603    matcher: Option<&FuncnameMatcher>,
4604) -> usize {
4605    if n_old == 0 {
4606        return s1;
4607    }
4608    let i1 = i1.min(n_old.saturating_sub(1));
4609    let mut fs1 = get_func_line_backward(old_lines, i1, matcher).unwrap_or(i1);
4610    while fs1 > 0
4611        && !is_line_empty_for_func_context(old_lines[fs1 - 1])
4612        && !is_func_line(old_lines[fs1 - 1], matcher)
4613    {
4614        fs1 -= 1;
4615    }
4616    s1.min(fs1)
4617}
4618
4619fn expand_func_post_end(
4620    e1: usize,
4621    i1_end: usize,
4622    n_old: usize,
4623    old_lines: &[&str],
4624    matcher: Option<&FuncnameMatcher>,
4625) -> usize {
4626    let from = i1_end.min(n_old);
4627    let fe1 = get_func_line_forward(old_lines, from, matcher).unwrap_or(n_old);
4628    let mut fe1_adj = fe1;
4629    while fe1_adj > 0 && is_line_empty_for_func_context(old_lines[fe1_adj - 1]) {
4630        fe1_adj -= 1;
4631    }
4632    e1.max(fe1_adj).min(n_old)
4633}
4634
4635fn is_line_empty_for_func_context(line: &str) -> bool {
4636    line.chars().all(|c| c.is_whitespace())
4637}
4638
4639fn is_func_line(line: &str, matcher: Option<&FuncnameMatcher>) -> bool {
4640    if let Some(m) = matcher {
4641        return m.match_line(line).is_some();
4642    }
4643    let t = line.trim_end_matches(['\n', '\r']);
4644    if t.is_empty() {
4645        return false;
4646    }
4647    let b = t.as_bytes()[0];
4648    b.is_ascii_alphabetic() || b == b'_' || b == b'$'
4649}
4650
4651fn get_func_line_backward(
4652    old_lines: &[&str],
4653    start: usize,
4654    matcher: Option<&FuncnameMatcher>,
4655) -> Option<usize> {
4656    let mut l = start.min(old_lines.len().saturating_sub(1));
4657    if old_lines.is_empty() {
4658        return None;
4659    }
4660    loop {
4661        if is_func_line(old_lines[l], matcher) {
4662            return Some(l);
4663        }
4664        if l == 0 {
4665            break;
4666        }
4667        l -= 1;
4668    }
4669    None
4670}
4671
4672fn get_func_line_forward(
4673    old_lines: &[&str],
4674    start: usize,
4675    matcher: Option<&FuncnameMatcher>,
4676) -> Option<usize> {
4677    let mut l = start;
4678    while l < old_lines.len() {
4679        if is_func_line(old_lines[l], matcher) {
4680            return Some(l);
4681        }
4682        l += 1;
4683    }
4684    None
4685}
4686
4687/// Compute a unified diff with anchored lines.
4688///
4689/// Anchored lines that appear exactly once in both old and new content are
4690/// forced to match, splitting the diff into segments around those anchor points.
4691/// This produces diffs where the anchored text stays as context and surrounding
4692/// lines are shown as additions/removals.
4693///
4694/// Segment diffs use `algorithm`. When `use_git_histogram` is true, histogram uses imara-diff
4695/// (Git-compatible); otherwise `algorithm` is passed to `similar`.
4696pub fn anchored_unified_diff(
4697    old_content: &str,
4698    new_content: &str,
4699    old_path: &str,
4700    new_path: &str,
4701    context_lines: usize,
4702    anchors: &[String],
4703    algorithm: similar::Algorithm,
4704    use_git_histogram: bool,
4705    indent_heuristic: bool,
4706    quote_path_fully: bool,
4707) -> String {
4708    use crate::quote_path::format_diff_path_with_prefix;
4709    use similar::TextDiff;
4710
4711    let old_lines: Vec<&str> = old_content.lines().collect();
4712    let new_lines: Vec<&str> = new_content.lines().collect();
4713
4714    // Find anchored lines that appear exactly once in both old and new
4715    let mut anchor_pairs: Vec<(usize, usize)> = Vec::new(); // (old_idx, new_idx)
4716
4717    for anchor in anchors {
4718        let anchor_str = anchor.as_str();
4719
4720        // Count occurrences in old
4721        let old_positions: Vec<usize> = old_lines
4722            .iter()
4723            .enumerate()
4724            .filter(|(_, l)| l.trim_end() == anchor_str)
4725            .map(|(i, _)| i)
4726            .collect();
4727
4728        // Count occurrences in new
4729        let new_positions: Vec<usize> = new_lines
4730            .iter()
4731            .enumerate()
4732            .filter(|(_, l)| l.trim_end() == anchor_str)
4733            .map(|(i, _)| i)
4734            .collect();
4735
4736        // Only anchor if unique in both
4737        if old_positions.len() == 1 && new_positions.len() == 1 {
4738            anchor_pairs.push((old_positions[0], new_positions[0]));
4739        }
4740    }
4741
4742    // If no valid anchors, fall back to normal diff
4743    if anchor_pairs.is_empty() {
4744        return unified_diff_with_prefix_and_funcname_and_algorithm(
4745            old_content,
4746            new_content,
4747            old_path,
4748            new_path,
4749            context_lines,
4750            0,
4751            "a/",
4752            "b/",
4753            None,
4754            algorithm,
4755            false,
4756            use_git_histogram,
4757            indent_heuristic,
4758            quote_path_fully,
4759        );
4760    }
4761
4762    // Sort anchor pairs by their position in the old file
4763    anchor_pairs.sort_by_key(|&(old_idx, _)| old_idx);
4764
4765    // Filter to only keep pairs where new positions are also increasing
4766    // (longest increasing subsequence of new positions)
4767    let mut filtered: Vec<(usize, usize)> = Vec::new();
4768    for &pair in &anchor_pairs {
4769        if filtered.is_empty() || filtered.last().is_some_and(|last| pair.1 > last.1) {
4770            filtered.push(pair);
4771        }
4772    }
4773    let anchor_pairs = filtered;
4774
4775    // Build a modified version of old/new where we diff segments between anchors.
4776    // We'll construct the diff by processing segments:
4777    // - Before first anchor
4778    // - Between consecutive anchors
4779    // - After last anchor
4780    // Each anchor line itself is a fixed context match.
4781
4782    // Collect all diff operations
4783    struct LineDiffOp {
4784        tag: char, // ' ', '+', '-'
4785        line: String,
4786    }
4787
4788    let append_segment_diff =
4789        |ops: &mut Vec<LineDiffOp>, old_seg_input: &str, new_seg_input: &str| {
4790            use similar::ChangeTag;
4791            let old_ls: Vec<&str> = old_seg_input.lines().collect();
4792            let new_ls: Vec<&str> = new_seg_input.lines().collect();
4793            if old_ls.is_empty() && new_ls.is_empty() {
4794                return;
4795            }
4796            let seg_diff = TextDiff::configure()
4797                .algorithm(algorithm)
4798                .diff_slices(&old_ls, &new_ls);
4799            let raw = seg_diff.ops().to_vec();
4800            let compacted = diff_indent_heuristic::apply_change_compact_to_ops(
4801                &raw,
4802                &old_ls,
4803                &new_ls,
4804                indent_heuristic,
4805            );
4806            for op in &compacted {
4807                for ch in op.iter_changes(&old_ls, &new_ls) {
4808                    let t = match ch.tag() {
4809                        ChangeTag::Equal => ' ',
4810                        ChangeTag::Delete => '-',
4811                        ChangeTag::Insert => '+',
4812                    };
4813                    ops.push(LineDiffOp {
4814                        tag: t,
4815                        line: ch.value().to_string(),
4816                    });
4817                }
4818            }
4819        };
4820
4821    let mut ops: Vec<LineDiffOp> = Vec::new();
4822    let mut old_pos = 0usize;
4823    let mut new_pos = 0usize;
4824
4825    for &(old_anchor, new_anchor) in &anchor_pairs {
4826        // Diff the segment before this anchor
4827        let old_segment: Vec<&str> = old_lines[old_pos..old_anchor].to_vec();
4828        let new_segment: Vec<&str> = new_lines[new_pos..new_anchor].to_vec();
4829
4830        let old_seg_text = old_segment.join("\n");
4831        let new_seg_text = new_segment.join("\n");
4832
4833        if !old_seg_text.is_empty() || !new_seg_text.is_empty() {
4834            let old_seg_input = if old_seg_text.is_empty() {
4835                String::new()
4836            } else {
4837                format!("{}\n", old_seg_text)
4838            };
4839            let new_seg_input = if new_seg_text.is_empty() {
4840                String::new()
4841            } else {
4842                format!("{}\n", new_seg_text)
4843            };
4844            append_segment_diff(&mut ops, &old_seg_input, &new_seg_input);
4845        }
4846
4847        // The anchor line itself is always context
4848        ops.push(LineDiffOp {
4849            tag: ' ',
4850            line: old_lines[old_anchor].to_string(),
4851        });
4852
4853        old_pos = old_anchor + 1;
4854        new_pos = new_anchor + 1;
4855    }
4856
4857    // Diff the remaining segment after the last anchor
4858    let old_segment: Vec<&str> = old_lines[old_pos..].to_vec();
4859    let new_segment: Vec<&str> = new_lines[new_pos..].to_vec();
4860    let old_seg_text = old_segment.join("\n");
4861    let new_seg_text = new_segment.join("\n");
4862
4863    if !old_seg_text.is_empty() || !new_seg_text.is_empty() {
4864        let old_seg_input = if old_seg_text.is_empty() {
4865            String::new()
4866        } else {
4867            format!("{}\n", old_seg_text)
4868        };
4869        let new_seg_input = if new_seg_text.is_empty() {
4870            String::new()
4871        } else {
4872            format!("{}\n", new_seg_text)
4873        };
4874        append_segment_diff(&mut ops, &old_seg_input, &new_seg_input);
4875    }
4876
4877    // Now format as unified diff with hunks
4878    let mut output = String::new();
4879    if old_path == "/dev/null" {
4880        output.push_str("--- /dev/null\n");
4881    } else {
4882        output.push_str("--- ");
4883        output.push_str(&format_diff_path_with_prefix(
4884            "a/",
4885            old_path,
4886            quote_path_fully,
4887        ));
4888        output.push('\n');
4889    }
4890    if new_path == "/dev/null" {
4891        output.push_str("+++ /dev/null\n");
4892    } else {
4893        output.push_str("+++ ");
4894        output.push_str(&format_diff_path_with_prefix(
4895            "b/",
4896            new_path,
4897            quote_path_fully,
4898        ));
4899        output.push('\n');
4900    }
4901
4902    // Group ops into hunks with context
4903    let total_ops = ops.len();
4904    if total_ops == 0 {
4905        return output;
4906    }
4907
4908    // Find ranges of changes
4909    let mut hunks: Vec<(usize, usize)> = Vec::new(); // (start, end) indices into ops
4910    let mut i = 0;
4911    while i < total_ops {
4912        if ops[i].tag != ' ' {
4913            let start = i.saturating_sub(context_lines);
4914            let mut end = i;
4915            // Extend to include consecutive changes and their context
4916            while end < total_ops {
4917                if ops[end].tag != ' ' {
4918                    end += 1;
4919                    continue;
4920                }
4921                // Check if there's another change within context_lines
4922                let mut next_change = end;
4923                while next_change < total_ops && ops[next_change].tag == ' ' {
4924                    next_change += 1;
4925                }
4926                if next_change < total_ops && next_change - end <= context_lines * 2 {
4927                    end = next_change + 1;
4928                } else {
4929                    end = (end + context_lines).min(total_ops);
4930                    break;
4931                }
4932            }
4933            // Merge with previous hunk if overlapping
4934            if let Some(last) = hunks.last_mut() {
4935                if start <= last.1 {
4936                    last.1 = end;
4937                } else {
4938                    hunks.push((start, end));
4939                }
4940            } else {
4941                hunks.push((start, end));
4942            }
4943            i = end;
4944        } else {
4945            i += 1;
4946        }
4947    }
4948
4949    // Output each hunk
4950    for (start, end) in hunks {
4951        // Count old/new lines in this hunk
4952        let mut old_start = 1usize;
4953        let mut new_start = 1usize;
4954        // Calculate line numbers by counting ops before this hunk
4955        for op in &ops[..start] {
4956            match op.tag {
4957                ' ' => {
4958                    old_start += 1;
4959                    new_start += 1;
4960                }
4961                '-' => {
4962                    old_start += 1;
4963                }
4964                '+' => {
4965                    new_start += 1;
4966                }
4967                _ => {}
4968            }
4969        }
4970        let mut old_count = 0usize;
4971        let mut new_count = 0usize;
4972        for op in &ops[start..end] {
4973            match op.tag {
4974                ' ' => {
4975                    old_count += 1;
4976                    new_count += 1;
4977                }
4978                '-' => {
4979                    old_count += 1;
4980                }
4981                '+' => {
4982                    new_count += 1;
4983                }
4984                _ => {}
4985            }
4986        }
4987
4988        output.push_str(&format!(
4989            "@@ -{},{} +{},{} @@\n",
4990            old_start, old_count, new_start, new_count
4991        ));
4992        for op in &ops[start..end] {
4993            output.push(op.tag);
4994            output.push_str(&op.line);
4995            output.push('\n');
4996        }
4997    }
4998
4999    output
5000}
5001
5002/// Extract function context for a hunk header.
5003///
5004/// Given a hunk header like `@@ -8,7 +8,7 @@`, find the last line
5005/// before line 8 in the old content that looks like a function header
5006/// (starts with a non-whitespace character, like Git's default).
5007fn extract_function_context(
5008    header: &str,
5009    old_lines: &[&str],
5010    funcname_matcher: Option<&FuncnameMatcher>,
5011) -> Option<String> {
5012    // Parse the old start line number from "@@ -<start>,<count> ..."
5013    let at_pos = header.find("-")?;
5014    let rest = &header[at_pos + 1..];
5015    let comma_or_space = rest.find([',', ' '])?;
5016    let start_str = &rest[..comma_or_space];
5017    let start_line: usize = start_str.parse().ok()?;
5018
5019    // Parse the old line count; "@@ -<start>,<count> ..." (no comma means count 1).
5020    // Only look for the comma inside the old-range token itself — searching the
5021    // whole remainder would pick up the comma from the new side (e.g. "+0,0").
5022    let old_token_end = rest.find([' ', '\t']).unwrap_or(rest.len());
5023    let old_token = &rest[..old_token_end];
5024    let old_count: usize = if let Some(comma) = old_token.find(',') {
5025        old_token[comma + 1..].parse().unwrap_or(1)
5026    } else {
5027        1
5028    };
5029
5030    if start_line == 0 {
5031        return None;
5032    }
5033
5034    // Look backwards for a line that matches the funcname pattern. start_line is
5035    // 1-indexed. For a normal hunk the first changed pre-image line is
5036    // old_lines[start_line-1], so we search lines strictly before it
5037    // (old_lines[0..start_line-1]). For a pure insertion (old count 0) the
5038    // content is inserted *after* old line start_line, so Git's function search
5039    // begins at that line itself: search old_lines[0..start_line].
5040    let search_end = if old_count == 0 {
5041        start_line.min(old_lines.len())
5042    } else {
5043        if start_line <= 1 {
5044            return None;
5045        }
5046        (start_line - 1).min(old_lines.len())
5047    };
5048    let truncate = |text: &str| {
5049        if text.len() > 80 {
5050            let mut end = 80;
5051            while end > 0 && !text.is_char_boundary(end) {
5052                end -= 1;
5053            }
5054            text[..end].to_owned()
5055        } else {
5056            text.to_owned()
5057        }
5058    };
5059
5060    for i in (0..search_end).rev() {
5061        let line = old_lines[i];
5062        if line.is_empty() {
5063            continue;
5064        }
5065        if let Some(matcher) = funcname_matcher {
5066            if let Some(matched) = matcher.match_line(line) {
5067                return Some(truncate(&matched));
5068            }
5069            continue;
5070        }
5071
5072        let first = line.as_bytes()[0];
5073        if first.is_ascii_alphabetic() || first == b'_' || first == b'$' {
5074            return Some(truncate(line.trim_end_matches(char::is_whitespace)));
5075        }
5076    }
5077    None
5078}
5079
5080/// Generate diff stat output (file name + insertions/deletions).
5081///
5082/// Returns a single line like: ` file.txt | 5 ++---`
5083pub fn format_stat_line(
5084    path: &str,
5085    insertions: usize,
5086    deletions: usize,
5087    max_path_len: usize,
5088) -> String {
5089    format_stat_line_width(path, insertions, deletions, max_path_len, 0)
5090}
5091
5092pub fn format_stat_line_width(
5093    path: &str,
5094    insertions: usize,
5095    deletions: usize,
5096    max_path_len: usize,
5097    count_width: usize,
5098) -> String {
5099    let total = insertions + deletions;
5100    let plus = "+".repeat(insertions.min(50));
5101    let minus = "-".repeat(deletions.min(50));
5102    let cw = if count_width > 0 {
5103        count_width
5104    } else {
5105        format!("{}", total).len()
5106    };
5107    let bar = format!("{}{}", plus, minus);
5108    if bar.is_empty() {
5109        format!(
5110            " {:<width$} | {:>cw$}",
5111            path,
5112            total,
5113            width = max_path_len,
5114            cw = cw
5115        )
5116    } else {
5117        format!(
5118            " {:<width$} | {:>cw$} {}",
5119            path,
5120            total,
5121            bar,
5122            width = max_path_len,
5123            cw = cw
5124        )
5125    }
5126}
5127
5128/// Normalise one line like Git's `-b` / `--ignore-space-change`.
5129#[must_use]
5130pub fn normalize_ignore_space_change_line(line: &str) -> String {
5131    let mut result = String::with_capacity(line.len());
5132    let mut in_space = false;
5133    for c in line.chars() {
5134        if c.is_whitespace() {
5135            if !in_space {
5136                result.push(' ');
5137                in_space = true;
5138            }
5139        } else {
5140            result.push(c);
5141            in_space = false;
5142        }
5143    }
5144    while result.ends_with(' ') {
5145        result.pop();
5146    }
5147    result
5148}
5149
5150/// Normalise text like Git's `-b` / `--ignore-space-change`: on each line, collapse runs of
5151/// whitespace to a single ASCII space and trim trailing spaces.
5152///
5153/// Line breaks are preserved by splitting on [`str::lines`] and rejoining with `\n` (same approach
5154/// as the porcelain `diff` whitespace handling in `grit`).
5155#[must_use]
5156pub fn normalize_ignore_space_change(content: &str) -> String {
5157    content
5158        .lines()
5159        .map(normalize_ignore_space_change_line)
5160        .collect::<Vec<_>>()
5161        .join("\n")
5162}
5163
5164/// Count insertions and deletions between two strings.
5165///
5166/// Returns `(insertions, deletions)`.
5167pub fn count_changes(old_content: &str, new_content: &str) -> (usize, usize) {
5168    count_changes_with_algorithm(old_content, new_content, similar::Algorithm::Myers, false)
5169}
5170
5171/// Count insertions and deletions using the given line-diff algorithm.
5172///
5173/// Git's `--stat` / `--numstat` follow the configured diff algorithm; this mirrors that by
5174/// running [`similar::TextDiff`] with an explicit [`similar::Algorithm`].
5175#[must_use]
5176pub fn count_changes_with_algorithm(
5177    old_content: &str,
5178    new_content: &str,
5179    algorithm: similar::Algorithm,
5180    use_git_histogram: bool,
5181) -> (usize, usize) {
5182    if use_git_histogram {
5183        use imara_diff::{Algorithm, Diff, InternedInput};
5184        let input = InternedInput::new(old_content, new_content);
5185        let mut d = Diff::compute(Algorithm::Histogram, &input);
5186        d.postprocess_lines(&input);
5187        return (d.count_additions() as usize, d.count_removals() as usize);
5188    }
5189
5190    use similar::{ChangeTag, TextDiff};
5191
5192    let diff = TextDiff::configure()
5193        .algorithm(algorithm)
5194        .diff_lines(old_content, new_content);
5195    let mut ins = 0;
5196    let mut del = 0;
5197
5198    for change in diff.iter_all_changes() {
5199        match change.tag() {
5200            ChangeTag::Insert => ins += 1,
5201            ChangeTag::Delete => del += 1,
5202            ChangeTag::Equal => {}
5203        }
5204    }
5205
5206    (ins, del)
5207}
5208
5209/// Line count for diffstat/`--numstat`, matching Git's `count_lines()` in `diff.c`.
5210///
5211/// Counts newline-terminated lines; a final line without trailing newline still counts as one line.
5212/// An empty buffer yields `0`.
5213#[must_use]
5214pub fn count_git_lines(data: &[u8]) -> usize {
5215    if data.is_empty() {
5216        return 0;
5217    }
5218    let mut count = 0usize;
5219    let mut nl_just_seen = false;
5220    for &ch in data {
5221        if ch == b'\n' {
5222            count += 1;
5223            nl_just_seen = true;
5224        } else {
5225            nl_just_seen = false;
5226        }
5227    }
5228    if !nl_just_seen {
5229        count += 1;
5230    }
5231    count
5232}
5233
5234/// Internal maximum diff score used by Git rename/break heuristics (`MAX_SCORE` in `diffcore.h`).
5235pub const GIT_DIFF_MAX_SCORE: u64 = 60_000;
5236const DIFF_MAX_SCORE: u64 = GIT_DIFF_MAX_SCORE;
5237const DIFF_MINIMUM_BREAK_SIZE: usize = 400;
5238const DIFF_DEFAULT_BREAK_SCORE: u64 = 30_000;
5239/// Default break threshold (`DEFAULT_BREAK_SCORE` in `diffcore.h`), internal 0–[`GIT_DIFF_MAX_SCORE`] scale.
5240pub const GIT_DIFF_DEFAULT_BREAK_SCORE: u64 = DIFF_DEFAULT_BREAK_SCORE;
5241/// Default merge threshold after a break (`DEFAULT_MERGE_SCORE` in `diffcore.h`): pairs broken for
5242/// rename/copy but not consumed are merged back when deletion-weight is below this (60% by default).
5243pub const GIT_DIFF_DEFAULT_MERGE_SCORE_AFTER_BREAK: u64 = 36_000;
5244const DIFF_HASHBASE: u32 = 107_927;
5245
5246#[derive(Clone, Copy, Default)]
5247struct SpanSlot {
5248    hashval: u32,
5249    cnt: u32,
5250}
5251
5252struct SpanHashTop {
5253    alloc_log2: u8,
5254    free_slots: i32,
5255    data: Vec<SpanSlot>,
5256}
5257
5258impl SpanHashTop {
5259    fn new(initial_log2: u8) -> Self {
5260        let cap = 1usize << initial_log2;
5261        Self {
5262            alloc_log2: initial_log2,
5263            free_slots: initial_free(initial_log2),
5264            data: vec![SpanSlot::default(); cap],
5265        }
5266    }
5267
5268    fn len(&self) -> usize {
5269        1usize << self.alloc_log2
5270    }
5271
5272    fn add_span(&mut self, hashval: u32, cnt: u32) {
5273        loop {
5274            let lim = self.len();
5275            let mut bucket = (hashval as usize) & (lim - 1);
5276            loop {
5277                let h = &mut self.data[bucket];
5278                if h.cnt == 0 {
5279                    h.hashval = hashval;
5280                    h.cnt = cnt;
5281                    self.free_slots -= 1;
5282                    if self.free_slots < 0 {
5283                        self.rehash();
5284                        break;
5285                    }
5286                    return;
5287                }
5288                if h.hashval == hashval {
5289                    h.cnt = h.cnt.saturating_add(cnt);
5290                    return;
5291                }
5292                bucket += 1;
5293                if bucket >= lim {
5294                    bucket = 0;
5295                }
5296            }
5297        }
5298    }
5299
5300    fn rehash(&mut self) {
5301        let old = std::mem::take(&mut self.data);
5302        let old_log = self.alloc_log2;
5303        self.alloc_log2 = old_log.saturating_add(1);
5304        let new_len = 1usize << self.alloc_log2;
5305        self.free_slots = initial_free(self.alloc_log2);
5306        self.data = vec![SpanSlot::default(); new_len];
5307        let old_sz = 1usize << old_log;
5308        for o in old.iter().take(old_sz) {
5309            let o = *o;
5310            if o.cnt == 0 {
5311                continue;
5312            }
5313            self.add_span_after_rehash(o.hashval, o.cnt);
5314        }
5315    }
5316
5317    fn add_span_after_rehash(&mut self, hashval: u32, cnt: u32) {
5318        loop {
5319            let lim = self.len();
5320            let mut bucket = (hashval as usize) & (lim - 1);
5321            loop {
5322                let h = &mut self.data[bucket];
5323                if h.cnt == 0 {
5324                    h.hashval = hashval;
5325                    h.cnt = cnt;
5326                    self.free_slots -= 1;
5327                    if self.free_slots < 0 {
5328                        self.rehash();
5329                        break;
5330                    }
5331                    return;
5332                }
5333                if h.hashval == hashval {
5334                    h.cnt = h.cnt.saturating_add(cnt);
5335                    return;
5336                }
5337                bucket += 1;
5338                if bucket >= lim {
5339                    bucket = 0;
5340                }
5341            }
5342        }
5343    }
5344
5345    fn sort_by_hashval(&mut self) {
5346        let sz = self.len();
5347        self.data[..sz].sort_by(|a, b| {
5348            if a.cnt == 0 {
5349                return std::cmp::Ordering::Greater;
5350            }
5351            if b.cnt == 0 {
5352                return std::cmp::Ordering::Less;
5353            }
5354            a.hashval.cmp(&b.hashval)
5355        });
5356    }
5357}
5358
5359fn initial_free(sz_log2: u8) -> i32 {
5360    let sz = sz_log2 as i32;
5361    ((1i32 << sz_log2) * (sz - 3) / sz).max(0)
5362}
5363
5364fn hash_blob_spans(buf: &[u8], is_text: bool) -> SpanHashTop {
5365    let mut hash = SpanHashTop::new(9);
5366    let mut n = 0u32;
5367    let mut accum1: u32 = 0;
5368    let mut accum2: u32 = 0;
5369    let mut i = 0usize;
5370    while i < buf.len() {
5371        let c = buf[i] as u32;
5372        let old_1 = accum1;
5373        i += 1;
5374
5375        if is_text && c == b'\r' as u32 && i < buf.len() && buf[i] == b'\n' {
5376            continue;
5377        }
5378
5379        accum1 = accum1.wrapping_shl(7) ^ accum2.wrapping_shr(25);
5380        accum2 = accum2.wrapping_shl(7) ^ old_1.wrapping_shr(25);
5381        accum1 = accum1.wrapping_add(c);
5382        n += 1;
5383        if n < 64 && c != b'\n' as u32 {
5384            continue;
5385        }
5386        let hashval = (accum1.wrapping_add(accum2.wrapping_mul(0x61))) % DIFF_HASHBASE;
5387        hash.add_span(hashval, n);
5388        n = 0;
5389        accum1 = 0;
5390        accum2 = 0;
5391    }
5392    if n > 0 {
5393        let hashval = (accum1.wrapping_add(accum2.wrapping_mul(0x61))) % DIFF_HASHBASE;
5394        hash.add_span(hashval, n);
5395    }
5396    hash.sort_by_hashval();
5397    hash
5398}
5399
5400/// Approximate copied vs added material between two blobs (Git `diffcore_count_changes`).
5401///
5402/// Returns `(copied_bytes_from_src, literal_added_bytes_in_dst)` matching Git's
5403/// `diffcore_count_changes` semantics (used for `--dirstat=changes` damage).
5404#[must_use]
5405pub fn diffcore_count_changes(old: &[u8], new: &[u8]) -> (u64, u64) {
5406    let src_is_text = !crate::merge_file::is_binary(old);
5407    let dst_is_text = !crate::merge_file::is_binary(new);
5408    let src_count = hash_blob_spans(old, src_is_text);
5409    let dst_count = hash_blob_spans(new, dst_is_text);
5410    let mut sc: u64 = 0;
5411    let mut la: u64 = 0;
5412    let mut si = 0usize;
5413    let mut di = 0usize;
5414    let src_len = src_count.len();
5415    let dst_len = dst_count.len();
5416    loop {
5417        if si >= src_len || src_count.data[si].cnt == 0 {
5418            break;
5419        }
5420        let s_hash = src_count.data[si].hashval;
5421        let s_cnt = u64::from(src_count.data[si].cnt);
5422        while di < dst_len && dst_count.data[di].cnt != 0 && dst_count.data[di].hashval < s_hash {
5423            la += u64::from(dst_count.data[di].cnt);
5424            di += 1;
5425        }
5426        let mut dst_cnt = 0u64;
5427        if di < dst_len && dst_count.data[di].cnt != 0 && dst_count.data[di].hashval == s_hash {
5428            dst_cnt = u64::from(dst_count.data[di].cnt);
5429            di += 1;
5430        }
5431        if s_cnt < dst_cnt {
5432            la += dst_cnt - s_cnt;
5433            sc += s_cnt;
5434        } else {
5435            sc += dst_cnt;
5436        }
5437        si += 1;
5438    }
5439    while di < dst_len && dst_count.data[di].cnt != 0 {
5440        la += u64::from(dst_count.data[di].cnt);
5441        di += 1;
5442    }
5443    (sc, la)
5444}
5445
5446/// Whether this modified blob pair should use Git's "complete rewrite" diffstat path when
5447/// `--break-rewrites` is in effect (`should_break` in `diffcore-break.c`).
5448#[must_use]
5449pub fn should_break_rewrite_for_stat(old: &[u8], new: &[u8]) -> bool {
5450    should_break_rewrite_inner(old, new, DIFF_DEFAULT_BREAK_SCORE)
5451}
5452
5453/// Whether an in-place blob edit should be split into delete+create for rename/copy (`should_break`
5454/// in `diffcore-break.c`). `break_score` is on the internal 0–[`GIT_DIFF_MAX_SCORE`] scale (default
5455/// [`DIFF_DEFAULT_BREAK_SCORE`]).
5456#[must_use]
5457pub fn should_break_rewrite_pair(old: &[u8], new: &[u8], break_score: u64) -> bool {
5458    should_break_rewrite_inner(old, new, break_score)
5459}
5460
5461/// Parse a single Git `parse_rename_score` token (`50`, `50%`, decimal forms) into internal
5462/// 0–[`GIT_DIFF_MAX_SCORE`] units.
5463pub fn parse_diff_rename_score_token(arg: &str) -> Option<u64> {
5464    let mut num: u64 = 0;
5465    let mut scale: u64 = 1;
5466    let mut dot = false;
5467    let mut saw_digit = false;
5468    for ch in arg.chars() {
5469        if !dot && ch == '.' {
5470            scale = 1;
5471            dot = true;
5472            continue;
5473        }
5474        if ch == '%' {
5475            scale = if dot { scale.saturating_mul(100) } else { 100 };
5476            break;
5477        }
5478        if ch.is_ascii_digit() {
5479            saw_digit = true;
5480            if scale < 100_000 {
5481                scale = scale.saturating_mul(10);
5482                num = num.saturating_mul(10) + u64::from(ch as u8 - b'0');
5483            }
5484        } else {
5485            break;
5486        }
5487    }
5488    if !saw_digit {
5489        return None;
5490    }
5491    Some(if num >= scale {
5492        GIT_DIFF_MAX_SCORE
5493    } else {
5494        GIT_DIFF_MAX_SCORE * num / scale
5495    })
5496}
5497
5498/// Git `merge_score` from `diffcore-break.c` when a pair is considered broken: how much of the
5499/// source blob was removed (0–[`DIFF_MAX_SCORE`] scale). Used for `dissimilarity index` metadata.
5500#[must_use]
5501pub fn rewrite_merge_score(old: &[u8], new: &[u8]) -> Option<u64> {
5502    if old.is_empty() {
5503        return None;
5504    }
5505    let max_size = old.len().max(new.len());
5506    if max_size < DIFF_MINIMUM_BREAK_SIZE {
5507        return None;
5508    }
5509    let (src_copied, _) = diffcore_count_changes(old, new);
5510    let src_copied = src_copied.min(old.len() as u64);
5511    let src_removed = (old.len() as u64).saturating_sub(src_copied);
5512    Some(src_removed * DIFF_MAX_SCORE / old.len() as u64)
5513}
5514
5515/// Percentage shown in `dissimilarity index N%` for a rewrite (`similarity_index` in Git's diff.c).
5516#[must_use]
5517pub fn rewrite_dissimilarity_index_percent(old: &[u8], new: &[u8]) -> Option<u32> {
5518    let score = rewrite_merge_score(old, new)?;
5519    Some((score * 100 / DIFF_MAX_SCORE).min(100) as u32)
5520}
5521
5522fn should_break_rewrite_inner(src: &[u8], dst: &[u8], break_score: u64) -> bool {
5523    if src.is_empty() {
5524        return false;
5525    }
5526    let max_size = src.len().max(dst.len());
5527    if max_size < DIFF_MINIMUM_BREAK_SIZE {
5528        return false;
5529    }
5530    let (src_copied, literal_added) = diffcore_count_changes(src, dst);
5531    let src_copied = src_copied.min(src.len() as u64);
5532    let mut literal_added = literal_added;
5533    let dst_len = dst.len() as u64;
5534    if src_copied < dst_len && literal_added + src_copied > dst_len {
5535        literal_added = dst_len.saturating_sub(src_copied);
5536    }
5537    let src_removed = (src.len() as u64).saturating_sub(src_copied);
5538    let merge_score = src_removed * DIFF_MAX_SCORE / src.len() as u64;
5539    if merge_score > break_score {
5540        return true;
5541    }
5542    let delta_size = src_removed.saturating_add(literal_added);
5543    if delta_size * DIFF_MAX_SCORE / (max_size as u64) < break_score {
5544        return false;
5545    }
5546    let s = src.len() as u64;
5547    if (s * break_score < src_removed * DIFF_MAX_SCORE)
5548        && (literal_added * 20 < src_removed)
5549        && (literal_added * 20 < src_copied)
5550    {
5551        return false;
5552    }
5553    true
5554}
5555
5556// ── Helpers ─────────────────────────────────────────────────────────
5557
5558/// Flatten a tree object recursively into a sorted list of (path, mode, oid).
5559struct FlatEntry {
5560    path: String,
5561    mode: u32,
5562    oid: ObjectId,
5563}
5564
5565fn flatten_tree(odb: &Odb, tree_oid: &ObjectId, prefix: &str) -> Result<Vec<FlatEntry>> {
5566    let entries = read_tree(odb, tree_oid)?;
5567    let mut result = Vec::new();
5568
5569    for entry in entries {
5570        let name_str = String::from_utf8_lossy(&entry.name);
5571        let path = format_path(prefix, &name_str);
5572        if is_tree_mode(entry.mode) {
5573            let nested = flatten_tree(odb, &entry.oid, &path)?;
5574            result.extend(nested);
5575        } else {
5576            result.push(FlatEntry {
5577                path,
5578                mode: entry.mode,
5579                oid: entry.oid,
5580            });
5581        }
5582    }
5583
5584    Ok(result)
5585}
5586
5587/// Paths present in `HEAD`'s tree with mode and blob/commit OID (for status porcelain v2).
5588pub fn head_path_states(
5589    odb: &Odb,
5590    head_tree: Option<&ObjectId>,
5591) -> Result<std::collections::BTreeMap<String, (u32, ObjectId)>> {
5592    let mut m = std::collections::BTreeMap::new();
5593    let Some(t) = head_tree else {
5594        return Ok(m);
5595    };
5596    for fe in flatten_tree(odb, t, "")? {
5597        m.insert(fe.path, (fe.mode, fe.oid));
5598    }
5599    Ok(m)
5600}
5601
5602/// Whether a mode represents a tree (directory).
5603fn is_tree_mode(mode: u32) -> bool {
5604    mode == 0o040000
5605}
5606
5607/// Build a path with an optional prefix.
5608fn format_path(prefix: &str, name: &str) -> String {
5609    if prefix.is_empty() {
5610        name.to_owned()
5611    } else {
5612        format!("{prefix}/{name}")
5613    }
5614}
5615
5616/// Format a numeric mode as a zero-padded octal string.
5617pub fn format_mode(mode: u32) -> String {
5618    format!("{mode:06o}")
5619}
5620
5621/// Read the HEAD commit OID from a submodule checkout directory.
5622///
5623/// Returns `None` if the path is missing, not a submodule checkout, or has no resolvable HEAD.
5624#[must_use]
5625pub fn read_submodule_head_for_checkout(sub_dir: &Path) -> Option<ObjectId> {
5626    read_submodule_head(sub_dir)
5627}
5628
5629/// First line of a commit's message for `git diff --submodule=log` output.
5630///
5631/// Honors `encoding` in the commit object (Latin-1 vs UTF-8) using the same
5632/// rules as Git's submodule summary.
5633#[must_use]
5634pub fn submodule_commit_subject_line(c: &CommitData) -> String {
5635    let enc = c.encoding.as_deref().unwrap_or("UTF-8");
5636    let is_latin1 = enc.eq_ignore_ascii_case("ISO8859-1")
5637        || enc.eq_ignore_ascii_case("ISO-8859-1")
5638        || enc.eq_ignore_ascii_case("LATIN1")
5639        || enc.eq_ignore_ascii_case("ISO-8859-15");
5640    if let Some(raw) = c.raw_message.as_deref() {
5641        let line = raw.split(|b| *b == b'\n').next().unwrap_or(raw);
5642        if is_latin1 {
5643            return line
5644                .iter()
5645                .map(|&b| b as char)
5646                .collect::<String>()
5647                .trim()
5648                .to_owned();
5649        }
5650        return String::from_utf8_lossy(line).trim().to_string();
5651    }
5652    c.message.lines().next().unwrap_or("").trim().to_owned()
5653}
5654
5655/// True when `sub_dir` is an empty directory (or missing), i.e. the placeholder left by
5656/// `git apply --index` before `git submodule update`.
5657fn submodule_worktree_is_unpopulated_placeholder(sub_dir: &Path) -> bool {
5658    match fs::read_dir(sub_dir) {
5659        Ok(mut it) => it.next().is_none(),
5660        Err(e) if e.kind() == std::io::ErrorKind::NotFound => true,
5661        Err(_) => false,
5662    }
5663}
5664
5665fn read_submodule_head(sub_dir: &Path) -> Option<ObjectId> {
5666    read_submodule_head_oid(sub_dir)
5667}
5668
5669/// Resolve the embedded git directory for a submodule work tree (`sub_dir/.git`).
5670#[must_use]
5671pub fn submodule_embedded_git_dir(sub_dir: &Path) -> Option<PathBuf> {
5672    let gitfile = sub_dir.join(".git");
5673    if gitfile.is_file() {
5674        let content = fs::read_to_string(&gitfile).ok()?;
5675        let gitdir = content
5676            .lines()
5677            .find_map(|l| l.strip_prefix("gitdir: "))?
5678            .trim();
5679        Some(if Path::new(gitdir).is_absolute() {
5680            PathBuf::from(gitdir)
5681        } else {
5682            sub_dir.join(gitdir)
5683        })
5684    } else if gitfile.is_dir() {
5685        Some(gitfile)
5686    } else {
5687        None
5688    }
5689}
5690
5691/// Walk upward from `sub_dir` to find the nearest containing Git work tree.
5692fn find_superproject_git(sub_dir: &Path) -> Option<(PathBuf, PathBuf)> {
5693    let mut cur = sub_dir.parent()?;
5694    loop {
5695        let git_path = cur.join(".git");
5696        if git_path.exists() {
5697            let gd = if git_path.is_file() {
5698                let content = fs::read_to_string(&git_path).ok()?;
5699                let line = content
5700                    .lines()
5701                    .find_map(|l| l.strip_prefix("gitdir: "))?
5702                    .trim();
5703                if Path::new(line).is_absolute() {
5704                    PathBuf::from(line)
5705                } else {
5706                    cur.join(line)
5707                }
5708            } else {
5709                git_path
5710            };
5711            return Some((cur.to_path_buf(), gd));
5712        }
5713        cur = cur.parent()?;
5714    }
5715}
5716
5717/// Read the HEAD commit OID from a submodule working tree directory.
5718///
5719/// Handles both embedded `.git` directories and `gitdir:` gitfiles pointing at
5720/// `.git/modules/...` (or other locations). Returns `None` if the path is not
5721/// a checkout or has no resolvable HEAD.
5722pub fn read_submodule_head_oid(sub_dir: &Path) -> Option<ObjectId> {
5723    // Submodule `.git` may be a gitfile pointing at `.git/modules/<name>` in another superproject
5724    // after `cp -R`. Prefer the current superproject's module dir when present.
5725    let mut git_dir = submodule_embedded_git_dir(sub_dir)?;
5726    if let Some((super_wt, super_git_dir)) = find_superproject_git(sub_dir) {
5727        let rel = sub_dir.strip_prefix(&super_wt).ok()?;
5728        let rel_str = rel.to_string_lossy().replace('\\', "/");
5729        let local_mod = super_git_dir
5730            .join("modules")
5731            .join(rel_str.trim_start_matches('/'));
5732        if local_mod.join("HEAD").exists() {
5733            let sg = super_git_dir.canonicalize().unwrap_or(super_git_dir);
5734            let cur = git_dir.canonicalize().unwrap_or_else(|_| git_dir.clone());
5735            if !cur.starts_with(&sg) {
5736                git_dir = local_mod;
5737            }
5738        }
5739    }
5740    let head_content = fs::read_to_string(git_dir.join("HEAD")).ok()?;
5741    let head_trimmed = head_content.trim();
5742    if head_trimmed.starts_with("ref: ") {
5743        // Use the full ref resolver so packed-refs and worktrees match Git. If `HEAD` is a stale
5744        // symref (e.g. still `refs/heads/master` while only `main` exists), fall back like
5745        // `resolve_gitlink_ref` / `git add` on embedded repos (`t6437-submodule-merge`).
5746        match crate::refs::resolve_ref(&git_dir, "HEAD") {
5747            Ok(oid) => Some(oid),
5748            Err(_) => {
5749                let mut found = None;
5750                for branch in ["main", "master"] {
5751                    let p = git_dir.join("refs/heads").join(branch);
5752                    if let Ok(s) = fs::read_to_string(&p) {
5753                        if let Ok(o) = ObjectId::from_hex(s.trim()) {
5754                            found = Some(o);
5755                            break;
5756                        }
5757                    }
5758                }
5759                found
5760            }
5761        }
5762    } else {
5763        ObjectId::from_hex(head_trimmed).ok()
5764    }
5765}
5766
5767/// True when a populated submodule checkout is *broken*: its `HEAD` resolves to a commit OID, but
5768/// that commit object cannot be read from the submodule's own object database.
5769///
5770/// This mirrors Git's [`is_submodule_modified`], which shells out to `git status --porcelain=2`
5771/// inside the submodule; when the submodule's object store is corrupt (e.g. `rm -r .git/objects`),
5772/// that inner status fails and Git aborts the surrounding `status`/`diff`/`fetch`. We detect the
5773/// same condition in-process so the superproject operation can return a fatal error rather than
5774/// silently treating the submodule as clean (t5526 "fetching submodule into a broken repository").
5775///
5776/// Returns `false` when the submodule is not checked out, has no embedded git dir, or has an
5777/// unresolvable HEAD (those are handled separately by the unpopulated/placeholder logic).
5778#[must_use]
5779pub fn submodule_head_object_broken(sub_dir: &Path) -> bool {
5780    let Some(sub_git_dir) = submodule_embedded_git_dir(sub_dir) else {
5781        return false;
5782    };
5783    let Some(head_oid) = read_submodule_head_oid(sub_dir) else {
5784        return false;
5785    };
5786    let odb = Odb::new(&sub_git_dir.join("objects"));
5787    match odb.read(&head_oid) {
5788        // HEAD object present but not a commit would be a different kind of corruption; only the
5789        // missing-object case is the broken-repo scenario Git guards against here.
5790        Ok(obj) => parse_commit(&obj.data).is_err(),
5791        Err(_) => true,
5792    }
5793}
5794
5795/// True when a checked-out submodule at `rel_path` has modified or untracked content relative to
5796/// the gitlink `recorded_oid` stored in the superproject (used for `git diff <tree>` parity).
5797fn submodule_has_dirty_worktree_for_super_diff(
5798    super_worktree: &Path,
5799    rel_path: &str,
5800    recorded_oid: &ObjectId,
5801) -> bool {
5802    let flags = submodule_porcelain_flags(super_worktree, rel_path, *recorded_oid);
5803    flags.modified || flags.untracked
5804}
5805
5806/// Submodule dirty bits aligned with Git's `DIRTY_SUBMODULE_*` / porcelain v2 `S???` token.
5807#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
5808pub struct SubmodulePorcelainFlags {
5809    /// Submodule checkout HEAD differs from the gitlink OID recorded in the parent index.
5810    pub new_commits: bool,
5811    /// The submodule has its own staged or unstaged changes (`DIRTY_SUBMODULE_MODIFIED`).
5812    pub modified: bool,
5813    /// The submodule work tree contains paths not in its index (`DIRTY_SUBMODULE_UNTRACKED`).
5814    pub untracked: bool,
5815}
5816
5817/// Inspect a checked-out submodule at `rel_path` (relative to `super_worktree`) and return
5818/// flags used for `git status --porcelain=v2` submodule tokens.
5819///
5820/// `recorded_oid` is the gitlink OID stored in the **parent** index (stage 0). When the
5821/// submodule is not checked out or cannot be opened, returns [`Default::default()`].
5822pub fn submodule_porcelain_flags(
5823    super_worktree: &Path,
5824    rel_path: &str,
5825    recorded_oid: ObjectId,
5826) -> SubmodulePorcelainFlags {
5827    let sub_dir = super_worktree.join(rel_path);
5828    let Some(sub_git_dir) = submodule_embedded_git_dir(&sub_dir) else {
5829        return SubmodulePorcelainFlags::default();
5830    };
5831    let Some(sub_head) = read_submodule_head_oid(&sub_dir) else {
5832        return SubmodulePorcelainFlags::default();
5833    };
5834
5835    let new_commits = sub_head != recorded_oid;
5836
5837    let index_path = sub_git_dir.join("index");
5838    let sub_index = match crate::index::Index::load(&index_path) {
5839        Ok(ix) => ix,
5840        Err(_) => {
5841            return SubmodulePorcelainFlags {
5842                new_commits,
5843                ..Default::default()
5844            }
5845        }
5846    };
5847
5848    let tracked: std::collections::BTreeSet<String> = sub_index
5849        .entries
5850        .iter()
5851        .filter(|e| e.stage() == 0)
5852        .map(|e| String::from_utf8_lossy(&e.path).into_owned())
5853        .collect();
5854    let untracked = submodule_dir_has_untracked_inner(&sub_dir, &sub_dir, &tracked, &sub_index);
5855
5856    let objects_dir = sub_git_dir.join("objects");
5857    let odb = Odb::new(&objects_dir);
5858
5859    let sub_head_tree = (|| -> Option<ObjectId> {
5860        let h = fs::read_to_string(sub_git_dir.join("HEAD")).ok()?;
5861        let h_str = h.trim();
5862        let commit_oid = if let Some(r) = h_str.strip_prefix("ref: ") {
5863            let oid_hex = fs::read_to_string(sub_git_dir.join(r)).ok()?;
5864            ObjectId::from_hex(oid_hex.trim()).ok()?
5865        } else {
5866            ObjectId::from_hex(h_str).ok()?
5867        };
5868        let obj = odb.read(&commit_oid).ok()?;
5869        let commit = parse_commit(&obj.data).ok()?;
5870        Some(commit.tree)
5871    })();
5872
5873    let staged_dirty = sub_head_tree
5874        .as_ref()
5875        .map(|t| diff_index_to_tree(&odb, &sub_index, Some(t), false).map(|v| !v.is_empty()))
5876        .unwrap_or(Ok(false));
5877    let staged_dirty = staged_dirty.unwrap_or(false);
5878
5879    let unstaged_dirty = diff_index_to_worktree(&odb, &sub_index, &sub_dir, false, true)
5880        .map(|v| !v.is_empty())
5881        .unwrap_or(false);
5882
5883    let mut modified = staged_dirty || unstaged_dirty;
5884
5885    // Nested submodule has its own index: OR `modified` from immediate gitlink children so a
5886    // dirty nested checkout (e.g. staged `file` under `sub1/sub2`) marks the parent gitlink as
5887    // modified in the superproject (t7506). Do **not** OR `untracked` — untracked-only inside a
5888    // nested submodule must stay `S..U` on the parent, not `S.U` / `S.M.`.
5889    for e in &sub_index.entries {
5890        if e.stage() != 0 || e.mode != 0o160000 {
5891            continue;
5892        }
5893        let child = String::from_utf8_lossy(&e.path).into_owned();
5894        let full_rel = if rel_path.is_empty() {
5895            child
5896        } else {
5897            format!("{rel_path}/{child}")
5898        };
5899        let nested = submodule_porcelain_flags(super_worktree, &full_rel, e.oid);
5900        modified |= nested.modified;
5901    }
5902
5903    SubmodulePorcelainFlags {
5904        new_commits,
5905        modified,
5906        untracked,
5907    }
5908}
5909
5910fn submodule_dir_has_untracked_inner(
5911    dir: &Path,
5912    root: &Path,
5913    tracked: &std::collections::BTreeSet<String>,
5914    owning_index: &Index,
5915) -> bool {
5916    let entries = match fs::read_dir(dir) {
5917        Ok(e) => e,
5918        Err(_) => return false,
5919    };
5920    let mut sorted: Vec<_> = entries.filter_map(|e| e.ok()).collect();
5921    sorted.sort_by_key(|e| e.file_name());
5922
5923    for entry in sorted {
5924        let name = entry.file_name().to_string_lossy().to_string();
5925        if name == ".git" {
5926            continue;
5927        }
5928        let path = entry.path();
5929        let rel = path
5930            .strip_prefix(root)
5931            .map(|p| p.to_string_lossy().to_string())
5932            .unwrap_or_else(|_| name.clone());
5933
5934        let is_dir = entry.file_type().map(|ft| ft.is_dir()).unwrap_or(false);
5935        if is_dir {
5936            let is_gitlink = owning_index
5937                .get(rel.as_bytes(), 0)
5938                .is_some_and(|e| e.mode == 0o160000);
5939            if is_gitlink {
5940                let Some(nested_git) = submodule_embedded_git_dir(&path) else {
5941                    continue;
5942                };
5943                let nested_index_path = nested_git.join("index");
5944                let Ok(nested_ix) = crate::index::Index::load(&nested_index_path) else {
5945                    continue;
5946                };
5947                let nested_tracked: std::collections::BTreeSet<String> = nested_ix
5948                    .entries
5949                    .iter()
5950                    .filter(|e| e.stage() == 0)
5951                    .map(|e| String::from_utf8_lossy(&e.path).into_owned())
5952                    .collect();
5953                if submodule_dir_has_untracked_inner(&path, &path, &nested_tracked, &nested_ix) {
5954                    return true;
5955                }
5956            } else if submodule_dir_has_untracked_inner(&path, root, tracked, owning_index) {
5957                return true;
5958            }
5959        } else if !tracked.contains(&rel) {
5960            return true;
5961        }
5962    }
5963    false
5964}
5965
5966/// Reorder diff entries by a `-O<orderfile>` (`diff.orderFile`): entries are sorted by the index of
5967/// the first orderfile glob pattern that matches their path; unmatched entries sort last (stable).
5968pub fn apply_orderfile_entries(
5969    entries: Vec<DiffEntry>,
5970    order_path: &str,
5971    cwd: &Path,
5972) -> Result<Vec<DiffEntry>> {
5973    apply_orderfile(entries, order_path, cwd)
5974}
5975
5976fn apply_orderfile(
5977    mut entries: Vec<DiffEntry>,
5978    order_path: &str,
5979    cwd: &Path,
5980) -> Result<Vec<DiffEntry>> {
5981    let patterns = read_orderfile_patterns(order_path, cwd)?;
5982    let sort_key = |entry: &DiffEntry| -> usize {
5983        let path = entry
5984            .new_path
5985            .as_ref()
5986            .or(entry.old_path.as_ref())
5987            .cloned()
5988            .unwrap_or_default();
5989        for (i, pat) in patterns.iter().enumerate() {
5990            if orderfile_pattern_matches(pat, &path) {
5991                return i;
5992            }
5993        }
5994        patterns.len()
5995    };
5996    entries.sort_by_key(|e| sort_key(e));
5997    Ok(entries)
5998}
5999
6000/// Read non-empty, non-comment glob patterns (one per line) from a `-O<orderfile>` file.
6001pub fn read_orderfile_patterns(order_path: &str, cwd: &Path) -> Result<Vec<String>> {
6002    let path = Path::new(order_path);
6003    let resolved = if path.is_absolute() {
6004        path.to_path_buf()
6005    } else {
6006        cwd.join(path)
6007    };
6008    let _meta = std::fs::metadata(&resolved)
6009        .map_err(|e| Error::Message(format!("could not read orderfile {order_path}: {e}")))?;
6010    let mut f = std::fs::File::open(&resolved)
6011        .map_err(|e| Error::Message(format!("could not read orderfile {order_path}: {e}")))?;
6012    let mut content = String::new();
6013    std::io::Read::read_to_string(&mut f, &mut content)
6014        .map_err(|e| Error::Message(format!("could not read orderfile {order_path}: {e}")))?;
6015    Ok(content
6016        .lines()
6017        .map(|l| l.trim().to_string())
6018        .filter(|l| !l.is_empty() && !l.starts_with('#'))
6019        .collect())
6020}
6021
6022/// Reorder diff entries for `git diff` `--rotate-to` / `--skip-to` (changed paths only).
6023pub fn apply_rotate_skip_entries(
6024    mut entries: Vec<DiffEntry>,
6025    rotate_to: Option<&str>,
6026    skip_to: Option<&str>,
6027) -> Result<Vec<DiffEntry>> {
6028    let Some(needle) = rotate_to.or(skip_to) else {
6029        return Ok(entries);
6030    };
6031    let needle = needle.trim();
6032    if needle.is_empty() {
6033        return Ok(entries);
6034    }
6035    let idx = entries
6036        .iter()
6037        .position(|e| e.path() == needle)
6038        .ok_or_else(|| Error::Message(format!("fatal: No such path '{needle}' in the diff")))?;
6039    if rotate_to.is_some() {
6040        entries.rotate_left(idx);
6041    }
6042    if let Some(skip) = skip_to.filter(|s| !s.trim().is_empty()) {
6043        let pos = entries
6044            .iter()
6045            .position(|e| e.path() == skip)
6046            .ok_or_else(|| Error::Message(format!("fatal: No such path '{skip}' in the diff")))?;
6047        entries.drain(..pos);
6048    }
6049    Ok(entries)
6050}
6051
6052/// `git log` rotate/skip: reorder using the **commit tree** path order (all blobs), then keep only
6053/// paths present in `entries` — matches Git's `diff --rotate-to` with history walks.
6054pub fn apply_rotate_skip_log_entries(
6055    odb: &Odb,
6056    commit_tree: &ObjectId,
6057    entries: Vec<DiffEntry>,
6058    rotate_to: Option<&str>,
6059    skip_to: Option<&str>,
6060) -> Result<Vec<DiffEntry>> {
6061    let tree_paths = crate::merge_diff::all_blob_paths_in_tree_order(odb, commit_tree);
6062    apply_rotate_skip_ordered_paths(&tree_paths, entries, rotate_to, skip_to)
6063}
6064
6065fn apply_rotate_skip_ordered_paths(
6066    tree_paths: &[String],
6067    entries: Vec<DiffEntry>,
6068    rotate_to: Option<&str>,
6069    skip_to: Option<&str>,
6070) -> Result<Vec<DiffEntry>> {
6071    let rotate = rotate_to.and_then(|s| {
6072        let t = s.trim();
6073        (!t.is_empty()).then_some(t)
6074    });
6075    let skip = skip_to.and_then(|s| {
6076        let t = s.trim();
6077        (!t.is_empty()).then_some(t)
6078    });
6079    if rotate.is_none() && skip.is_none() {
6080        return Ok(entries);
6081    }
6082
6083    use std::collections::HashMap;
6084    let mut by_path: HashMap<String, DiffEntry> = HashMap::new();
6085    for e in entries {
6086        by_path.insert(e.path().to_string(), e);
6087    }
6088
6089    // `git log --skip-to`: only list changed paths from the skip point onward (unmodified paths
6090    // in the tree-order suffix are omitted). `--rotate-to` still lists every changed file in order.
6091    if rotate.is_none() {
6092        let Some(skip_path) = skip else {
6093            return Ok(by_path.into_values().collect());
6094        };
6095        let idx = tree_paths
6096            .iter()
6097            .position(|p| p == skip_path)
6098            .ok_or_else(|| {
6099                Error::Message(format!("fatal: No such path '{skip_path}' in the diff"))
6100            })?;
6101        let mut out = Vec::new();
6102        for p in tree_paths.iter().skip(idx) {
6103            if let Some(e) = by_path.remove(p) {
6104                out.push(e);
6105            }
6106        }
6107        return Ok(out);
6108    }
6109
6110    let Some(needle) = rotate else {
6111        return Ok(by_path.into_values().collect());
6112    };
6113    let idx = tree_paths
6114        .iter()
6115        .position(|p| p == needle)
6116        .ok_or_else(|| Error::Message(format!("fatal: No such path '{needle}' in the diff")))?;
6117    let mut order: Vec<String> = tree_paths.to_vec();
6118    order.rotate_left(idx);
6119    if let Some(skip_path) = skip {
6120        let pos = order.iter().position(|p| p == skip_path).ok_or_else(|| {
6121            Error::Message(format!("fatal: No such path '{skip_path}' in the diff"))
6122        })?;
6123        order.drain(..pos);
6124    }
6125    let mut out = Vec::new();
6126    for p in order {
6127        if let Some(e) = by_path.remove(&p) {
6128            out.push(e);
6129        }
6130    }
6131    Ok(out)
6132}
6133
6134/// Check if an orderfile pattern matches a path (matches the basename or the full path).
6135/// Supports basic glob patterns: `*` matches any sequence, `?` matches one char.
6136pub fn orderfile_pattern_matches(pattern: &str, path: &str) -> bool {
6137    let name = path.rsplit('/').next().unwrap_or(path);
6138    orderfile_glob_match(pattern, name) || orderfile_glob_match(pattern, path)
6139}
6140
6141/// Basic glob matching (supports `*` and `?`).
6142fn orderfile_glob_match(pattern: &str, text: &str) -> bool {
6143    let mut pi = 0;
6144    let mut ti = 0;
6145    let pb = pattern.as_bytes();
6146    let tb = text.as_bytes();
6147    let mut star_pi = usize::MAX;
6148    let mut star_ti = 0;
6149
6150    while ti < tb.len() {
6151        if pi < pb.len() && (pb[pi] == b'?' || pb[pi] == tb[ti]) {
6152            pi += 1;
6153            ti += 1;
6154        } else if pi < pb.len() && pb[pi] == b'*' {
6155            star_pi = pi;
6156            star_ti = ti;
6157            pi += 1;
6158        } else if star_pi != usize::MAX {
6159            pi = star_pi + 1;
6160            star_ti += 1;
6161            ti = star_ti;
6162        } else {
6163            return false;
6164        }
6165    }
6166    while pi < pb.len() && pb[pi] == b'*' {
6167        pi += 1;
6168    }
6169    pi == pb.len()
6170}