grit_lib/
diff.rs

1//! Diff machinery — compare trees, index entries, and working tree files.
2//!
3//! # Overview
4//!
5//! This module provides the core diffing infrastructure shared by `diff`,
6//! `diff-index`, `status`, `log`, `show`, `commit`, and `merge`.
7//!
8//! ## Levels of comparison
9//!
10//! 1. **Tree-to-tree** — compare two tree objects (e.g. for `log`/`show`).
11//! 2. **Tree-to-index** — compare a tree (usually HEAD) against the index
12//!    (staged changes, used by `diff --cached` and `status`).
13//! 3. **Index-to-worktree** — compare index against the working directory
14//!    (unstaged changes, used by `diff` and `status`).
15//!
16//! ## Content diff
17//!
18//! Line-level diffing uses the `similar` crate (Myers, patience, minimal) and,
19//! for Git's `histogram` algorithm, `imara-diff` for output compatible with upstream Git.
20//! Output formats: unified patch, raw (`:old-mode new-mode ...`), stat,
21//! numstat.
22
23use std::fs;
24#[cfg(unix)]
25use std::os::unix::fs::MetadataExt;
26use std::path::{Path, PathBuf};
27
28use crate::config::ConfigSet;
29use crate::diff_indent_heuristic;
30use crate::error::{Error, Result};
31use crate::index::{Index, IndexEntry};
32use crate::objects::{parse_commit, parse_tree, CommitData, ObjectId, ObjectKind, TreeEntry};
33use crate::odb::Odb;
34use crate::userdiff::FuncnameMatcher;
35
36/// Splits imara-diff unified body (concatenated hunks) into per-hunk slices for post-processing.
37fn imara_unified_hunk_slices(body: &str) -> Vec<&str> {
38    let mut starts: Vec<usize> = Vec::new();
39    if body.starts_with("@@") {
40        starts.push(0);
41    }
42    for (idx, _) in body.match_indices("\n@@ ") {
43        starts.push(idx + 1);
44    }
45    starts.push(body.len());
46    starts.windows(2).map(|w| &body[w[0]..w[1]]).collect()
47}
48
49fn histogram_unified_body_raw(
50    old_content: &str,
51    new_content: &str,
52    context_lines: usize,
53    inter_hunk_context: usize,
54) -> String {
55    use imara_diff::{Algorithm, Diff, Hunk, InternedInput};
56    use std::fmt::Write as _;
57
58    let input = InternedInput::new(old_content, new_content);
59    let mut diff = Diff::compute(Algorithm::Histogram, &input);
60    diff.postprocess_lines(&input);
61
62    // Assemble hunks ourselves: imara's `UnifiedDiff` printer starts the first
63    // hunk's context at line 0 whenever the first change is within
64    // `2 * context_len` of the file start, emitting more leading context than
65    // its own header claims (t4061), and its gap threshold cannot express
66    // Git's odd `2 * U + inter_hunk_context` fuse limits (t4032).
67    let hunks: Vec<Hunk> = diff.hunks().collect();
68    if hunks.is_empty() {
69        return String::new();
70    }
71
72    let ctx = context_lines.min(u32::MAX as usize) as u32;
73    let max_gap = (2usize.saturating_mul(context_lines))
74        .saturating_add(inter_hunk_context)
75        .min(u32::MAX as usize) as u32;
76    let before_len = input.before.len() as u32;
77    let after_len = input.after.len() as u32;
78
79    // Fuse hunks whose unchanged gap is at most `max_gap` (Git xdl_get_hunk).
80    let mut groups: Vec<&[Hunk]> = Vec::new();
81    let mut group_start = 0usize;
82    for i in 1..hunks.len() {
83        if hunks[i].before.start - hunks[i - 1].before.end > max_gap {
84            groups.push(&hunks[group_start..i]);
85            group_start = i;
86        }
87    }
88    groups.push(&hunks[group_start..]);
89
90    fn push_line(out: &mut String, prefix: char, text: &str) {
91        out.push(prefix);
92        out.push_str(text);
93        if !text.ends_with('\n') {
94            out.push('\n');
95        }
96    }
97
98    // Git hunk header range: 1-based start (the preceding line when the range
99    // is empty) with the `,count` part omitted when the count is exactly 1.
100    fn fmt_side(start: u32, count: u32) -> String {
101        let shown_start = if count == 0 { start } else { start + 1 };
102        if count == 1 {
103            format!("{shown_start}")
104        } else {
105            format!("{shown_start},{count}")
106        }
107    }
108
109    let mut out = String::new();
110    for group in groups {
111        let first = &group[0];
112        let last = &group[group.len() - 1];
113        let b_start = first.before.start.saturating_sub(ctx);
114        let a_start = first.after.start.saturating_sub(ctx);
115        let b_end = (last.before.end.saturating_add(ctx)).min(before_len);
116        let a_end = (last.after.end.saturating_add(ctx)).min(after_len);
117
118        let _ = writeln!(
119            out,
120            "@@ -{} +{} @@",
121            fmt_side(b_start, b_end - b_start),
122            fmt_side(a_start, a_end - a_start)
123        );
124
125        let mut pos = b_start;
126        for hunk in group {
127            for &token in &input.before[pos as usize..hunk.before.start as usize] {
128                push_line(&mut out, ' ', input.interner[token]);
129            }
130            for &token in &input.before[hunk.before.start as usize..hunk.before.end as usize] {
131                push_line(&mut out, '-', input.interner[token]);
132            }
133            for &token in &input.after[hunk.after.start as usize..hunk.after.end as usize] {
134                push_line(&mut out, '+', input.interner[token]);
135            }
136            pos = hunk.before.end;
137        }
138        for &token in &input.before[pos as usize..b_end as usize] {
139            push_line(&mut out, ' ', input.interner[token]);
140        }
141    }
142
143    out
144}
145
146/// Build the unified-diff body (no `---`/`+++` header) using Git's histogram
147/// algorithm, but applying `--ignore-blank-lines` / `-I` semantics through a
148/// Git-compatible implementation of the xdiff change-record machinery
149/// (`xdl_mark_ignorable_lines` + `xdl_get_hunk` + `xdl_emit_diff`).
150///
151/// `is_ignorable_change` is called with the slices of removed and added lines
152/// for one change record (each item is the line text without its trailing
153/// newline); it returns `true` when the whole change is ignorable (Git's
154/// `xch->ignore`). Ignorable changes that are far from substantive changes are
155/// dropped, and hunk boundaries are computed with Git's `max_ignorable`
156/// distance rules so the line numbers and surviving context match Git exactly.
157///
158/// Returns the body only; an empty string means no substantive change survives.
159/// Optional `--function-context` (`-W`) hunk expansion is applied *after*
160/// ignorable change-record pruning so the boundaries/funcname header match Git.
161#[allow(clippy::too_many_arguments)]
162pub(crate) fn histogram_unified_body_ignore_fc<F>(
163    old_content: &str,
164    new_content: &str,
165    context_lines: usize,
166    inter_hunk_context: usize,
167    function_context: bool,
168    funcname_matcher: Option<&FuncnameMatcher>,
169    is_ignorable_change: F,
170) -> String
171where
172    F: Fn(&[&str], &[&str]) -> bool,
173{
174    use imara_diff::{Algorithm, Diff, Hunk, InternedInput};
175    use std::fmt::Write as _;
176
177    let input = InternedInput::new(old_content, new_content);
178    let mut diff = Diff::compute(Algorithm::Histogram, &input);
179    diff.postprocess_lines(&input);
180
181    let raw_hunks: Vec<Hunk> = diff.hunks().collect();
182    if raw_hunks.is_empty() {
183        return String::new();
184    }
185
186    let before_len = input.before.len() as i64;
187    let after_len = input.after.len() as i64;
188    let ctxlen = context_lines as i64;
189    let interhunk = inter_hunk_context as i64;
190    // Git's xdl_get_hunk fuse limit: 2*ctxlen + interhunkctxlen.
191    let max_common = ctxlen.saturating_add(ctxlen).saturating_add(interhunk);
192    let max_ignorable = ctxlen;
193
194    // A change record mirrors Git's xdchange_t.
195    struct Change {
196        i1: i64,
197        chg1: i64,
198        i2: i64,
199        chg2: i64,
200        ignore: bool,
201    }
202
203    let mut changes: Vec<Change> = Vec::with_capacity(raw_hunks.len());
204    for h in &raw_hunks {
205        let i1 = h.before.start as i64;
206        let chg1 = (h.before.end - h.before.start) as i64;
207        let i2 = h.after.start as i64;
208        let chg2 = (h.after.end - h.after.start) as i64;
209        let removed: Vec<&str> = input.before[h.before.start as usize..h.before.end as usize]
210            .iter()
211            .map(|&t| input.interner[t])
212            .collect();
213        let added: Vec<&str> = input.after[h.after.start as usize..h.after.end as usize]
214            .iter()
215            .map(|&t| input.interner[t])
216            .collect();
217        let ignore = is_ignorable_change(&removed, &added);
218        changes.push(Change {
219            i1,
220            chg1,
221            i2,
222            chg2,
223            ignore,
224        });
225    }
226
227    // Helper to fetch a line of either side (token interner over lines).
228    let before_line = |idx: i64| -> &str { input.interner[input.before[idx as usize]] };
229    let after_line = |idx: i64| -> &str { input.interner[input.after[idx as usize]] };
230
231    // Port of xdl_get_hunk: starting at change index `start`, return
232    // `(new_start, last)` — the possibly-advanced first change index (leading
233    // ignorable changes dropped) and the index of the last change to include in
234    // this hunk — or `None` when the leading ignorable changes are all dropped
235    // and nothing remains.
236    let get_hunk = |start: usize| -> Option<(usize, usize)> {
237        // Remove ignorable changes that are too far before other changes.
238        // Faithful port of xdl_get_hunk's leading loop: walk every leading
239        // ignorable change; whenever its successor is absent or far enough away
240        // (>= max_ignorable), advance the hunk start past it. `scr` is the new
241        // start index, or `changes.len()` when everything is dropped.
242        let mut scr = start;
243        let mut xchp = start;
244        while xchp < changes.len() && changes[xchp].ignore {
245            let next = xchp + 1;
246            if next >= changes.len()
247                || changes[next].i1 - (changes[xchp].i1 + changes[xchp].chg1) >= max_ignorable
248            {
249                scr = next;
250            }
251            xchp = next;
252        }
253        if scr >= changes.len() {
254            return None;
255        }
256
257        let mut lxch = scr;
258        let mut ignored: i64 = 0;
259        let mut xchp = scr;
260        let mut idx = scr + 1;
261        while idx < changes.len() {
262            let xch = &changes[idx];
263            let prev = &changes[xchp];
264            let distance = xch.i1 - (prev.i1 + prev.chg1);
265            if distance > max_common {
266                break;
267            }
268            if distance < max_ignorable && (!xch.ignore || lxch == xchp) {
269                lxch = idx;
270                ignored = 0;
271            } else if distance < max_ignorable && xch.ignore {
272                ignored += xch.chg2;
273            } else if lxch != xchp
274                && xch.i1 + ignored - (changes[lxch].i1 + changes[lxch].chg1) > max_common
275            {
276                break;
277            } else if !xch.ignore {
278                lxch = idx;
279                ignored = 0;
280            } else {
281                ignored += xch.chg2;
282            }
283            xchp = idx;
284            idx += 1;
285        }
286        Some((scr, lxch))
287    };
288
289    fn push_line(out: &mut String, prefix: char, text: &str) {
290        out.push(prefix);
291        out.push_str(text);
292        if !text.ends_with('\n') {
293            out.push('\n');
294        }
295    }
296
297    // Git hunk header range (xdl_emit_hunk_hdr): `,count` part omitted when 1.
298    fn fmt_side(start: i64, count: i64) -> String {
299        if count == 1 {
300            format!("{start}")
301        } else {
302            format!("{start},{count}")
303        }
304    }
305
306    let mut out = String::new();
307    let mut cursor = 0usize;
308    while cursor < changes.len() {
309        let Some((xch_idx, xche_idx)) = get_hunk(cursor) else {
310            break;
311        };
312        let xch = &changes[xch_idx];
313        let xche = &changes[xche_idx];
314
315        // pre-context
316        let mut s1 = (xch.i1 - ctxlen).max(0);
317        let mut s2 = (xch.i2 - ctxlen).max(0);
318        // post-context (clamped so it does not run past either side's EOF).
319        let mut lctx = ctxlen;
320        lctx = lctx.min(before_len - (xche.i1 + xche.chg1));
321        lctx = lctx.min(after_len - (xche.i2 + xche.chg2));
322        let mut e1 = xche.i1 + xche.chg1 + lctx;
323        let mut e2 = xche.i2 + xche.chg2 + lctx;
324
325        // `--function-context`: expand the hunk up to the enclosing function's
326        // header and down to the line before the next function (Git's
327        // XDL_EMIT_FUNCCONTEXT in xemit.c), using the already ignore-pruned
328        // change boundaries.
329        if function_context {
330            // Upward: find the function line at or before xch.i1, then climb to
331            // the start of that function (past non-empty non-func lines).
332            let mut fs1 = xch.i1;
333            while fs1 >= 0 && !is_func_line(before_line(fs1), funcname_matcher) {
334                fs1 -= 1;
335            }
336            while fs1 > 0
337                && before_line(fs1 - 1).trim().is_empty() == false
338                && !is_func_line(before_line(fs1 - 1), funcname_matcher)
339            {
340                fs1 -= 1;
341            }
342            if fs1 < 0 {
343                fs1 = 0;
344            }
345            if fs1 < s1 {
346                s2 = (s2 - (s1 - fs1)).max(0);
347                s1 = fs1;
348            }
349            // Downward: find the next function line after the hunk end, climb up
350            // past trailing empty lines, and extend e1/e2 to it.
351            let end1 = xche.i1 + xche.chg1;
352            let mut fe1 = end1;
353            while fe1 < before_len && !is_func_line(before_line(fe1), funcname_matcher) {
354                fe1 += 1;
355            }
356            while fe1 > 0 && before_line(fe1 - 1).trim().is_empty() {
357                fe1 -= 1;
358            }
359            if fe1 > before_len {
360                fe1 = before_len;
361            }
362            if fe1 > e1 {
363                e2 = (e2 + (fe1 - e1)).min(after_len);
364                e1 = fe1;
365            }
366        }
367
368        let _ = writeln!(
369            out,
370            "@@ -{} +{} @@",
371            fmt_side(s1 + 1, e1 - s1),
372            fmt_side(s2 + 1, e2 - s2)
373        );
374
375        // Emit pre-context from the new side.
376        let mut s2c = s2;
377        while s2c < xch.i2 {
378            push_line(&mut out, ' ', after_line(s2c));
379            s2c += 1;
380        }
381
382        // Emit each change record and the context between merged records.
383        let mut s1c = xch.i1;
384        let mut s2c = xch.i2;
385        let mut k = xch_idx;
386        loop {
387            let c = &changes[k];
388            // Context between previous and current change atom.
389            while s1c < c.i1 && s2c < c.i2 {
390                push_line(&mut out, ' ', after_line(s2c));
391                s1c += 1;
392                s2c += 1;
393            }
394            // Removed lines from old side.
395            let mut r = c.i1;
396            while r < c.i1 + c.chg1 {
397                push_line(&mut out, '-', before_line(r));
398                r += 1;
399            }
400            // Added lines from new side.
401            let mut a = c.i2;
402            while a < c.i2 + c.chg2 {
403                push_line(&mut out, '+', after_line(a));
404                a += 1;
405            }
406            if k == xche_idx {
407                break;
408            }
409            s1c = c.i1 + c.chg1;
410            s2c = c.i2 + c.chg2;
411            k += 1;
412        }
413
414        // Post-context.
415        let mut s2p = xche.i2 + xche.chg2;
416        while s2p < e2 {
417            push_line(&mut out, ' ', after_line(s2p));
418            s2p += 1;
419        }
420
421        cursor = xche_idx + 1;
422    }
423
424    out
425}
426
427/// Unified diff hunks for Git's histogram algorithm (no `---` / `+++` lines).
428///
429/// Used by `--no-index` when whitespace normalization is off so the patch matches upstream Git.
430#[must_use]
431pub fn unified_diff_histogram_hunks_only(
432    old_content: &str,
433    new_content: &str,
434    context_lines: usize,
435    inter_hunk_context: usize,
436) -> String {
437    histogram_unified_body_raw(old_content, new_content, context_lines, inter_hunk_context)
438}
439
440/// Full unified diff (`---` / `+++` / hunks) using Git's histogram algorithm.
441#[must_use]
442pub fn unified_diff_histogram_with_prefix_and_funcname(
443    old_content: &str,
444    new_content: &str,
445    old_path: &str,
446    new_path: &str,
447    context_lines: usize,
448    inter_hunk_context: usize,
449    src_prefix: &str,
450    dst_prefix: &str,
451    funcname_matcher: Option<&FuncnameMatcher>,
452    quote_path_fully: bool,
453) -> String {
454    use crate::quote_path::format_diff_path_with_prefix;
455
456    let body =
457        histogram_unified_body_raw(old_content, new_content, context_lines, inter_hunk_context);
458
459    let mut output = String::new();
460    if old_path == "/dev/null" {
461        output.push_str("--- /dev/null\n");
462    } else if src_prefix.is_empty() {
463        output.push_str(&format!("--- {old_path}\n"));
464    } else {
465        output.push_str("--- ");
466        output.push_str(&format_diff_path_with_prefix(
467            src_prefix,
468            old_path,
469            quote_path_fully,
470        ));
471        output.push('\n');
472    }
473    if new_path == "/dev/null" {
474        output.push_str("+++ /dev/null\n");
475    } else if dst_prefix.is_empty() {
476        output.push_str(&format!("+++ {new_path}\n"));
477    } else {
478        output.push_str("+++ ");
479        output.push_str(&format_diff_path_with_prefix(
480            dst_prefix,
481            new_path,
482            quote_path_fully,
483        ));
484        output.push('\n');
485    }
486
487    let old_lines: Vec<&str> = old_content.lines().collect();
488    for hunk_str in imara_unified_hunk_slices(&body) {
489        if hunk_str.is_empty() {
490            continue;
491        }
492        if let Some(first_newline) = hunk_str.find('\n') {
493            let header_line = &hunk_str[..first_newline];
494            let rest = &hunk_str[first_newline..];
495            if let Some(func_ctx) =
496                extract_function_context(header_line, &old_lines, funcname_matcher)
497            {
498                output.push_str(header_line);
499                output.push(' ');
500                output.push_str(&func_ctx);
501                output.push_str(rest);
502            } else {
503                output.push_str(hunk_str);
504            }
505        } else {
506            output.push_str(hunk_str);
507        }
508    }
509
510    output
511}
512
513/// Full unified diff (`---` / `+++` / hunks) using Git's histogram algorithm,
514/// applying `--ignore-blank-lines` / `-I` change-record suppression
515/// ([`histogram_unified_body_ignore`]) and then attaching function-name hunk
516/// headers exactly like [`unified_diff_histogram_with_prefix_and_funcname`].
517///
518/// `is_ignorable_change` is given the removed/added line slices (without
519/// trailing newlines) of one change record and returns whether the whole change
520/// is ignorable. Returns an empty string when no substantive change survives so
521/// the caller can hide the file entry.
522#[allow(clippy::too_many_arguments)]
523pub fn unified_diff_histogram_ignore_with_prefix_and_funcname<F>(
524    old_content: &str,
525    new_content: &str,
526    old_path: &str,
527    new_path: &str,
528    context_lines: usize,
529    inter_hunk_context: usize,
530    src_prefix: &str,
531    dst_prefix: &str,
532    funcname_matcher: Option<&FuncnameMatcher>,
533    quote_path_fully: bool,
534    function_context: bool,
535    is_ignorable_change: F,
536) -> String
537where
538    F: Fn(&[&str], &[&str]) -> bool,
539{
540    use crate::quote_path::format_diff_path_with_prefix;
541
542    let body = histogram_unified_body_ignore_fc(
543        old_content,
544        new_content,
545        context_lines,
546        inter_hunk_context,
547        function_context,
548        funcname_matcher,
549        is_ignorable_change,
550    );
551    if body.is_empty() {
552        return String::new();
553    }
554
555    let mut output = String::new();
556    if old_path == "/dev/null" {
557        output.push_str("--- /dev/null\n");
558    } else if src_prefix.is_empty() {
559        output.push_str(&format!("--- {old_path}\n"));
560    } else {
561        output.push_str("--- ");
562        output.push_str(&format_diff_path_with_prefix(
563            src_prefix,
564            old_path,
565            quote_path_fully,
566        ));
567        output.push('\n');
568    }
569    if new_path == "/dev/null" {
570        output.push_str("+++ /dev/null\n");
571    } else if dst_prefix.is_empty() {
572        output.push_str(&format!("+++ {new_path}\n"));
573    } else {
574        output.push_str("+++ ");
575        output.push_str(&format_diff_path_with_prefix(
576            dst_prefix,
577            new_path,
578            quote_path_fully,
579        ));
580        output.push('\n');
581    }
582
583    let old_lines: Vec<&str> = old_content.lines().collect();
584    for hunk_str in imara_unified_hunk_slices(&body) {
585        if hunk_str.is_empty() {
586            continue;
587        }
588        if let Some(first_newline) = hunk_str.find('\n') {
589            let header_line = &hunk_str[..first_newline];
590            let rest = &hunk_str[first_newline..];
591            if let Some(func_ctx) =
592                extract_function_context(header_line, &old_lines, funcname_matcher)
593            {
594                output.push_str(header_line);
595                output.push(' ');
596                output.push_str(&func_ctx);
597                output.push_str(rest);
598            } else {
599                output.push_str(hunk_str);
600            }
601        } else {
602            output.push_str(hunk_str);
603        }
604    }
605
606    output
607}
608
609/// `diff.indentHeuristic` from config (Git defaults to true when unset).
610#[must_use]
611pub fn indent_heuristic_from_config(config: &ConfigSet) -> bool {
612    match config.get_bool("diff.indentHeuristic") {
613        Some(Ok(b)) => b,
614        Some(Err(_)) | None => true,
615    }
616}
617
618/// Resolve indent heuristic: `--no-indent-heuristic` and `--indent-heuristic` override config.
619#[must_use]
620pub fn resolve_indent_heuristic(
621    config: &ConfigSet,
622    cli_indent_heuristic: bool,
623    cli_no_indent_heuristic: bool,
624) -> bool {
625    if cli_no_indent_heuristic {
626        false
627    } else if cli_indent_heuristic {
628        true
629    } else {
630        indent_heuristic_from_config(config)
631    }
632}
633
634/// Parse `--indent-heuristic` / `--no-indent-heuristic` from a plumbing argv slice (last occurrence wins).
635#[must_use]
636pub fn parse_indent_heuristic_cli_flags(argv: &[String]) -> (bool, bool) {
637    let mut indent_heuristic = false;
638    let mut no_indent_heuristic = false;
639    for a in argv {
640        match a.as_str() {
641            "--indent-heuristic" => {
642                indent_heuristic = true;
643                no_indent_heuristic = false;
644            }
645            "--no-indent-heuristic" => {
646                no_indent_heuristic = true;
647                indent_heuristic = false;
648            }
649            _ => {}
650        }
651    }
652    (indent_heuristic, no_indent_heuristic)
653}
654
655// ---------------------------------------------------------------------------
656// Git-compatible implementation of the xdiff Myers engine,
657// restricted to what the word-diff path needs (no rename/ignore-whitespace
658// flags). Git's word diff runs the *default* xdiff Myers over the per-word
659// token streams; matching Git's exact change-record selection requires
660// reproducing its preprocessing (record classification + `xdl_cleanup_records`
661// + `xdl_trim_ends`) and its divide-and-conquer split, because `imara-diff`
662// (and `similar`) pick different — but equally minimal — alignments for inputs
663// with many repeated tokens (e.g. the `{`/`}`/`,` in the `bibtex` driver).
664mod git_xdiff {
665    const XDL_MAX_COST_MIN: i64 = 256;
666    const XDL_HEUR_MIN_COST: i64 = 256;
667    const XDL_SNAKE_CNT: i64 = 20;
668    const XDL_K_HEUR: i64 = 4;
669    const XDL_LINE_MAX: i64 = i64::MAX;
670    const XDL_KPDIS_RUN: i64 = 4;
671    const XDL_MAX_EQLIMIT: i64 = 1024;
672    const XDL_SIMSCAN_WINDOW: i64 = 100;
673
674    // record-classification actions (xprepare.c)
675    const DISCARD: u8 = 0;
676    const KEEP: u8 = 1;
677    const INVESTIGATE: u8 = 2;
678
679    /// Classical integer square root approximation using shifts (`xdl_bogosqrt`).
680    fn bogosqrt(mut n: i64) -> i64 {
681        let mut i: i64 = 1;
682        while n > 0 {
683            i <<= 1;
684            n >>= 2;
685        }
686        i
687    }
688
689    struct XdAlgoEnv {
690        mxcost: i64,
691        snake_cnt: i64,
692        heur_min: i64,
693    }
694
695    struct XdpSplit {
696        i1: i64,
697        i2: i64,
698        min_lo: bool,
699        min_hi: bool,
700    }
701
702    /// A single side of the diff: token ids plus the `changed` flags and the
703    /// `reference_index` mapping from effective record -> original record.
704    struct XdFile<'a> {
705        ids: &'a [u32],
706        changed: Vec<bool>,
707        reference_index: Vec<usize>,
708        dstart: i64,
709        dend: i64,
710        nreff: i64,
711    }
712
713    /// Hash/id lookup over the *effective* records: index into `reference_index`.
714    #[inline]
715    fn get_id(xdf: &XdFile<'_>, idx: i64) -> u32 {
716        xdf.ids[xdf.reference_index[idx as usize]]
717    }
718
719    /// Faithful port of `xdl_split` (the middle-snake finder). `kvdf`/`kvdb` are
720    /// the forward/backward k-vectors, offset so index 0 maps to diagonal 0.
721    #[allow(clippy::too_many_arguments)]
722    fn xdl_split(
723        xdf1: &XdFile<'_>,
724        off1: i64,
725        lim1: i64,
726        xdf2: &XdFile<'_>,
727        off2: i64,
728        lim2: i64,
729        kvdf: &mut [i64],
730        kvdb: &mut [i64],
731        koff: i64,
732        need_min: bool,
733        spl: &mut XdpSplit,
734        xenv: &XdAlgoEnv,
735    ) -> i64 {
736        let dmin = off1 - lim2;
737        let dmax = lim1 - off2;
738        let fmid = off1 - off2;
739        let bmid = lim1 - lim2;
740        let odd = ((fmid - bmid) & 1) != 0;
741        let mut fmin = fmid;
742        let mut fmax = fmid;
743        let mut bmin = bmid;
744        let mut bmax = bmid;
745
746        let kf = |k: i64| -> usize { (k + koff) as usize };
747
748        kvdf[kf(fmid)] = off1;
749        kvdb[kf(bmid)] = lim1;
750
751        let mut ec: i64 = 1;
752        loop {
753            let mut got_snake = false;
754
755            if fmin > dmin {
756                fmin -= 1;
757                kvdf[kf(fmin - 1)] = -1;
758            } else {
759                fmin += 1;
760            }
761            if fmax < dmax {
762                fmax += 1;
763                kvdf[kf(fmax + 1)] = -1;
764            } else {
765                fmax -= 1;
766            }
767
768            let mut d = fmax;
769            while d >= fmin {
770                let mut i1 = if kvdf[kf(d - 1)] >= kvdf[kf(d + 1)] {
771                    kvdf[kf(d - 1)] + 1
772                } else {
773                    kvdf[kf(d + 1)]
774                };
775                let prev1 = i1;
776                let mut i2 = i1 - d;
777                while i1 < lim1 && i2 < lim2 && get_id(xdf1, i1) == get_id(xdf2, i2) {
778                    i1 += 1;
779                    i2 += 1;
780                }
781                if i1 - prev1 > xenv.snake_cnt {
782                    got_snake = true;
783                }
784                kvdf[kf(d)] = i1;
785                if odd && bmin <= d && d <= bmax && kvdb[kf(d)] <= i1 {
786                    spl.i1 = i1;
787                    spl.i2 = i2;
788                    spl.min_lo = true;
789                    spl.min_hi = true;
790                    return ec;
791                }
792                d -= 2;
793            }
794
795            if bmin > dmin {
796                bmin -= 1;
797                kvdb[kf(bmin - 1)] = XDL_LINE_MAX;
798            } else {
799                bmin += 1;
800            }
801            if bmax < dmax {
802                bmax += 1;
803                kvdb[kf(bmax + 1)] = XDL_LINE_MAX;
804            } else {
805                bmax -= 1;
806            }
807
808            let mut d = bmax;
809            while d >= bmin {
810                let mut i1 = if kvdb[kf(d - 1)] < kvdb[kf(d + 1)] {
811                    kvdb[kf(d - 1)]
812                } else {
813                    kvdb[kf(d + 1)] - 1
814                };
815                let prev1 = i1;
816                let mut i2 = i1 - d;
817                while i1 > off1 && i2 > off2 && get_id(xdf1, i1 - 1) == get_id(xdf2, i2 - 1) {
818                    i1 -= 1;
819                    i2 -= 1;
820                }
821                if prev1 - i1 > xenv.snake_cnt {
822                    got_snake = true;
823                }
824                kvdb[kf(d)] = i1;
825                if !odd && fmin <= d && d <= fmax && i1 <= kvdf[kf(d)] {
826                    spl.i1 = i1;
827                    spl.i2 = i2;
828                    spl.min_lo = true;
829                    spl.min_hi = true;
830                    return ec;
831                }
832                d -= 2;
833            }
834
835            if need_min {
836                ec += 1;
837                continue;
838            }
839
840            if got_snake && ec > xenv.heur_min {
841                let mut best = 0i64;
842                let mut d = fmax;
843                while d >= fmin {
844                    let dd = if d > fmid { d - fmid } else { fmid - d };
845                    let i1 = kvdf[kf(d)];
846                    let i2 = i1 - d;
847                    let v = (i1 - off1) + (i2 - off2) - dd;
848                    if v > XDL_K_HEUR * ec
849                        && v > best
850                        && off1 + xenv.snake_cnt <= i1
851                        && i1 < lim1
852                        && off2 + xenv.snake_cnt <= i2
853                        && i2 < lim2
854                    {
855                        let mut k = 1i64;
856                        while get_id(xdf1, i1 - k) == get_id(xdf2, i2 - k) {
857                            if k == xenv.snake_cnt {
858                                best = v;
859                                spl.i1 = i1;
860                                spl.i2 = i2;
861                                break;
862                            }
863                            k += 1;
864                        }
865                    }
866                    d -= 2;
867                }
868                if best > 0 {
869                    spl.min_lo = true;
870                    spl.min_hi = false;
871                    return ec;
872                }
873
874                let mut best = 0i64;
875                let mut d = bmax;
876                while d >= bmin {
877                    let dd = if d > bmid { d - bmid } else { bmid - d };
878                    let i1 = kvdb[kf(d)];
879                    let i2 = i1 - d;
880                    let v = (lim1 - i1) + (lim2 - i2) - dd;
881                    if v > XDL_K_HEUR * ec
882                        && v > best
883                        && off1 < i1
884                        && i1 <= lim1 - xenv.snake_cnt
885                        && off2 < i2
886                        && i2 <= lim2 - xenv.snake_cnt
887                    {
888                        let mut k = 0i64;
889                        while get_id(xdf1, i1 + k) == get_id(xdf2, i2 + k) {
890                            if k == xenv.snake_cnt - 1 {
891                                best = v;
892                                spl.i1 = i1;
893                                spl.i2 = i2;
894                                break;
895                            }
896                            k += 1;
897                        }
898                    }
899                    d -= 2;
900                }
901                if best > 0 {
902                    spl.min_lo = false;
903                    spl.min_hi = true;
904                    return ec;
905                }
906            }
907
908            if ec >= xenv.mxcost {
909                let mut fbest = -1i64;
910                let mut fbest1 = -1i64;
911                let mut d = fmax;
912                while d >= fmin {
913                    let mut i1 = kvdf[kf(d)].min(lim1);
914                    let mut i2 = i1 - d;
915                    if lim2 < i2 {
916                        i1 = lim2 + d;
917                        i2 = lim2;
918                    }
919                    if fbest < i1 + i2 {
920                        fbest = i1 + i2;
921                        fbest1 = i1;
922                    }
923                    d -= 2;
924                }
925
926                let mut bbest = XDL_LINE_MAX;
927                let mut bbest1 = XDL_LINE_MAX;
928                let mut d = bmax;
929                while d >= bmin {
930                    let mut i1 = off1.max(kvdb[kf(d)]);
931                    let mut i2 = i1 - d;
932                    if i2 < off2 {
933                        i1 = off2 + d;
934                        i2 = off2;
935                    }
936                    if i1 + i2 < bbest {
937                        bbest = i1 + i2;
938                        bbest1 = i1;
939                    }
940                    d -= 2;
941                }
942
943                if (lim1 + lim2) - bbest < fbest - (off1 + off2) {
944                    spl.i1 = fbest1;
945                    spl.i2 = fbest - fbest1;
946                    spl.min_lo = true;
947                    spl.min_hi = false;
948                } else {
949                    spl.i1 = bbest1;
950                    spl.i2 = bbest - bbest1;
951                    spl.min_lo = false;
952                    spl.min_hi = true;
953                }
954                return ec;
955            }
956
957            ec += 1;
958        }
959    }
960
961    /// Faithful port of `xdl_recs_cmp` (divide & conquer). Marks `changed` flags.
962    #[allow(clippy::too_many_arguments)]
963    fn xdl_recs_cmp(
964        xdf1: &mut XdFile<'_>,
965        mut off1: i64,
966        mut lim1: i64,
967        xdf2: &mut XdFile<'_>,
968        mut off2: i64,
969        mut lim2: i64,
970        kvdf: &mut [i64],
971        kvdb: &mut [i64],
972        koff: i64,
973        need_min: bool,
974        xenv: &XdAlgoEnv,
975    ) {
976        while off1 < lim1 && off2 < lim2 && get_id(xdf1, off1) == get_id(xdf2, off2) {
977            off1 += 1;
978            off2 += 1;
979        }
980        while off1 < lim1 && off2 < lim2 && get_id(xdf1, lim1 - 1) == get_id(xdf2, lim2 - 1) {
981            lim1 -= 1;
982            lim2 -= 1;
983        }
984
985        if off1 == lim1 {
986            while off2 < lim2 {
987                let r = xdf2.reference_index[off2 as usize];
988                xdf2.changed[r] = true;
989                off2 += 1;
990            }
991        } else if off2 == lim2 {
992            while off1 < lim1 {
993                let r = xdf1.reference_index[off1 as usize];
994                xdf1.changed[r] = true;
995                off1 += 1;
996            }
997        } else {
998            let mut spl = XdpSplit {
999                i1: 0,
1000                i2: 0,
1001                min_lo: false,
1002                min_hi: false,
1003            };
1004            xdl_split(
1005                xdf1, off1, lim1, xdf2, off2, lim2, kvdf, kvdb, koff, need_min, &mut spl, xenv,
1006            );
1007            xdl_recs_cmp(
1008                xdf1, off1, spl.i1, xdf2, off2, spl.i2, kvdf, kvdb, koff, spl.min_lo, xenv,
1009            );
1010            xdl_recs_cmp(
1011                xdf1, spl.i1, lim1, xdf2, spl.i2, lim2, kvdf, kvdb, koff, spl.min_hi, xenv,
1012            );
1013        }
1014    }
1015
1016    /// `xdl_clean_mmatch`: decide whether a multimatch record should be discarded.
1017    fn clean_mmatch(action: &[u8], i: i64, mut s: i64, mut e: i64) -> bool {
1018        if i - s > XDL_SIMSCAN_WINDOW {
1019            s = i - XDL_SIMSCAN_WINDOW;
1020        }
1021        if e - i > XDL_SIMSCAN_WINDOW {
1022            e = i + XDL_SIMSCAN_WINDOW;
1023        }
1024
1025        let mut rdis0 = 0i64;
1026        let mut rpdis0 = 1i64;
1027        let mut r = 1i64;
1028        while i - r >= s {
1029            match action[(i - r) as usize] {
1030                DISCARD => rdis0 += 1,
1031                INVESTIGATE => rpdis0 += 1,
1032                _ => break, // KEEP
1033            }
1034            r += 1;
1035        }
1036        if rdis0 == 0 {
1037            return false;
1038        }
1039        let mut rdis1 = 0i64;
1040        let mut rpdis1 = 1i64;
1041        let mut r = 1i64;
1042        while i + r <= e {
1043            match action[(i + r) as usize] {
1044                DISCARD => rdis1 += 1,
1045                INVESTIGATE => rpdis1 += 1,
1046                _ => break, // KEEP
1047            }
1048            r += 1;
1049        }
1050        if rdis1 == 0 {
1051            return false;
1052        }
1053        rdis1 += rdis0;
1054        rpdis1 += rpdis0;
1055        rpdis1 * XDL_KPDIS_RUN < (rpdis1 + rdis1)
1056    }
1057
1058    /// Build the `changed` flags for both token streams using Git's xdiff Myers
1059    /// pipeline. `ids1`/`ids2` are interned token ids (equal id <=> equal token).
1060    /// Returns `(changed1, changed2)`, one bool per original token.
1061    pub fn changed_flags(ids1: &[u32], ids2: &[u32]) -> (Vec<bool>, Vec<bool>) {
1062        let nrec1 = ids1.len();
1063        let nrec2 = ids2.len();
1064
1065        // Per-token occurrence counts on each side (class len1/len2).
1066        use std::collections::HashMap;
1067        let mut count1: HashMap<u32, i64> = HashMap::new();
1068        let mut count2: HashMap<u32, i64> = HashMap::new();
1069        for &id in ids1 {
1070            *count1.entry(id).or_insert(0) += 1;
1071        }
1072        for &id in ids2 {
1073            *count2.entry(id).or_insert(0) += 1;
1074        }
1075
1076        let mut xdf1 = XdFile {
1077            ids: ids1,
1078            changed: vec![false; nrec1],
1079            reference_index: Vec::new(),
1080            dstart: 0,
1081            dend: nrec1 as i64 - 1,
1082            nreff: 0,
1083        };
1084        let mut xdf2 = XdFile {
1085            ids: ids2,
1086            changed: vec![false; nrec2],
1087            reference_index: Vec::new(),
1088            dstart: 0,
1089            dend: nrec2 as i64 - 1,
1090            nreff: 0,
1091        };
1092
1093        // xdl_trim_ends: trim leading/trailing matching records.
1094        let lim = nrec1.min(nrec2) as i64;
1095        let mut i = 0i64;
1096        while i < lim && ids1[i as usize] == ids2[i as usize] {
1097            i += 1;
1098        }
1099        xdf1.dstart = i;
1100        xdf2.dstart = i;
1101        let mut j = 0i64;
1102        let rem = lim - i;
1103        while j < rem && ids1[nrec1 - 1 - j as usize] == ids2[nrec2 - 1 - j as usize] {
1104            j += 1;
1105        }
1106        xdf1.dend = nrec1 as i64 - j - 1;
1107        xdf2.dend = nrec2 as i64 - j - 1;
1108
1109        // xdl_cleanup_records: classify and reduce to effective records.
1110        let mut action1 = vec![0u8; nrec1 + 1];
1111        let mut action2 = vec![0u8; nrec2 + 1];
1112
1113        let mut mlim = bogosqrt(nrec1 as i64);
1114        if mlim > XDL_MAX_EQLIMIT {
1115            mlim = XDL_MAX_EQLIMIT;
1116        }
1117        let mut idx = xdf1.dstart;
1118        while idx <= xdf1.dend {
1119            let id = ids1[idx as usize];
1120            let nm = *count2.get(&id).unwrap_or(&0);
1121            action1[idx as usize] = if nm == 0 {
1122                DISCARD
1123            } else if nm >= mlim {
1124                INVESTIGATE
1125            } else {
1126                KEEP
1127            };
1128            idx += 1;
1129        }
1130
1131        let mut mlim = bogosqrt(nrec2 as i64);
1132        if mlim > XDL_MAX_EQLIMIT {
1133            mlim = XDL_MAX_EQLIMIT;
1134        }
1135        let mut idx = xdf2.dstart;
1136        while idx <= xdf2.dend {
1137            let id = ids2[idx as usize];
1138            let nm = *count1.get(&id).unwrap_or(&0);
1139            action2[idx as usize] = if nm == 0 {
1140                DISCARD
1141            } else if nm >= mlim {
1142                INVESTIGATE
1143            } else {
1144                KEEP
1145            };
1146            idx += 1;
1147        }
1148
1149        let mut idx = xdf1.dstart;
1150        while idx <= xdf1.dend {
1151            let a = action1[idx as usize];
1152            if a == KEEP
1153                || (a == INVESTIGATE && !clean_mmatch(&action1, idx, xdf1.dstart, xdf1.dend))
1154            {
1155                xdf1.reference_index.push(idx as usize);
1156            } else {
1157                xdf1.changed[idx as usize] = true;
1158            }
1159            idx += 1;
1160        }
1161        xdf1.nreff = xdf1.reference_index.len() as i64;
1162
1163        let mut idx = xdf2.dstart;
1164        while idx <= xdf2.dend {
1165            let a = action2[idx as usize];
1166            if a == KEEP
1167                || (a == INVESTIGATE && !clean_mmatch(&action2, idx, xdf2.dstart, xdf2.dend))
1168            {
1169                xdf2.reference_index.push(idx as usize);
1170            } else {
1171                xdf2.changed[idx as usize] = true;
1172            }
1173            idx += 1;
1174        }
1175        xdf2.nreff = xdf2.reference_index.len() as i64;
1176
1177        // Allocate K vectors (xdl_do_diff). koff lets us use negative diagonals.
1178        let ndiags = xdf1.nreff + xdf2.nreff + 3;
1179        let kvd_len = (2 * ndiags + 2) as usize;
1180        let mut kvd = vec![0i64; kvd_len];
1181        // kvdf base offset = nreff2 + 1; kvdb base offset = ndiags + nreff2 + 1.
1182        let koff = xdf2.nreff + 1;
1183        let (kvdf_slice, kvdb_slice) = kvd.split_at_mut(ndiags as usize);
1184        // Both slices are indexed as [k + koff]; their length covers the diagonal range.
1185
1186        let xenv = XdAlgoEnv {
1187            mxcost: bogosqrt(ndiags).max(XDL_MAX_COST_MIN),
1188            snake_cnt: XDL_SNAKE_CNT,
1189            heur_min: XDL_HEUR_MIN_COST,
1190        };
1191
1192        let nreff1 = xdf1.nreff;
1193        let nreff2 = xdf2.nreff;
1194        xdl_recs_cmp(
1195            &mut xdf1, 0, nreff1, &mut xdf2, 0, nreff2, kvdf_slice, kvdb_slice, koff, false, &xenv,
1196        );
1197
1198        (xdf1.changed, xdf2.changed)
1199    }
1200}
1201
1202/// Convert per-token `changed` flags (Git xdiff style) into `similar::DiffOp`s.
1203fn changed_flags_to_ops(
1204    changed1: &[bool],
1205    changed2: &[bool],
1206    old_len: usize,
1207    new_len: usize,
1208) -> Vec<similar::DiffOp> {
1209    use similar::DiffOp;
1210    let mut ops: Vec<DiffOp> = Vec::new();
1211    let mut i = 0usize;
1212    let mut j = 0usize;
1213    while i < old_len || j < new_len {
1214        let del = i < old_len && changed1[i];
1215        let ins = j < new_len && changed2[j];
1216        if !del && !ins {
1217            // Equal run.
1218            let start_i = i;
1219            let start_j = j;
1220            while i < old_len && j < new_len && !changed1[i] && !changed2[j] {
1221                i += 1;
1222                j += 1;
1223            }
1224            ops.push(DiffOp::Equal {
1225                old_index: start_i,
1226                new_index: start_j,
1227                len: i - start_i,
1228            });
1229        } else {
1230            // Changed run: consecutive deletions then/and insertions.
1231            let start_i = i;
1232            let start_j = j;
1233            while i < old_len && changed1[i] {
1234                i += 1;
1235            }
1236            while j < new_len && changed2[j] {
1237                j += 1;
1238            }
1239            let dlen = i - start_i;
1240            let ilen = j - start_j;
1241            if dlen > 0 && ilen > 0 {
1242                ops.push(DiffOp::Replace {
1243                    old_index: start_i,
1244                    old_len: dlen,
1245                    new_index: start_j,
1246                    new_len: ilen,
1247                });
1248            } else if dlen > 0 {
1249                ops.push(DiffOp::Delete {
1250                    old_index: start_i,
1251                    old_len: dlen,
1252                    new_index: start_j,
1253                });
1254            } else if ilen > 0 {
1255                ops.push(DiffOp::Insert {
1256                    old_index: start_i,
1257                    new_index: start_j,
1258                    new_len: ilen,
1259                });
1260            }
1261        }
1262    }
1263    ops
1264}
1265
1266/// Diff two token streams with a Git-compatible implementation of the xdiff Myers engine and return the
1267/// result as `similar::DiffOp`s. Used by the word-diff machinery, where matching Git's exact
1268/// record selection and tie-breaking matters: `imara-diff`/`similar` pick different — but
1269/// equally minimal — alignments for streams with many repeated tokens (e.g. the `bibtex`
1270/// driver's `{`/`}`/`,`), mismatching Git's reference output.
1271#[must_use]
1272pub fn word_diff_ops_imara(old_words: &[&str], new_words: &[&str]) -> Vec<similar::DiffOp> {
1273    use std::collections::HashMap;
1274
1275    // Intern tokens to ids (equal id <=> equal token), mirroring xdiff's record hashing.
1276    let mut interner: HashMap<&str, u32> = HashMap::new();
1277    let mut ids1: Vec<u32> = Vec::with_capacity(old_words.len());
1278    for &w in old_words {
1279        let next = interner.len() as u32;
1280        let id = *interner.entry(w).or_insert(next);
1281        ids1.push(id);
1282    }
1283    let mut ids2: Vec<u32> = Vec::with_capacity(new_words.len());
1284    for &w in new_words {
1285        let next = interner.len() as u32;
1286        let id = *interner.entry(w).or_insert(next);
1287        ids2.push(id);
1288    }
1289
1290    let (changed1, changed2) = git_xdiff::changed_flags(&ids1, &ids2);
1291    let ops = changed_flags_to_ops(&changed1, &changed2, old_words.len(), new_words.len());
1292
1293    // Slide changed runs to Git's canonical position (`xdl_change_compact`); the word
1294    // diff never enables the indent heuristic.
1295    diff_indent_heuristic::apply_change_compact_to_ops(&ops, old_words, new_words, false)
1296}
1297
1298/// Line-diff ops for string slices after Git `xdl_change_compact` (and optional indent heuristic).
1299#[must_use]
1300pub fn diff_slice_ops_compacted(
1301    old_lines: &[&str],
1302    new_lines: &[&str],
1303    algorithm: similar::Algorithm,
1304    indent_heuristic: bool,
1305) -> Vec<similar::DiffOp> {
1306    diff_indent_heuristic::diff_slice_ops_compacted(
1307        old_lines,
1308        new_lines,
1309        algorithm,
1310        indent_heuristic,
1311    )
1312}
1313
1314/// Map each line in `new_joined` to its origin in `old_joined` after Git-style compaction (for blame).
1315#[must_use]
1316pub fn map_new_to_old_lines_compacted(
1317    old_joined: &str,
1318    new_joined: &str,
1319    algorithm: similar::Algorithm,
1320    indent_heuristic: bool,
1321    new_line_count: usize,
1322) -> Vec<Option<usize>> {
1323    let ops = diff_indent_heuristic::diff_lines_ops_compacted(
1324        old_joined,
1325        new_joined,
1326        algorithm,
1327        indent_heuristic,
1328    );
1329    diff_indent_heuristic::map_new_to_old_from_ops(&ops, new_line_count)
1330}
1331
1332/// The kind of change between two sides of a diff.
1333#[derive(Debug, Clone, Copy, PartialEq, Eq)]
1334pub enum DiffStatus {
1335    /// File was added.
1336    Added,
1337    /// File was deleted.
1338    Deleted,
1339    /// File was modified (content or mode change).
1340    Modified,
1341    /// File was renamed (with optional content change).
1342    Renamed,
1343    /// File was copied.
1344    Copied,
1345    /// File type changed (e.g. regular → symlink).
1346    TypeChanged,
1347    /// Unmerged (conflict).
1348    Unmerged,
1349}
1350
1351impl DiffStatus {
1352    /// Single-character status letter used in raw diff output.
1353    #[must_use]
1354    pub fn letter(&self) -> char {
1355        match self {
1356            Self::Added => 'A',
1357            Self::Deleted => 'D',
1358            Self::Modified => 'M',
1359            Self::Renamed => 'R',
1360            Self::Copied => 'C',
1361            Self::TypeChanged => 'T',
1362            Self::Unmerged => 'U',
1363        }
1364    }
1365}
1366
1367/// A single diff entry representing one changed path.
1368#[derive(Debug, Clone, PartialEq, Eq)]
1369pub struct DiffEntry {
1370    /// The status of this change.
1371    pub status: DiffStatus,
1372    /// Path in the "old" side (None for Added).
1373    pub old_path: Option<String>,
1374    /// Path in the "new" side (None for Deleted).
1375    pub new_path: Option<String>,
1376    /// Old file mode (as octal string, e.g. "100644").
1377    pub old_mode: String,
1378    /// New file mode.
1379    pub new_mode: String,
1380    /// Old object ID (zero OID for Added).
1381    pub old_oid: ObjectId,
1382    /// New object ID (zero OID for Deleted).
1383    pub new_oid: ObjectId,
1384    /// Similarity score (0–100) for renames/copies.
1385    pub score: Option<u32>,
1386}
1387
1388impl DiffEntry {
1389    /// The primary path for display (new_path for adds, old_path for deletes).
1390    #[must_use]
1391    pub fn path(&self) -> &str {
1392        self.new_path
1393            .as_deref()
1394            .or(self.old_path.as_deref())
1395            .unwrap_or("")
1396    }
1397
1398    /// Return a human-oriented path display for this entry.
1399    ///
1400    /// For renames and copies this returns `old -> new`; for all other entry
1401    /// kinds this returns the primary path.
1402    #[must_use]
1403    pub fn display_path(&self) -> String {
1404        match self.status {
1405            DiffStatus::Renamed | DiffStatus::Copied => {
1406                let old = self.old_path.as_deref().unwrap_or("");
1407                let new = self.new_path.as_deref().unwrap_or("");
1408                if old.is_empty() || new.is_empty() {
1409                    self.path().to_owned()
1410                } else {
1411                    format!("{old} -> {new}")
1412                }
1413            }
1414            _ => self.path().to_owned(),
1415        }
1416    }
1417}
1418
1419/// The zero (null) object ID used for "no object" in diff output.
1420pub const ZERO_OID: &str = "0000000000000000000000000000000000000000";
1421
1422/// Return the zero ObjectId.
1423#[must_use]
1424pub fn zero_oid() -> ObjectId {
1425    ObjectId::from_bytes(&[0u8; 20]).unwrap_or_else(|_| {
1426        // This should never fail since we pass exactly 20 bytes
1427        panic!("internal error: failed to create zero OID");
1428    })
1429}
1430
1431/// Return the ObjectId for the empty blob object.
1432#[must_use]
1433pub fn empty_blob_oid() -> ObjectId {
1434    ObjectId::from_hex("e69de29bb2d1d6434b8b29ae775ad8c2e48c5391").unwrap_or_else(|_| {
1435        // This should never fail since the object ID literal is valid.
1436        panic!("internal error: failed to create empty blob OID");
1437    })
1438}
1439
1440// ── Tree-to-tree diff ───────────────────────────────────────────────
1441
1442/// Compare two trees and return the list of changed entries.
1443///
1444/// # Parameters
1445///
1446/// - `odb` — object database to read tree objects from.
1447/// - `old_tree_oid` — OID of the old tree (or `None` for comparison against empty).
1448/// - `new_tree_oid` — OID of the new tree (or `None` for comparison against empty).
1449/// - `prefix` — path prefix for nested tree recursion (empty string for root).
1450///
1451/// # Errors
1452///
1453/// Returns errors from object database reads.
1454pub fn diff_trees(
1455    odb: &Odb,
1456    old_tree_oid: Option<&ObjectId>,
1457    new_tree_oid: Option<&ObjectId>,
1458    prefix: &str,
1459) -> Result<Vec<DiffEntry>> {
1460    diff_trees_opts(odb, old_tree_oid, new_tree_oid, prefix, false)
1461}
1462
1463/// Like `diff_trees` but with `show_trees` flag: when true, emit entries for
1464/// tree objects themselves in addition to their recursive contents (the `-t`
1465/// flag of `diff-tree`).
1466pub fn diff_trees_show_tree_entries(
1467    odb: &Odb,
1468    old_tree_oid: Option<&ObjectId>,
1469    new_tree_oid: Option<&ObjectId>,
1470    prefix: &str,
1471) -> Result<Vec<DiffEntry>> {
1472    diff_trees_opts(odb, old_tree_oid, new_tree_oid, prefix, true)
1473}
1474
1475fn diff_trees_opts(
1476    odb: &Odb,
1477    old_tree_oid: Option<&ObjectId>,
1478    new_tree_oid: Option<&ObjectId>,
1479    prefix: &str,
1480    show_trees: bool,
1481) -> Result<Vec<DiffEntry>> {
1482    let old_entries = match old_tree_oid {
1483        Some(oid) => read_tree(odb, oid)?,
1484        None => Vec::new(),
1485    };
1486    let new_entries = match new_tree_oid {
1487        Some(oid) => read_tree(odb, oid)?,
1488        None => Vec::new(),
1489    };
1490
1491    let mut result = Vec::new();
1492    diff_tree_entries_opts(
1493        odb,
1494        &old_entries,
1495        &new_entries,
1496        prefix,
1497        show_trees,
1498        &mut result,
1499    )?;
1500    Ok(result)
1501}
1502
1503/// Read and parse a tree object from the ODB.
1504fn read_tree(odb: &Odb, oid: &ObjectId) -> Result<Vec<TreeEntry>> {
1505    let obj = odb.read(oid)?;
1506    if obj.kind != ObjectKind::Tree {
1507        return Err(Error::CorruptObject(format!(
1508            "expected tree, got {}",
1509            obj.kind.as_str()
1510        )));
1511    }
1512    parse_tree(&obj.data)
1513}
1514
1515/// Compare two sorted lists of tree entries, recursing into subtrees.
1516fn diff_tree_entries_opts(
1517    odb: &Odb,
1518    old: &[TreeEntry],
1519    new: &[TreeEntry],
1520    prefix: &str,
1521    show_trees: bool,
1522    result: &mut Vec<DiffEntry>,
1523) -> Result<()> {
1524    let mut oi = 0;
1525    let mut ni = 0;
1526
1527    while oi < old.len() || ni < new.len() {
1528        match (old.get(oi), new.get(ni)) {
1529            (Some(o), Some(n)) => {
1530                let cmp = crate::objects::tree_entry_cmp(
1531                    &o.name,
1532                    is_tree_mode(o.mode),
1533                    &n.name,
1534                    is_tree_mode(n.mode),
1535                );
1536                match cmp {
1537                    std::cmp::Ordering::Less => {
1538                        // Old entry not in new → deleted
1539                        emit_deleted_opts(odb, o, prefix, show_trees, result)?;
1540                        oi += 1;
1541                    }
1542                    std::cmp::Ordering::Greater => {
1543                        // New entry not in old → added
1544                        emit_added_opts(odb, n, prefix, show_trees, result)?;
1545                        ni += 1;
1546                    }
1547                    std::cmp::Ordering::Equal => {
1548                        // Both present — check for changes
1549                        if o.oid != n.oid || o.mode != n.mode {
1550                            let name_str = String::from_utf8_lossy(&o.name);
1551                            let path = format_path(prefix, &name_str);
1552                            if is_tree_mode(o.mode) && is_tree_mode(n.mode) {
1553                                // Both are trees
1554                                if show_trees {
1555                                    result.push(DiffEntry {
1556                                        status: DiffStatus::Modified,
1557                                        old_path: Some(path.clone()),
1558                                        new_path: Some(path.clone()),
1559                                        old_mode: format_mode(o.mode),
1560                                        new_mode: format_mode(n.mode),
1561                                        old_oid: o.oid,
1562                                        new_oid: n.oid,
1563                                        score: None,
1564                                    });
1565                                }
1566                                // Recurse
1567                                let nested = diff_trees_opts(
1568                                    odb,
1569                                    Some(&o.oid),
1570                                    Some(&n.oid),
1571                                    &path,
1572                                    show_trees,
1573                                )?;
1574                                result.extend(nested);
1575                            } else if is_tree_mode(o.mode) && !is_tree_mode(n.mode) {
1576                                // Tree → blob: delete tree contents, add blob
1577                                emit_deleted_opts(odb, o, prefix, show_trees, result)?;
1578                                emit_added_opts(odb, n, prefix, show_trees, result)?;
1579                            } else if !is_tree_mode(o.mode) && is_tree_mode(n.mode) {
1580                                // Blob → tree: delete blob, add tree contents
1581                                emit_deleted_opts(odb, o, prefix, show_trees, result)?;
1582                                emit_added_opts(odb, n, prefix, show_trees, result)?;
1583                            } else {
1584                                // Both blobs — modified.
1585                                // A mode-only change (e.g. chmod) is Modified.
1586                                // TypeChanged is only for actual type changes (blob ↔ symlink).
1587                                let old_type = o.mode & 0o170000;
1588                                let new_type = n.mode & 0o170000;
1589                                result.push(DiffEntry {
1590                                    status: if old_type != new_type {
1591                                        DiffStatus::TypeChanged
1592                                    } else {
1593                                        DiffStatus::Modified
1594                                    },
1595                                    old_path: Some(path.clone()),
1596                                    new_path: Some(path),
1597                                    old_mode: format_mode(o.mode),
1598                                    new_mode: format_mode(n.mode),
1599                                    old_oid: o.oid,
1600                                    new_oid: n.oid,
1601                                    score: None,
1602                                });
1603                            }
1604                        }
1605                        oi += 1;
1606                        ni += 1;
1607                    }
1608                }
1609            }
1610            (Some(o), None) => {
1611                emit_deleted_opts(odb, o, prefix, show_trees, result)?;
1612                oi += 1;
1613            }
1614            (None, Some(n)) => {
1615                emit_added_opts(odb, n, prefix, show_trees, result)?;
1616                ni += 1;
1617            }
1618            (None, None) => break,
1619        }
1620    }
1621
1622    Ok(())
1623}
1624
1625fn emit_deleted_opts(
1626    odb: &Odb,
1627    entry: &TreeEntry,
1628    prefix: &str,
1629    show_trees: bool,
1630    result: &mut Vec<DiffEntry>,
1631) -> Result<()> {
1632    let name_str = String::from_utf8_lossy(&entry.name);
1633    let path = format_path(prefix, &name_str);
1634    if is_tree_mode(entry.mode) {
1635        if show_trees {
1636            result.push(DiffEntry {
1637                status: DiffStatus::Deleted,
1638                old_path: Some(path.clone()),
1639                new_path: None,
1640                old_mode: format_mode(entry.mode),
1641                new_mode: "000000".to_owned(),
1642                old_oid: entry.oid,
1643                new_oid: zero_oid(),
1644                score: None,
1645            });
1646        }
1647        // Recurse into deleted tree
1648        let nested = diff_trees_opts(odb, Some(&entry.oid), None, &path, show_trees)?;
1649        result.extend(nested);
1650    } else {
1651        result.push(DiffEntry {
1652            status: DiffStatus::Deleted,
1653            old_path: Some(path.clone()),
1654            new_path: None,
1655            old_mode: format_mode(entry.mode),
1656            new_mode: "000000".to_owned(),
1657            old_oid: entry.oid,
1658            new_oid: zero_oid(),
1659            score: None,
1660        });
1661    }
1662    Ok(())
1663}
1664
1665fn emit_added_opts(
1666    odb: &Odb,
1667    entry: &TreeEntry,
1668    prefix: &str,
1669    show_trees: bool,
1670    result: &mut Vec<DiffEntry>,
1671) -> Result<()> {
1672    let name_str = String::from_utf8_lossy(&entry.name);
1673    let path = format_path(prefix, &name_str);
1674    if is_tree_mode(entry.mode) {
1675        if show_trees {
1676            result.push(DiffEntry {
1677                status: DiffStatus::Added,
1678                old_path: None,
1679                new_path: Some(path.clone()),
1680                old_mode: "000000".to_owned(),
1681                new_mode: format_mode(entry.mode),
1682                old_oid: zero_oid(),
1683                new_oid: entry.oid,
1684                score: None,
1685            });
1686        }
1687        // Recurse into added tree
1688        let nested = diff_trees_opts(odb, None, Some(&entry.oid), &path, show_trees)?;
1689        result.extend(nested);
1690    } else {
1691        result.push(DiffEntry {
1692            status: DiffStatus::Added,
1693            old_path: None,
1694            new_path: Some(path),
1695            old_mode: "000000".to_owned(),
1696            new_mode: format_mode(entry.mode),
1697            old_oid: zero_oid(),
1698            new_oid: entry.oid,
1699            score: None,
1700        });
1701    }
1702    Ok(())
1703}
1704
1705// ── Index-to-tree diff (staged changes) ─────────────────────────────
1706
1707/// Compare the index against a tree (usually HEAD's tree).
1708///
1709/// This shows "staged" changes — what would be committed.
1710///
1711/// # Parameters
1712///
1713/// - `odb` — object database.
1714/// - `index` — the current index.
1715/// - `tree_oid` — the tree to compare against (e.g. HEAD's tree), or `None`
1716///   for comparison against an empty tree (initial commit).
1717///
1718/// # Errors
1719///
1720/// Returns errors from ODB reads.
1721///
1722/// When `ignore_submodules` is true, gitlink (`160000`) paths are omitted from the diff, matching
1723/// Git's `require_clean_work_tree(..., ignore_submodules=1)` used by `git rebase` / `git pull`.
1724pub fn diff_index_to_tree(
1725    odb: &Odb,
1726    index: &Index,
1727    tree_oid: Option<&ObjectId>,
1728    ignore_submodules: bool,
1729) -> Result<Vec<DiffEntry>> {
1730    // Flatten the tree into a sorted list of (path, mode, oid)
1731    let tree_entries = match tree_oid {
1732        Some(oid) => flatten_tree(odb, oid, "")?,
1733        None => Vec::new(),
1734    };
1735
1736    // Build maps keyed by path
1737    let mut tree_map: std::collections::BTreeMap<&str, &FlatEntry> =
1738        std::collections::BTreeMap::new();
1739    for entry in &tree_entries {
1740        tree_map.insert(&entry.path, entry);
1741    }
1742
1743    let mut result = Vec::new();
1744    let mut stage0_paths = std::collections::BTreeSet::new();
1745    let mut unmerged_modes: std::collections::BTreeMap<String, (u8, u32)> =
1746        std::collections::BTreeMap::new();
1747
1748    // Check index entries against tree
1749    for ie in &index.entries {
1750        let path = String::from_utf8_lossy(&ie.path).to_string();
1751        if ie.stage() == 0 && ie.intent_to_add() {
1752            // Intent-to-add entries are not "staged" for diff-index / status
1753            // (matches Git: `git diff --cached` is empty for `-N` paths).
1754            continue;
1755        }
1756        if ie.stage() != 0 {
1757            let rank = match ie.stage() {
1758                2 => 0u8,
1759                3 => 1u8,
1760                1 => 2u8,
1761                _ => 3u8,
1762            };
1763            match unmerged_modes.get(&path) {
1764                Some((existing_rank, _)) if *existing_rank <= rank => {}
1765                _ => {
1766                    unmerged_modes.insert(path, (rank, ie.mode));
1767                }
1768            }
1769            continue;
1770        }
1771        if ignore_submodules && ie.mode == 0o160000 {
1772            let _ = tree_map.remove(path.as_str());
1773            stage0_paths.insert(path.clone());
1774            continue;
1775        }
1776        stage0_paths.insert(path.clone());
1777        match tree_map.remove(path.as_str()) {
1778            Some(te) => {
1779                // Present in both — check for differences
1780                if te.oid != ie.oid || te.mode != ie.mode {
1781                    result.push(DiffEntry {
1782                        status: DiffStatus::Modified,
1783                        old_path: Some(path.clone()),
1784                        new_path: Some(path),
1785                        old_mode: format_mode(te.mode),
1786                        new_mode: format_mode(ie.mode),
1787                        old_oid: te.oid,
1788                        new_oid: ie.oid,
1789                        score: None,
1790                    });
1791                }
1792            }
1793            None => {
1794                // In index but not tree → added
1795                result.push(DiffEntry {
1796                    status: DiffStatus::Added,
1797                    old_path: None,
1798                    new_path: Some(path),
1799                    old_mode: "000000".to_owned(),
1800                    new_mode: format_mode(ie.mode),
1801                    old_oid: zero_oid(),
1802                    new_oid: ie.oid,
1803                    score: None,
1804                });
1805            }
1806        }
1807    }
1808
1809    for (path, (_, mode)) in &unmerged_modes {
1810        if stage0_paths.contains(path) {
1811            continue;
1812        }
1813        tree_map.remove(path.as_str());
1814        result.push(DiffEntry {
1815            status: DiffStatus::Unmerged,
1816            old_path: Some(path.clone()),
1817            new_path: Some(path.clone()),
1818            old_mode: "000000".to_owned(),
1819            new_mode: format_mode(*mode),
1820            old_oid: zero_oid(),
1821            new_oid: zero_oid(),
1822            score: None,
1823        });
1824    }
1825
1826    // Remaining tree entries not in index → deleted
1827    for (path, te) in tree_map {
1828        if ignore_submodules && te.mode == 0o160000 {
1829            continue;
1830        }
1831        result.push(DiffEntry {
1832            status: DiffStatus::Deleted,
1833            old_path: Some(path.to_owned()),
1834            new_path: None,
1835            old_mode: format_mode(te.mode),
1836            new_mode: "000000".to_owned(),
1837            old_oid: te.oid,
1838            new_oid: zero_oid(),
1839            score: None,
1840        });
1841    }
1842
1843    result.sort_by(|a, b| a.path().cmp(b.path()));
1844    Ok(result)
1845}
1846
1847// ── Index-to-worktree diff (unstaged changes) ───────────────────────
1848
1849/// Compare the index against the working tree.
1850///
1851/// This shows "unstaged" changes — modifications not yet staged.
1852///
1853/// Entries with [`IndexEntry::assume_unchanged`] or [`IndexEntry::skip_worktree`] are treated as
1854/// matching the work tree without examining the filesystem (Git `CE_VALID` / skip-worktree).
1855///
1856/// # Parameters
1857///
1858/// - `odb` — object database (for hashing worktree files).
1859/// - `index` — the current index.
1860/// - `work_tree` — path to the working tree root.
1861/// - `ignore_submodule_untracked` — when true, gitlink entries are not dirty solely from untracked
1862///   files inside the submodule (matches `git status -uno`).
1863/// - `simplify_gitlinks` — when true, nested gitlink entries only compare the submodule checkout
1864///   HEAD to the recorded OID (ignore dirty work trees inside nested submodules). Used when
1865///   computing `submodule_porcelain_flags` so untracked files under a nested submodule do not set
1866///   the parent submodule's `modified` bit (Git `DIRTY_SUBMODULE_MODIFIED`; t7506).
1867///
1868/// # Errors
1869///
1870/// Returns errors from I/O or hashing.
1871pub fn diff_index_to_worktree(
1872    odb: &Odb,
1873    index: &Index,
1874    work_tree: &Path,
1875    ignore_submodule_untracked: bool,
1876    simplify_gitlinks: bool,
1877) -> Result<Vec<DiffEntry>> {
1878    diff_index_to_worktree_with_options(
1879        odb,
1880        index,
1881        work_tree,
1882        DiffIndexToWorktreeOptions {
1883            ignore_submodule_untracked,
1884            simplify_gitlinks,
1885            ..DiffIndexToWorktreeOptions::default()
1886        },
1887    )
1888}
1889
1890/// Additional inputs for [`diff_index_to_worktree_with_options`].
1891#[derive(Debug, Clone, Copy, Default)]
1892pub struct DiffIndexToWorktreeOptions {
1893    /// Optional index mtime pair `(sec, nsec)` sampled when the index was read.
1894    ///
1895    /// When provided, entries with matching stat data are still considered dirty candidates if
1896    /// their recorded mtime is "racy" (at or after this timestamp), matching Git's
1897    /// `is_racy_timestamp` behavior.
1898    pub index_mtime: Option<(u32, u32)>,
1899    /// When true, gitlink entries are not dirty solely from untracked files inside the submodule.
1900    pub ignore_submodule_untracked: bool,
1901    /// When true, nested gitlink entries only compare the submodule checkout HEAD to the recorded OID.
1902    pub simplify_gitlinks: bool,
1903    /// When true, a populated gitlink checkout whose `.git` indirection cannot resolve to a HEAD
1904    /// is returned as an error instead of a normal modified gitlink.
1905    pub error_on_broken_gitlinks: bool,
1906}
1907
1908/// Compare the index against the working tree with optional racy-timestamp context.
1909///
1910/// This variant enables a stat-trust fast path: if an entry's stat tuple matches and the mode is
1911/// unchanged, the worktree blob hash is skipped unless the entry is racy relative to the supplied
1912/// index mtime.
1913///
1914/// # Parameters
1915///
1916/// - `odb` — object database (for hashing worktree files).
1917/// - `index` — the current index.
1918/// - `work_tree` — path to the working tree root.
1919/// - `options` — optional context for racy timestamp checks.
1920///
1921/// # Errors
1922///
1923/// Returns errors from I/O or hashing.
1924pub fn diff_index_to_worktree_with_options(
1925    odb: &Odb,
1926    index: &Index,
1927    work_tree: &Path,
1928    options: DiffIndexToWorktreeOptions,
1929) -> Result<Vec<DiffEntry>> {
1930    use crate::config::ConfigSet;
1931    use crate::crlf;
1932
1933    let ignore_submodule_untracked = options.ignore_submodule_untracked;
1934    let simplify_gitlinks = options.simplify_gitlinks;
1935
1936    let git_dir = work_tree.join(".git");
1937    let config = ConfigSet::load(Some(&git_dir), true).unwrap_or_else(|_| ConfigSet::new());
1938    let conv = crlf::ConversionConfig::from_config(&config);
1939    let attrs = crlf::load_gitattributes(work_tree);
1940
1941    let mut result = Vec::new();
1942    let mut unmerged_base: std::collections::BTreeMap<String, (u8, &IndexEntry)> =
1943        std::collections::BTreeMap::new();
1944
1945    for ie in &index.entries {
1946        if ie.stage() != 0 {
1947            let path = String::from_utf8_lossy(&ie.path).to_string();
1948            let rank = match ie.stage() {
1949                2 => 0u8,
1950                3 => 1u8,
1951                1 => 2u8,
1952                _ => 3u8,
1953            };
1954            match unmerged_base.get(&path) {
1955                Some((existing_rank, _)) if *existing_rank <= rank => {}
1956                _ => {
1957                    unmerged_base.insert(path, (rank, ie));
1958                }
1959            }
1960            continue;
1961        }
1962        // Sparse checkout: paths outside the cone are not expected on disk; `assume_unchanged`
1963        // is treated as clean without reading the filesystem (wt-status.c).
1964        if ie.skip_worktree() || ie.assume_unchanged() {
1965            continue;
1966        }
1967        // Use str slice directly to avoid allocation for path joining;
1968        // only allocate String if we need it for DiffEntry output.
1969        let path_str_ref = std::str::from_utf8(&ie.path).unwrap_or("");
1970        let is_intent_to_add = ie.intent_to_add();
1971
1972        // Gitlink entries (submodules): Git's `diff-index` reports `M` when the recorded
1973        // commit differs from the submodule checkout **or** when the submodule work tree is
1974        // dirty (staged/unstaged/untracked) even if HEAD still matches the gitlink. For the
1975        // latter case the "new" OID column is the null OID (see `git diff-index` / t7506).
1976        if ie.mode == 0o160000 {
1977            let sub_dir = work_tree.join(path_str_ref);
1978            let sub_head_oid = read_submodule_head_oid(&sub_dir);
1979            // A gitlink whose worktree directory is entirely absent is a deleted submodule. Git's
1980            // `check_removed` (diff-lib.c) `lstat`s the path first: a missing directory is reported
1981            // as a removal (`D`, new mode 000000), *before* the submodule "not checked out" special
1982            // case (which only applies when the directory exists). An empty directory that exists is
1983            // a placeholder and stays unchanged. Skipped when `simplify_gitlinks` so callers that
1984            // only compare recorded HEADs keep their behaviour. (t4060 #50/#51.)
1985            if !simplify_gitlinks && sub_head_oid.is_none() && !sub_dir.exists() {
1986                let path_owned = path_str_ref.to_owned();
1987                result.push(DiffEntry {
1988                    status: DiffStatus::Deleted,
1989                    old_path: Some(path_owned.clone()),
1990                    new_path: Some(path_owned),
1991                    old_mode: format_mode(ie.mode),
1992                    new_mode: "000000".to_owned(),
1993                    old_oid: ie.oid,
1994                    new_oid: zero_oid(),
1995                    score: None,
1996                });
1997                continue;
1998            }
1999            let ref_matches = if let Some(oid) = sub_head_oid {
2000                oid == ie.oid
2001            } else {
2002                let is_placeholder = submodule_worktree_is_unpopulated_placeholder(&sub_dir);
2003                if options.error_on_broken_gitlinks
2004                    && !is_placeholder
2005                    && submodule_embedded_git_dir(&sub_dir).is_some()
2006                {
2007                    return Err(Error::ConfigError(format!(
2008                        "could not read submodule HEAD for '{path_str_ref}'"
2009                    )));
2010                }
2011                is_placeholder
2012            };
2013            if simplify_gitlinks {
2014                if !ref_matches {
2015                    let path_owned = path_str_ref.to_owned();
2016                    let new_oid = sub_head_oid.unwrap_or_else(zero_oid);
2017                    result.push(DiffEntry {
2018                        status: DiffStatus::Modified,
2019                        old_path: Some(path_owned.clone()),
2020                        new_path: Some(path_owned),
2021                        old_mode: format_mode(ie.mode),
2022                        new_mode: format_mode(ie.mode),
2023                        old_oid: ie.oid,
2024                        new_oid,
2025                        score: None,
2026                    });
2027                }
2028                continue;
2029            }
2030            // A populated submodule whose HEAD points at a commit object that is missing from its
2031            // own object store is corrupt. Git's `is_submodule_modified` shells `git status` into
2032            // the submodule, which fails, and Git aborts the surrounding status/diff. Mirror that:
2033            // a broken submodule is a hard error rather than a silently-clean gitlink (t5526 #38).
2034            if sub_head_oid.is_some() && submodule_head_object_broken(&sub_dir) {
2035                return Err(Error::ConfigError(format!(
2036                    "'git status --porcelain=2' failed in submodule {path_str_ref}"
2037                )));
2038            }
2039            let mut flags = submodule_porcelain_flags(work_tree, path_str_ref, ie.oid);
2040            if ignore_submodule_untracked {
2041                flags.untracked = false;
2042            }
2043            let inner_dirty = flags.modified || flags.untracked;
2044            if !ref_matches || inner_dirty {
2045                let path_owned = path_str_ref.to_owned();
2046                let new_oid = if !ref_matches {
2047                    sub_head_oid.unwrap_or_else(zero_oid)
2048                } else {
2049                    zero_oid()
2050                };
2051                result.push(DiffEntry {
2052                    status: DiffStatus::Modified,
2053                    old_path: Some(path_owned.clone()),
2054                    new_path: Some(path_owned),
2055                    old_mode: format_mode(ie.mode),
2056                    new_mode: format_mode(ie.mode),
2057                    old_oid: ie.oid,
2058                    new_oid,
2059                    score: None,
2060                });
2061            }
2062            continue;
2063        }
2064
2065        let file_path = work_tree.join(path_str_ref);
2066
2067        if is_intent_to_add {
2068            match fs::symlink_metadata(&file_path) {
2069                Ok(meta) => {
2070                    let file_attrs = crlf::get_file_attrs(&attrs, path_str_ref, false, &config);
2071                    let worktree_oid = hash_worktree_file(
2072                        odb,
2073                        &file_path,
2074                        &meta,
2075                        &conv,
2076                        &file_attrs,
2077                        path_str_ref,
2078                        None,
2079                    )?;
2080                    let worktree_mode = mode_from_metadata(&meta);
2081                    result.push(DiffEntry {
2082                        status: DiffStatus::Added,
2083                        old_path: None,
2084                        new_path: Some(path_str_ref.to_owned()),
2085                        old_mode: "000000".to_owned(),
2086                        new_mode: format_mode(worktree_mode),
2087                        // `ita_invisible_in_index`: null OID on the index side for patch output
2088                        // (`index 0000000..`, t2203); index entry still stores the empty blob.
2089                        old_oid: zero_oid(),
2090                        new_oid: worktree_oid,
2091                        score: None,
2092                    });
2093                }
2094                Err(e)
2095                    if e.kind() == std::io::ErrorKind::NotFound
2096                        || e.raw_os_error() == Some(20) /* ENOTDIR */ =>
2097                {
2098                    result.push(DiffEntry {
2099                        status: DiffStatus::Deleted,
2100                        old_path: Some(path_str_ref.to_owned()),
2101                        new_path: None,
2102                        old_mode: format_mode(ie.mode),
2103                        new_mode: "000000".to_owned(),
2104                        old_oid: ie.oid,
2105                        new_oid: zero_oid(),
2106                        score: None,
2107                    });
2108                }
2109                Err(e) => return Err(Error::Io(e)),
2110            }
2111            continue;
2112        }
2113
2114        // If any parent component of the path is a symlink, the file is effectively
2115        // deleted from the working tree (a symlink replaced a directory).
2116        if has_symlink_in_path(work_tree, path_str_ref) {
2117            result.push(DiffEntry {
2118                status: DiffStatus::Deleted,
2119                old_path: Some(path_str_ref.to_owned()),
2120                new_path: None,
2121                old_mode: format_mode(ie.mode),
2122                new_mode: "000000".to_owned(),
2123                old_oid: ie.oid,
2124                new_oid: zero_oid(),
2125                score: None,
2126            });
2127            continue;
2128        }
2129
2130        match fs::symlink_metadata(&file_path) {
2131            Ok(meta) if meta.is_dir() => {
2132                // A directory exists where the index expects a file. A populated submodule
2133                // checkout (`.git` present) is a blob→gitlink typechange with the submodule HEAD on
2134                // the new side (raw output re-zeros it); otherwise the indexed file is effectively
2135                // deleted. See t4041/t4060 #13.
2136                if file_path.join(".git").exists() {
2137                    let head = read_submodule_head_oid(&file_path).unwrap_or_else(zero_oid);
2138                    let path_owned = path_str_ref.to_owned();
2139                    result.push(DiffEntry {
2140                        status: DiffStatus::TypeChanged,
2141                        old_path: Some(path_owned.clone()),
2142                        new_path: Some(path_owned),
2143                        old_mode: format_mode(ie.mode),
2144                        new_mode: format_mode(0o160000),
2145                        old_oid: ie.oid,
2146                        new_oid: head,
2147                        score: None,
2148                    });
2149                    continue;
2150                }
2151                result.push(DiffEntry {
2152                    status: DiffStatus::Deleted,
2153                    old_path: Some(path_str_ref.to_owned()),
2154                    new_path: None,
2155                    old_mode: format_mode(ie.mode),
2156                    new_mode: String::new(),
2157                    old_oid: ie.oid,
2158                    new_oid: zero_oid(),
2159                    score: None,
2160                });
2161            }
2162            Ok(meta) => {
2163                let worktree_mode = mode_from_metadata(&meta);
2164                let stat_same = stat_matches(ie, &meta);
2165                // Mode-only change: stat still matches the index entry but executable bit differs.
2166                if stat_same && worktree_mode != ie.mode {
2167                    let path_owned = path_str_ref.to_owned();
2168                    result.push(DiffEntry {
2169                        status: DiffStatus::Modified,
2170                        old_path: Some(path_owned.clone()),
2171                        new_path: Some(path_owned),
2172                        old_mode: format_mode(ie.mode),
2173                        new_mode: format_mode(worktree_mode),
2174                        old_oid: ie.oid,
2175                        new_oid: ie.oid,
2176                        score: None,
2177                    });
2178                    continue;
2179                }
2180
2181                // Fast path: unchanged stat + unchanged mode + non-racy timestamp means this entry
2182                // is clean without re-hashing blob data.
2183                if stat_same && worktree_mode == ie.mode && !entry_is_racy(ie, options.index_mtime) {
2184                    continue;
2185                }
2186
2187                // Hash the worktree blob for uncertain/racy entries.
2188                let file_attrs = crlf::get_file_attrs(&attrs, path_str_ref, false, &config);
2189                let worktree_oid = hash_worktree_file(
2190                    odb,
2191                    &file_path,
2192                    &meta,
2193                    &conv,
2194                    &file_attrs,
2195                    path_str_ref,
2196                    Some(ie),
2197                )?;
2198
2199                // If clean conversion disagrees with the index but raw bytes match the
2200                // blob (e.g. mixed line endings committed with autocrlf off), Git reports
2201                // no diff (t0020: touch + git diff --exit-code).
2202                let mut eff_oid = worktree_oid;
2203                if eff_oid != ie.oid {
2204                    if let Ok(raw) = fs::read(&file_path) {
2205                        let raw_oid = Odb::hash_object_data(ObjectKind::Blob, &raw);
2206                        if raw_oid == ie.oid {
2207                            eff_oid = ie.oid;
2208                        }
2209                    }
2210                }
2211
2212                if eff_oid != ie.oid || worktree_mode != ie.mode {
2213                    let path_owned = path_str_ref.to_owned();
2214                    result.push(DiffEntry {
2215                        status: DiffStatus::Modified,
2216                        old_path: Some(path_owned.clone()),
2217                        new_path: Some(path_owned),
2218                        old_mode: format_mode(ie.mode),
2219                        new_mode: format_mode(worktree_mode),
2220                        old_oid: ie.oid,
2221                        new_oid: eff_oid,
2222                    score: None,
2223                    });
2224                }
2225            }
2226            Err(e) if e.kind() == std::io::ErrorKind::NotFound
2227                || e.raw_os_error() == Some(20) /* ENOTDIR */ => {
2228                // File deleted from working tree (or parent replaced by a file)
2229                result.push(DiffEntry {
2230                    status: DiffStatus::Deleted,
2231                    old_path: Some(path_str_ref.to_owned()),
2232                    new_path: None,
2233                    old_mode: format_mode(ie.mode),
2234                    new_mode: "000000".to_owned(),
2235                    old_oid: ie.oid,
2236                    new_oid: zero_oid(),
2237                    score: None,
2238                });
2239            }
2240            Err(e) => return Err(Error::Io(e)),
2241        }
2242    }
2243
2244    for (path, (_, base_entry)) in unmerged_base {
2245        let file_path = work_tree.join(&path);
2246        let wt_meta = match fs::symlink_metadata(&file_path) {
2247            Ok(meta) => Some(meta),
2248            Err(e)
2249                if e.kind() == std::io::ErrorKind::NotFound
2250                    || e.raw_os_error() == Some(20) /* ENOTDIR */ =>
2251            {
2252                None
2253            }
2254            Err(e) => return Err(Error::Io(e)),
2255        };
2256
2257        let new_mode = wt_meta.as_ref().map_or_else(
2258            || "000000".to_owned(),
2259            |meta| format_mode(mode_from_metadata(meta)),
2260        );
2261        result.push(DiffEntry {
2262            status: DiffStatus::Unmerged,
2263            old_path: Some(path.clone()),
2264            new_path: Some(path.clone()),
2265            old_mode: "000000".to_owned(),
2266            new_mode,
2267            old_oid: zero_oid(),
2268            new_oid: zero_oid(),
2269            score: None,
2270        });
2271
2272        if let Some(meta) = wt_meta {
2273            let file_attrs = crlf::get_file_attrs(&attrs, &path, false, &config);
2274            let wt_oid = hash_worktree_file(
2275                odb,
2276                &file_path,
2277                &meta,
2278                &conv,
2279                &file_attrs,
2280                &path,
2281                Some(base_entry),
2282            )?;
2283            let wt_mode = mode_from_metadata(&meta);
2284            if wt_oid != base_entry.oid || wt_mode != base_entry.mode {
2285                result.push(DiffEntry {
2286                    status: DiffStatus::Modified,
2287                    old_path: Some(path.clone()),
2288                    new_path: Some(path),
2289                    old_mode: format_mode(base_entry.mode),
2290                    new_mode: format_mode(wt_mode),
2291                    old_oid: base_entry.oid,
2292                    new_oid: wt_oid,
2293                    score: None,
2294                });
2295            }
2296        }
2297    }
2298
2299    Ok(result)
2300}
2301
2302fn entry_is_racy(ie: &IndexEntry, index_mtime: Option<(u32, u32)>) -> bool {
2303    let Some((index_mtime_sec, index_mtime_nsec)) = index_mtime else {
2304        return false;
2305    };
2306    if index_mtime_sec == 0 {
2307        return false;
2308    }
2309    index_mtime_sec < ie.mtime_sec
2310        || (index_mtime_sec == ie.mtime_sec && index_mtime_nsec <= ie.mtime_nsec)
2311}
2312
2313/// Quick stat check: does the index entry's cached stat data match the file?
2314/// Returns true when the file at `ie`'s path differs from the index entry (mode or blob).
2315///
2316/// Used by commands such as `git mv` to detect "dirty" paths under sparse checkout.
2317/// Symlinks and submodules are compared in a Git-compatible way.
2318///
2319/// `ignore_submodule_untracked` mirrors [`diff_index_to_worktree`]'s same flag for gitlinks.
2320pub fn worktree_differs_from_index_entry(
2321    odb: &Odb,
2322    work_tree: &Path,
2323    ie: &IndexEntry,
2324    ignore_submodule_untracked: bool,
2325) -> Result<bool> {
2326    use crate::config::ConfigSet;
2327    use crate::crlf;
2328
2329    let path_str_ref = std::str::from_utf8(&ie.path).unwrap_or("");
2330    let file_path = work_tree.join(path_str_ref);
2331
2332    if ie.mode == 0o160000 {
2333        let sub_head_oid = read_submodule_head(&file_path);
2334        let ref_matches = match sub_head_oid {
2335            Some(oid) => oid == ie.oid,
2336            None => submodule_worktree_is_unpopulated_placeholder(&file_path),
2337        };
2338        let mut flags = submodule_porcelain_flags(work_tree, path_str_ref, ie.oid);
2339        if ignore_submodule_untracked {
2340            flags.untracked = false;
2341        }
2342        return Ok(!ref_matches || flags.modified || flags.untracked);
2343    }
2344
2345    let meta = match fs::symlink_metadata(&file_path) {
2346        Ok(m) => m,
2347        Err(e)
2348            if e.kind() == std::io::ErrorKind::NotFound
2349                || e.raw_os_error() == Some(20) /* ENOTDIR */ =>
2350        {
2351            return Ok(true);
2352        }
2353        Err(e) => return Err(Error::Io(e)),
2354    };
2355
2356    if meta.is_dir() {
2357        return Ok(true);
2358    }
2359
2360    let worktree_mode = mode_from_metadata(&meta);
2361    if worktree_mode != ie.mode {
2362        return Ok(true);
2363    }
2364
2365    let git_dir = work_tree.join(".git");
2366    let config = ConfigSet::load(Some(&git_dir), true).unwrap_or_else(|_| ConfigSet::new());
2367    let conv = crlf::ConversionConfig::from_config(&config);
2368    let attrs = crlf::load_gitattributes(work_tree);
2369    let file_attrs = crlf::get_file_attrs(&attrs, path_str_ref, false, &config);
2370    let worktree_oid = hash_worktree_file(
2371        odb,
2372        &file_path,
2373        &meta,
2374        &conv,
2375        &file_attrs,
2376        path_str_ref,
2377        Some(ie),
2378    )?;
2379
2380    let mut eff_oid = worktree_oid;
2381    if eff_oid != ie.oid {
2382        if let Ok(raw) = fs::read(&file_path) {
2383            let raw_oid = Odb::hash_object_data(ObjectKind::Blob, &raw);
2384            if raw_oid == ie.oid {
2385                eff_oid = ie.oid;
2386            }
2387        }
2388    }
2389
2390    Ok(eff_oid != ie.oid)
2391}
2392
2393pub fn stat_matches(ie: &IndexEntry, meta: &fs::Metadata) -> bool {
2394    // Compare size
2395    if meta.len() as u32 != ie.size {
2396        return false;
2397    }
2398    #[cfg(unix)]
2399    {
2400        use std::os::unix::fs::MetadataExt;
2401        // Compare mtime (seconds + nanoseconds)
2402        if meta.mtime() as u32 != ie.mtime_sec {
2403            return false;
2404        }
2405        if meta.mtime_nsec() as u32 != ie.mtime_nsec {
2406            return false;
2407        }
2408        // Compare ctime (seconds + nanoseconds)
2409        if meta.ctime() as u32 != ie.ctime_sec {
2410            return false;
2411        }
2412        if meta.ctime_nsec() as u32 != ie.ctime_nsec {
2413            return false;
2414        }
2415        // Compare inode and device
2416        if meta.ino() as u32 != ie.ino {
2417            return false;
2418        }
2419        if meta.dev() as u32 != ie.dev {
2420            return false;
2421        }
2422    }
2423    #[cfg(not(unix))]
2424    {
2425        use std::time::UNIX_EPOCH;
2426        if let Ok(mtime) = meta.modified() {
2427            if let Ok(dur) = mtime.duration_since(UNIX_EPOCH) {
2428                if dur.as_secs() as u32 != ie.mtime_sec {
2429                    return false;
2430                }
2431                if dur.subsec_nanos() != ie.mtime_nsec {
2432                    return false;
2433                }
2434            }
2435        }
2436    }
2437    true
2438}
2439
2440/// Refresh cached stat data for stage-0 file/symlink entries whose worktree content still matches
2441/// the recorded OID but whose on-disk stat went stale.
2442///
2443/// This mirrors Git's `refresh_index` / `refresh_cache_ent`: an entry is only marked clean (stat
2444/// adopted from the worktree) after its content is re-verified against the index OID. A genuinely
2445/// modified entry keeps its stale stat so `diff-files` / `status` continue to report it. Operations
2446/// that rewrite the worktree (`status`, `reset --mixed`, `stash`) call this before writing the
2447/// index so a subsequent `git diff-files` sees refreshed entries as clean.
2448///
2449/// Gitlinks, sparse (`skip_worktree`), `assume_unchanged` and intent-to-add entries are skipped.
2450/// The blob comparison is a raw-content hash, so a CRLF-smudged match is conservatively missed
2451/// (the entry simply stays stat-dirty and is re-hashed next time — never the reverse).
2452///
2453/// `index_mtime` is the on-disk index file's `(mtime_sec, mtime_nsec)` (see
2454/// `entry_is_racy` / Git `is_racy_timestamp`); pass `None` when unknown — racy detection is
2455/// then skipped, which is conservative for tree-built indexes whose zeroed stat never matches.
2456///
2457/// Returns `true` when at least one entry was refreshed or invalidated, so callers can write
2458/// the index opportunistically (Git only persists a refresh that changed something).
2459pub fn refresh_index_stat_content_verified(
2460    index: &mut Index,
2461    work_tree: &Path,
2462    index_mtime: Option<(u32, u32)>,
2463) -> bool {
2464    use crate::index::{MODE_EXECUTABLE, MODE_REGULAR, MODE_SYMLINK};
2465    let mut changed = false;
2466    for ie in &mut index.entries {
2467        if ie.stage() != 0 || ie.skip_worktree() || ie.assume_unchanged() || ie.intent_to_add() {
2468            continue;
2469        }
2470        if ie.mode != MODE_REGULAR && ie.mode != MODE_EXECUTABLE && ie.mode != MODE_SYMLINK {
2471            continue;
2472        }
2473        let Ok(path) = std::str::from_utf8(&ie.path) else {
2474            continue;
2475        };
2476        let abs = work_tree.join(path);
2477        let Ok(meta) = fs::symlink_metadata(&abs) else {
2478            continue;
2479        };
2480        if stat_matches(ie, &meta) {
2481            // Git `ie_match_stat`: a clean stat is trusted without reading the file unless the
2482            // entry is racy (written within the index's own mtime). Only then re-verify content;
2483            // stat can be refreshed from the work tree without matching the indexed blob (e.g.
2484            // after merge stat refresh while local edits remain) — invalidate so diff/status
2485            // re-hash.
2486            if entry_is_racy(ie, index_mtime)
2487                && !worktree_content_matches_index_oid(ie, &abs, &meta)
2488            {
2489                invalidate_index_stat_cache(ie);
2490                changed = true;
2491            }
2492            continue;
2493        }
2494        if !worktree_content_matches_index_oid(ie, &abs, &meta) {
2495            continue;
2496        }
2497        let refreshed = crate::index::entry_from_metadata(&meta, &ie.path, ie.oid, ie.mode);
2498        ie.ctime_sec = refreshed.ctime_sec;
2499        ie.ctime_nsec = refreshed.ctime_nsec;
2500        ie.mtime_sec = refreshed.mtime_sec;
2501        ie.mtime_nsec = refreshed.mtime_nsec;
2502        ie.dev = refreshed.dev;
2503        ie.ino = refreshed.ino;
2504        ie.uid = refreshed.uid;
2505        ie.gid = refreshed.gid;
2506        ie.size = refreshed.size;
2507        changed = true;
2508    }
2509    changed
2510}
2511
2512/// Whether the work tree blob at `abs` matches the index entry OID (raw bytes, no CRLF smudge).
2513fn worktree_content_matches_index_oid(ie: &IndexEntry, abs: &Path, meta: &fs::Metadata) -> bool {
2514    use crate::index::{MODE_EXECUTABLE, MODE_REGULAR, MODE_SYMLINK};
2515    if ie.mode == MODE_SYMLINK {
2516        if !meta.file_type().is_symlink() {
2517            return false;
2518        }
2519        use std::os::unix::ffi::OsStrExt as _;
2520        fs::read_link(abs)
2521            .map(|t| Odb::hash_object_data(ObjectKind::Blob, t.as_os_str().as_bytes()) == ie.oid)
2522            .unwrap_or(false)
2523    } else if ie.mode == MODE_REGULAR || ie.mode == MODE_EXECUTABLE {
2524        if !meta.file_type().is_file() {
2525            return false;
2526        }
2527        fs::read(abs)
2528            .map(|bytes| Odb::hash_object_data(ObjectKind::Blob, &bytes) == ie.oid)
2529            .unwrap_or(false)
2530    } else {
2531        false
2532    }
2533}
2534
2535/// Clear cached stat fields so the next diff/status pass re-reads the work tree.
2536fn invalidate_index_stat_cache(ie: &mut IndexEntry) {
2537    ie.ctime_sec = 0;
2538    ie.ctime_nsec = 0;
2539    ie.mtime_sec = 0;
2540    ie.mtime_nsec = 0;
2541    ie.dev = 0;
2542    ie.ino = 0;
2543    ie.size = 0;
2544}
2545
2546/// Hash a working tree file as a blob to get its OID.
2547/// Check if any parent component of `rel_path` (relative to `work_tree`) is a symlink.
2548fn has_symlink_in_path(work_tree: &Path, rel_path: &str) -> bool {
2549    let mut check = work_tree.to_path_buf();
2550    let components: Vec<&str> = rel_path.split('/').collect();
2551    // Check all components except the last one (which is the file itself)
2552    for component in &components[..components.len().saturating_sub(1)] {
2553        check.push(component);
2554        match fs::symlink_metadata(&check) {
2555            Ok(meta) if meta.file_type().is_symlink() => return true,
2556            _ => {}
2557        }
2558    }
2559    false
2560}
2561
2562pub fn hash_worktree_file(
2563    odb: &Odb,
2564    path: &Path,
2565    meta: &fs::Metadata,
2566    conv: &crate::crlf::ConversionConfig,
2567    file_attrs: &crate::crlf::FileAttrs,
2568    rel_path: &str,
2569    index_entry: Option<&IndexEntry>,
2570) -> Result<ObjectId> {
2571    let prior_blob: Option<Vec<u8>> = index_entry
2572        .filter(|e| e.oid != zero_oid())
2573        .and_then(|e| odb.read(&e.oid).ok().map(|o| o.data));
2574    let data = if meta.file_type().is_symlink() {
2575        // For symlinks, hash the target path
2576        let target = fs::read_link(path)?;
2577        target.to_string_lossy().into_owned().into_bytes()
2578    } else if meta.is_dir() {
2579        // `read()` on a directory fails with EISDIR; unmerged paths may leave an empty
2580        // placeholder directory (e.g. t4027 combined submodule conflict).
2581        Vec::new()
2582    } else {
2583        let raw = fs::read(path)?;
2584        // Apply clean conversion (CRLF→LF) so hash matches index blob.
2585        // Do not run safecrlf here: diff/commit use this for hashing and must not print warnings.
2586        let opts = crate::crlf::ConvertToGitOpts {
2587            index_blob: prior_blob.as_deref(),
2588            renormalize: false,
2589            check_safecrlf: false,
2590        };
2591        crate::crlf::convert_to_git_with_opts(&raw, rel_path, conv, file_attrs, opts).unwrap_or(raw)
2592    };
2593
2594    Ok(Odb::hash_object_data(ObjectKind::Blob, &data))
2595}
2596
2597/// Derive a Git file mode from filesystem metadata.
2598pub fn mode_from_metadata(meta: &fs::Metadata) -> u32 {
2599    if meta.file_type().is_symlink() {
2600        0o120000
2601    } else {
2602        #[cfg(unix)]
2603        {
2604            if meta.mode() & 0o111 != 0 {
2605                return 0o100755;
2606            }
2607        }
2608        0o100644
2609    }
2610}
2611
2612/// Compare a tree against the working tree.
2613///
2614/// Shows changes from `tree_oid` to the current working directory state.
2615/// Files tracked in the index but not in the tree are shown as Added.
2616/// Files in the tree but missing from the working tree are shown as Deleted.
2617///
2618/// # Parameters
2619///
2620/// - `odb` — object database.
2621/// - `tree_oid` — the tree to compare against (`None` for empty tree).
2622/// - `work_tree` — path to the working tree root.
2623/// - `index` — current index (used to discover new tracked files not in tree).
2624///
2625/// # Errors
2626///
2627/// Returns errors from ODB reads or I/O.
2628pub fn diff_tree_to_worktree(
2629    odb: &Odb,
2630    tree_oid: Option<&ObjectId>,
2631    work_tree: &Path,
2632    index: &Index,
2633) -> Result<Vec<DiffEntry>> {
2634    use crate::config::ConfigSet;
2635    use crate::crlf;
2636
2637    let git_dir = work_tree.join(".git");
2638    let config = ConfigSet::load(Some(&git_dir), true).unwrap_or_else(|_| ConfigSet::new());
2639    let conv = crlf::ConversionConfig::from_config(&config);
2640    let attrs = crlf::load_gitattributes(work_tree);
2641
2642    // Flatten the tree into a BTreeMap keyed by path
2643    let tree_flat = match tree_oid {
2644        Some(oid) => flatten_tree(odb, oid, "")?,
2645        None => Vec::new(),
2646    };
2647    let tree_map: std::collections::BTreeMap<String, &FlatEntry> =
2648        tree_flat.iter().map(|e| (e.path.clone(), e)).collect();
2649
2650    // Build index lookup: path → &IndexEntry (stage 0 only)
2651    let mut index_entries: std::collections::BTreeMap<&[u8], &IndexEntry> =
2652        std::collections::BTreeMap::new();
2653    let mut index_paths: std::collections::BTreeSet<String> = std::collections::BTreeSet::new();
2654    let mut stage0_paths: std::collections::BTreeSet<Vec<u8>> = std::collections::BTreeSet::new();
2655    for ie in &index.entries {
2656        if ie.stage() != 0 {
2657            continue;
2658        }
2659        let path = String::from_utf8_lossy(&ie.path).to_string();
2660        index_entries.insert(&ie.path, ie);
2661        index_paths.insert(path);
2662        stage0_paths.insert(ie.path.clone());
2663    }
2664
2665    // Paths with only unmerged stages (1–3) and no stage 0 — `git diff <rev>` must still list them
2666    // so combined `diff --cc` conflict hunks can be emitted (`t4108-apply-threeway`).
2667    let mut unmerged_only_paths: std::collections::BTreeSet<String> =
2668        std::collections::BTreeSet::new();
2669    for ie in &index.entries {
2670        if !(1..=3).contains(&ie.stage()) {
2671            continue;
2672        }
2673        if stage0_paths.contains(&ie.path) {
2674            continue;
2675        }
2676        unmerged_only_paths.insert(String::from_utf8_lossy(&ie.path).into_owned());
2677    }
2678
2679    // Union of tree paths + index paths
2680    let mut all_paths: std::collections::BTreeSet<String> = std::collections::BTreeSet::new();
2681    all_paths.extend(tree_map.keys().cloned());
2682    all_paths.extend(index_paths.iter().cloned());
2683    all_paths.extend(unmerged_only_paths.iter().cloned());
2684
2685    let mut result = Vec::new();
2686
2687    for path in &all_paths {
2688        if index_entries
2689            .get(path.as_bytes())
2690            .is_some_and(|ie| ie.skip_worktree())
2691        {
2692            // Sparse checkout: `git diff <rev>` does not report tree↔worktree drift for
2693            // skip-worktree paths (they are outside the sparse cone). Matches t7012 stash flow.
2694            continue;
2695        }
2696
2697        let tree_entry = tree_map.get(path.as_str());
2698
2699        // Gitlink entries (submodules) — compare HEAD commit, not file content.
2700        let is_gitlink = tree_entry.is_some_and(|te| te.mode == 0o160000)
2701            || index_entries
2702                .get(path.as_bytes())
2703                .is_some_and(|ie| ie.mode == 0o160000);
2704        if is_gitlink {
2705            let sub_dir = work_tree.join(path);
2706            let index_gitlink_oid = index_entries
2707                .get(path.as_bytes())
2708                .filter(|ie| ie.mode == 0o160000)
2709                .map(|ie| ie.oid);
2710            match (tree_entry, index_gitlink_oid) {
2711                (Some(te), _) => {
2712                    let sub_head = read_submodule_head_oid(&sub_dir);
2713                    // A gitlink whose worktree directory no longer exists is a deleted submodule:
2714                    // Git's `diff-lib.c` reports it as a deletion (status `D`, new mode 000000) and
2715                    // `--submodule` renders the `(submodule deleted)` summary (t4041 #46).
2716                    if sub_head.is_none() && !sub_dir.exists() {
2717                        result.push(DiffEntry {
2718                            status: DiffStatus::Deleted,
2719                            old_path: Some(path.clone()),
2720                            new_path: Some(path.clone()),
2721                            old_mode: format_mode(te.mode),
2722                            new_mode: "000000".to_string(),
2723                            old_oid: te.oid,
2724                            new_oid: zero_oid(),
2725                            score: None,
2726                        });
2727                        continue;
2728                    }
2729                    let index_matches_tree = index_gitlink_oid.is_some_and(|oid| oid == te.oid);
2730                    let head_differs = sub_head.as_ref() != Some(&te.oid);
2731                    let dirty_while_aligned = index_matches_tree
2732                        && !head_differs
2733                        && submodule_has_dirty_worktree_for_super_diff(work_tree, path, &te.oid);
2734                    if head_differs || dirty_while_aligned {
2735                        // Raw `git diff <tree>` lines use a null OID on the worktree side when the
2736                        // checked-out submodule HEAD differs from the tree's gitlink; patch output
2737                        // still resolves the real commit from the submodule directory.
2738                        let new_oid = if head_differs { zero_oid() } else { te.oid };
2739                        result.push(DiffEntry {
2740                            status: DiffStatus::Modified,
2741                            old_path: Some(path.clone()),
2742                            new_path: Some(path.clone()),
2743                            old_mode: format_mode(te.mode),
2744                            new_mode: format_mode(te.mode),
2745                            old_oid: te.oid,
2746                            new_oid,
2747                            score: None,
2748                        });
2749                    }
2750                }
2751                (None, Some(idx_oid)) => {
2752                    // Gitlink staged in the index but absent from the tree: a new submodule. Git
2753                    // reports it as an addition (status `A`) with the index gitlink on the new side,
2754                    // so `--submodule` renders `(new submodule)` (t4041 #46). The patch renderer
2755                    // resolves the real commit from the index OID / submodule HEAD.
2756                    let new_oid = read_submodule_head_oid(&sub_dir).unwrap_or(idx_oid);
2757                    result.push(DiffEntry {
2758                        status: DiffStatus::Added,
2759                        old_path: Some(path.clone()),
2760                        new_path: Some(path.clone()),
2761                        old_mode: "000000".to_string(),
2762                        new_mode: format_mode(0o160000),
2763                        old_oid: zero_oid(),
2764                        new_oid,
2765                        score: None,
2766                    });
2767                }
2768                (None, None) => {}
2769            }
2770            continue;
2771        }
2772
2773        let file_path = work_tree.join(path);
2774
2775        let wt_meta = match fs::symlink_metadata(&file_path) {
2776            Ok(m) => Some(m),
2777            Err(e) if e.kind() == std::io::ErrorKind::NotFound => None,
2778            Err(e) => return Err(Error::Io(e)),
2779        };
2780
2781        if unmerged_only_paths.contains(path) {
2782            if let (Some(te), Some(meta)) = (tree_entry, wt_meta.as_ref()) {
2783                let file_attrs = crlf::get_file_attrs(&attrs, path, false, &config);
2784                let wt_oid =
2785                    hash_worktree_file(odb, &file_path, meta, &conv, &file_attrs, path, None)?;
2786                let wt_mode = mode_from_metadata(meta);
2787                if wt_oid != te.oid || wt_mode != te.mode {
2788                    result.push(DiffEntry {
2789                        status: DiffStatus::Modified,
2790                        old_path: Some(path.clone()),
2791                        new_path: Some(path.clone()),
2792                        old_mode: format_mode(te.mode),
2793                        new_mode: format_mode(wt_mode),
2794                        old_oid: te.oid,
2795                        new_oid: wt_oid,
2796                        score: None,
2797                    });
2798                }
2799            }
2800            continue;
2801        }
2802
2803        match (tree_entry, wt_meta) {
2804            (Some(te), Some(ref meta)) => {
2805                let wt_mode = mode_from_metadata(meta);
2806                let Some(ie) = index_entries.get(path.as_bytes()) else {
2807                    continue;
2808                };
2809
2810                let index_matches_tree = ie.oid == te.oid && ie.mode == te.mode;
2811
2812                // Fully clean: index matches `HEAD`, worktree matches index, stat cache fresh.
2813                if index_matches_tree && wt_mode == te.mode && stat_matches(ie, meta) {
2814                    continue;
2815                }
2816
2817                let file_attrs = crlf::get_file_attrs(&attrs, path, false, &config);
2818                let idx_ent = index_entries.get(path.as_bytes()).copied();
2819
2820                // Staged mode (same blob as `HEAD`, different mode recorded in the index).
2821                if ie.oid == te.oid && ie.mode != te.mode {
2822                    result.push(DiffEntry {
2823                        status: DiffStatus::Modified,
2824                        old_path: Some(path.clone()),
2825                        new_path: Some(path.clone()),
2826                        old_mode: format_mode(te.mode),
2827                        new_mode: format_mode(ie.mode),
2828                        old_oid: te.oid,
2829                        new_oid: te.oid,
2830                        score: None,
2831                    });
2832                    continue;
2833                }
2834
2835                // Index still matches `HEAD`: only unstaged worktree drift (content and/or
2836                // worktree-only exec bit when `update-index` was not run — t4049 harness).
2837                if index_matches_tree {
2838                    let wt_oid = hash_worktree_file(
2839                        odb,
2840                        &file_path,
2841                        meta,
2842                        &conv,
2843                        &file_attrs,
2844                        path,
2845                        idx_ent,
2846                    )?;
2847                    let mut eff_oid = wt_oid;
2848                    if eff_oid != te.oid {
2849                        if let Ok(raw) = fs::read(&file_path) {
2850                            let raw_oid = Odb::hash_object_data(ObjectKind::Blob, &raw);
2851                            if raw_oid == te.oid {
2852                                eff_oid = te.oid;
2853                            }
2854                        }
2855                    }
2856                    if eff_oid != te.oid {
2857                        result.push(DiffEntry {
2858                            status: DiffStatus::Modified,
2859                            old_path: Some(path.clone()),
2860                            new_path: Some(path.clone()),
2861                            old_mode: format_mode(te.mode),
2862                            new_mode: format_mode(wt_mode),
2863                            old_oid: te.oid,
2864                            new_oid: eff_oid,
2865                            score: None,
2866                        });
2867                    } else if wt_mode != te.mode {
2868                        result.push(DiffEntry {
2869                            status: DiffStatus::Modified,
2870                            old_path: Some(path.clone()),
2871                            new_path: Some(path.clone()),
2872                            old_mode: format_mode(te.mode),
2873                            new_mode: format_mode(wt_mode),
2874                            old_oid: te.oid,
2875                            new_oid: te.oid,
2876                            score: None,
2877                        });
2878                    }
2879                    continue;
2880                }
2881
2882                // Staged content (and possibly mode): `git diff <rev>` is tree vs working tree.
2883                let wt_oid =
2884                    hash_worktree_file(odb, &file_path, meta, &conv, &file_attrs, path, idx_ent)?;
2885                let mut eff_oid = wt_oid;
2886                if eff_oid != te.oid {
2887                    if let Ok(raw) = fs::read(&file_path) {
2888                        let raw_oid = Odb::hash_object_data(ObjectKind::Blob, &raw);
2889                        if raw_oid == te.oid {
2890                            eff_oid = te.oid;
2891                        }
2892                    }
2893                }
2894                if eff_oid != te.oid || wt_mode != te.mode {
2895                    result.push(DiffEntry {
2896                        status: DiffStatus::Modified,
2897                        old_path: Some(path.clone()),
2898                        new_path: Some(path.clone()),
2899                        old_mode: format_mode(te.mode),
2900                        new_mode: format_mode(wt_mode),
2901                        old_oid: te.oid,
2902                        new_oid: eff_oid,
2903                        score: None,
2904                    });
2905                }
2906            }
2907            (Some(te), None) => {
2908                // In tree but missing from worktree
2909                result.push(DiffEntry {
2910                    status: DiffStatus::Deleted,
2911                    old_path: Some(path.clone()),
2912                    new_path: None,
2913                    old_mode: format_mode(te.mode),
2914                    new_mode: "000000".to_owned(),
2915                    old_oid: te.oid,
2916                    new_oid: zero_oid(),
2917                    score: None,
2918                });
2919            }
2920            (None, Some(ref meta)) => {
2921                // In index but not in tree, and exists in worktree
2922                let file_attrs = crlf::get_file_attrs(&attrs, path, false, &config);
2923                let wt_oid = hash_worktree_file(
2924                    odb,
2925                    &file_path,
2926                    meta,
2927                    &conv,
2928                    &file_attrs,
2929                    path,
2930                    index_entries.get(path.as_bytes()).copied(),
2931                )?;
2932                let wt_mode = mode_from_metadata(meta);
2933                result.push(DiffEntry {
2934                    status: DiffStatus::Added,
2935                    old_path: None,
2936                    new_path: Some(path.clone()),
2937                    old_mode: "000000".to_owned(),
2938                    new_mode: format_mode(wt_mode),
2939                    old_oid: zero_oid(),
2940                    new_oid: wt_oid,
2941                    score: None,
2942                });
2943            }
2944            (None, None) => {
2945                // Tracked in index but neither in tree nor worktree — skip
2946            }
2947        }
2948    }
2949
2950    result.sort_by(|a, b| a.path().cmp(b.path()));
2951    Ok(result)
2952}
2953
2954// ── Rename detection ────────────────────────────────────────────────
2955
2956fn read_added_entry_bytes(
2957    odb: &Odb,
2958    entry: &DiffEntry,
2959    work_root: Option<&Path>,
2960) -> Option<Vec<u8>> {
2961    if entry.new_oid != zero_oid() {
2962        return odb.read(&entry.new_oid).ok().map(|obj| obj.data);
2963    }
2964    let path = entry.new_path.as_deref()?;
2965    let root = work_root?;
2966    fs::read(root.join(path)).ok()
2967}
2968
2969fn modified_as_copy_from_sources(
2970    odb: &Odb,
2971    work_root: Option<&Path>,
2972    e: &DiffEntry,
2973    threshold: u32,
2974    sources: &[(String, ObjectId, bool)],
2975    source_contents: &[Option<Vec<u8>>],
2976    source_tree_entries: &[(String, String, ObjectId)],
2977) -> Option<DiffEntry> {
2978    fn regular_file_mode(mode: &str) -> bool {
2979        mode == "100644" || mode == "100755"
2980    }
2981
2982    if e.status != DiffStatus::Modified || !regular_file_mode(&e.new_mode) {
2983        return None;
2984    }
2985    let new_data = read_added_entry_bytes(odb, e, work_root)?;
2986    let new_oid_eff = if e.new_oid != zero_oid() {
2987        e.new_oid
2988    } else {
2989        Odb::hash_object_data(ObjectKind::Blob, &new_data)
2990    };
2991
2992    let mut best: Option<(usize, u32)> = None;
2993    for (si, (src_path, src_oid, is_deleted)) in sources.iter().enumerate() {
2994        if *is_deleted {
2995            continue;
2996        }
2997        if e.new_path.as_deref() == Some(src_path.as_str()) {
2998            continue;
2999        }
3000        let src_mode_str = source_tree_entries
3001            .iter()
3002            .find(|(p, _, _)| p == src_path)
3003            .map(|(_, m, _)| m.as_str())
3004            .unwrap_or("100644");
3005        if !regular_file_mode(src_mode_str) {
3006            continue;
3007        }
3008
3009        let score = if *src_oid == new_oid_eff {
3010            100
3011        } else {
3012            match (&source_contents[si], Some(new_data.as_slice())) {
3013                (Some(old_data), Some(nd)) => compute_similarity(old_data, nd),
3014                _ => 0,
3015            }
3016        };
3017        if score >= threshold {
3018            let replace = match best {
3019                None => true,
3020                Some((_, s)) => score > s,
3021            };
3022            if replace {
3023                best = Some((si, score));
3024            }
3025        }
3026    }
3027
3028    let (si, score) = best?;
3029    let (src_path, src_oid, _) = &sources[si];
3030    let src_mode = source_tree_entries
3031        .iter()
3032        .find(|(p, _, _)| p == src_path)
3033        .map(|(_, m, _)| m.clone())
3034        .unwrap_or_else(|| e.old_mode.clone());
3035
3036    Some(DiffEntry {
3037        status: DiffStatus::Copied,
3038        old_path: Some(src_path.clone()),
3039        new_path: e.new_path.clone(),
3040        old_mode: src_mode,
3041        new_mode: e.new_mode.clone(),
3042        old_oid: *src_oid,
3043        new_oid: e.new_oid,
3044        score: Some(score),
3045    })
3046}
3047
3048/// Detect renames by pairing Deleted and Added entries with similar content.
3049///
3050/// `threshold` is the minimum similarity percentage (0–100) for a pair to
3051/// be considered a rename (Git's default is 50%).  The function reads blob
3052/// content from the ODB to compute a line-level similarity score.
3053///
3054/// Exact-OID matches are always 100% similar regardless of content.
3055///
3056/// When `work_root` is set, added entries whose `new_oid` is the zero placeholder (as in
3057/// uncached `diff-index` when the work tree diverged from the index) load content from disk
3058/// under that root instead of the object database.
3059pub fn detect_renames(
3060    odb: &Odb,
3061    work_root: Option<&Path>,
3062    entries: Vec<DiffEntry>,
3063    threshold: u32,
3064) -> Vec<DiffEntry> {
3065    // Split entries into deleted, added, and others.
3066    let mut deleted: Vec<DiffEntry> = Vec::new();
3067    let mut added: Vec<DiffEntry> = Vec::new();
3068    let mut others: Vec<DiffEntry> = Vec::new();
3069
3070    for entry in entries {
3071        match entry.status {
3072            DiffStatus::Deleted => deleted.push(entry),
3073            DiffStatus::Added => added.push(entry),
3074            _ => others.push(entry),
3075        }
3076    }
3077
3078    if deleted.is_empty() || added.is_empty() {
3079        // Nothing to pair — return original order.
3080        let mut result = others;
3081        result.extend(deleted);
3082        result.extend(added);
3083        result.sort_by(|a, b| a.path().cmp(b.path()));
3084        return result;
3085    }
3086
3087    // Read content for all deleted blobs.
3088    let deleted_contents: Vec<Option<Vec<u8>>> = deleted
3089        .iter()
3090        .map(|d| odb.read(&d.old_oid).ok().map(|obj| obj.data))
3091        .collect();
3092
3093    // Read content for all added blobs.
3094    let added_contents: Vec<Option<Vec<u8>>> = added
3095        .iter()
3096        .map(|a| read_added_entry_bytes(odb, a, work_root))
3097        .collect();
3098
3099    // Build a matrix of similarity scores and find the best pairings.
3100    // We use a greedy approach: pick the highest-scoring pair first.
3101    let mut scores: Vec<(u32, usize, usize)> = Vec::new();
3102
3103    fn is_regularish_mode(mode: &str) -> bool {
3104        mode == "100644" || mode == "100755"
3105    }
3106
3107    fn same_path_same_blob(del: &DiffEntry, add: &DiffEntry) -> bool {
3108        del.old_path == add.new_path && del.old_oid == add.new_oid && del.old_mode == add.new_mode
3109    }
3110
3111    for (di, del) in deleted.iter().enumerate() {
3112        for (ai, add) in added.iter().enumerate() {
3113            // Exact OID match → 100%
3114            if del.old_oid == add.new_oid {
3115                scores.push((100, di, ai));
3116                continue;
3117            }
3118
3119            // Do not use line similarity across file types (e.g. regular ↔ symlink); Git keeps these
3120            // as separate changes (`t4008-diff-break-rewrite` #7).
3121            if !is_regularish_mode(&del.old_mode) || !is_regularish_mode(&add.new_mode) {
3122                continue;
3123            }
3124
3125            let score = match (&deleted_contents[di], &added_contents[ai]) {
3126                (Some(old_data), Some(new_data)) => compute_similarity(old_data, new_data),
3127                _ => 0,
3128            };
3129
3130            if score >= threshold {
3131                scores.push((score, di, ai));
3132            }
3133        }
3134    }
3135
3136    // Sort: prefer real path-changing pairs before same-path no-op pairs, then
3137    // same-basename pairs, then by score descending.
3138    // This matches Git's behavior where basename matches are checked first.
3139    scores.sort_by(|a, b| {
3140        let a_noop = same_path_same_blob(&deleted[a.1], &added[a.2]);
3141        let b_noop = same_path_same_blob(&deleted[b.1], &added[b.2]);
3142        let a_same = same_basename(&deleted[a.1], &added[a.2]);
3143        let b_same = same_basename(&deleted[b.1], &added[b.2]);
3144        a_noop
3145            .cmp(&b_noop)
3146            .then_with(|| b_same.cmp(&a_same))
3147            .then_with(|| b.0.cmp(&a.0))
3148    });
3149
3150    let mut used_deleted = vec![false; deleted.len()];
3151    let mut used_added = vec![false; added.len()];
3152    let mut renames: Vec<DiffEntry> = Vec::new();
3153
3154    for (score, di, ai) in &scores {
3155        if used_deleted[*di] || used_added[*ai] {
3156            continue;
3157        }
3158        used_deleted[*di] = true;
3159        used_added[*ai] = true;
3160
3161        let del = &deleted[*di];
3162        let add = &added[*ai];
3163
3164        // A "rename" whose source and destination are the same path with the
3165        // same blob is not a change at all (this arises with pathological
3166        // duplicate tree entries, t4058). Git pairs and then drops it, leaving
3167        // no diff entry; mirror that by skipping emission.
3168        if same_path_same_blob(del, add) {
3169            continue;
3170        }
3171
3172        renames.push(DiffEntry {
3173            status: DiffStatus::Renamed,
3174            old_path: del.old_path.clone(),
3175            new_path: add.new_path.clone(),
3176            old_mode: del.old_mode.clone(),
3177            new_mode: add.new_mode.clone(),
3178            old_oid: del.old_oid,
3179            new_oid: add.new_oid,
3180            score: Some(*score),
3181        });
3182    }
3183
3184    // Collect unmatched entries.
3185    let mut result = others;
3186    result.extend(renames);
3187    for (i, entry) in deleted.into_iter().enumerate() {
3188        if !used_deleted[i] {
3189            result.push(entry);
3190        }
3191    }
3192    for (i, entry) in added.into_iter().enumerate() {
3193        if !used_added[i] {
3194            result.push(entry);
3195        }
3196    }
3197
3198    result.sort_by(|a, b| a.path().cmp(b.path()));
3199    result
3200}
3201
3202/// Detect copies among diff entries.
3203///
3204/// This first runs rename detection (pairing Deleted+Added), then for any
3205/// remaining Added entries, looks for copy sources.
3206///
3207/// - `find_copies_harder` = false: only Modified entries are copy source candidates.
3208/// - `find_copies_harder` = true: also examine unmodified files from `source_tree_entries`.
3209///
3210/// `source_tree_entries` should be a list of (path, mode, oid) from the source tree;
3211/// used when `find_copies_harder` is true to consider unmodified files as copy sources.
3212pub fn detect_copies(
3213    odb: &Odb,
3214    work_root: Option<&Path>,
3215    entries: Vec<DiffEntry>,
3216    threshold: u32,
3217    find_copies_harder: bool,
3218    source_tree_entries: &[(String, String, ObjectId)],
3219) -> Vec<DiffEntry> {
3220    use std::collections::{HashMap, HashSet};
3221
3222    // Separate entries by status.
3223    let mut deleted: Vec<DiffEntry> = Vec::new();
3224    let mut added: Vec<DiffEntry> = Vec::new();
3225    let mut others: Vec<DiffEntry> = Vec::new();
3226
3227    for entry in entries {
3228        match entry.status {
3229            DiffStatus::Deleted => deleted.push(entry),
3230            DiffStatus::Added => added.push(entry),
3231            _ => others.push(entry),
3232        }
3233    }
3234
3235    // Build source candidates: deleted files, modified files, and optionally tree entries.
3236    // Track which sources are from deleted files (can become renames).
3237    let mut sources: Vec<(String, ObjectId, bool)> = Vec::new(); // (path, oid, is_deleted)
3238    let mut deleted_source_idx: HashMap<String, usize> = HashMap::new();
3239
3240    for entry in &deleted {
3241        if let Some(ref path) = entry.old_path {
3242            deleted_source_idx.insert(path.clone(), sources.len());
3243            sources.push((path.clone(), entry.old_oid, true));
3244        }
3245    }
3246
3247    // Modified and type-changed files are candidates for `-C` (e.g. symlink rewrite leaves the
3248    // old blob available as a copy source for another path; see `t4008-diff-break-rewrite`).
3249    for entry in &others {
3250        if matches!(entry.status, DiffStatus::Modified | DiffStatus::TypeChanged) {
3251            if let Some(ref old_path) = entry.old_path {
3252                if !sources.iter().any(|(p, _, _)| p == old_path) {
3253                    sources.push((old_path.clone(), entry.old_oid, false));
3254                }
3255            }
3256        }
3257    }
3258
3259    // With find_copies_harder, add all source tree entries.
3260    if find_copies_harder {
3261        for (path, _mode, oid) in source_tree_entries {
3262            if !sources.iter().any(|(p, _, _)| p == path) {
3263                sources.push((path.clone(), *oid, false));
3264            }
3265        }
3266    }
3267
3268    if sources.is_empty() {
3269        let mut result = others;
3270        result.extend(deleted);
3271        result.extend(added);
3272        result.sort_by(|a, b| a.path().cmp(b.path()));
3273        return result;
3274    }
3275
3276    // Read content for sources.
3277    let source_contents: Vec<Option<Vec<u8>>> = sources
3278        .iter()
3279        .map(|(_, oid, _)| odb.read(oid).ok().map(|obj| obj.data))
3280        .collect();
3281
3282    let mut result_entries: Vec<DiffEntry> = Vec::new();
3283    let mut renamed_deleted: HashSet<usize> = HashSet::new();
3284    let mut used_added2 = vec![false; added.len()];
3285
3286    if !added.is_empty() {
3287        // Read content for added blobs.
3288        let added_contents: Vec<Option<Vec<u8>>> = added
3289            .iter()
3290            .map(|a| read_added_entry_bytes(odb, a, work_root))
3291            .collect();
3292
3293        // Build score matrix: (score, source_idx, added_idx)
3294        let mut scores: Vec<(u32, usize, usize)> = Vec::new();
3295        for (si, (src_path, src_oid, _)) in sources.iter().enumerate() {
3296            for (ai, add) in added.iter().enumerate() {
3297                // Never pair a path with itself as copy source (matches Git; avoids
3298                // arbitrary tie-breaking when several sources share the same blob).
3299                if add.new_path.as_deref() == Some(src_path.as_str()) {
3300                    continue;
3301                }
3302                let add_oid = if add.new_oid != zero_oid() {
3303                    add.new_oid
3304                } else if let Some(ref data) = added_contents[ai] {
3305                    Odb::hash_object_data(ObjectKind::Blob, data)
3306                } else {
3307                    zero_oid()
3308                };
3309                if *src_oid == add_oid {
3310                    scores.push((100, si, ai));
3311                    continue;
3312                }
3313                let score = match (&source_contents[si], &added_contents[ai]) {
3314                    (Some(old_data), Some(new_data)) => compute_similarity(old_data, new_data),
3315                    _ => 0,
3316                };
3317                if score >= threshold {
3318                    scores.push((score, si, ai));
3319                }
3320            }
3321        }
3322
3323        // Sort by score descending.
3324        scores.sort_by(|a, b| b.0.cmp(&a.0));
3325
3326        // Build source->added mappings, each added file assigned to best source.
3327        let mut used_added = vec![false; added.len()];
3328        let mut source_to_added: HashMap<usize, Vec<(usize, u32)>> = HashMap::new();
3329        for &(score, si, ai) in &scores {
3330            if used_added[ai] {
3331                continue;
3332            }
3333            used_added[ai] = true;
3334            source_to_added.entry(si).or_default().push((ai, score));
3335        }
3336
3337        // For each deleted source, pick one assignment as Rename, rest as Copy.
3338        for (&si, assignments_for_src) in &source_to_added {
3339            let (_, _, is_deleted) = &sources[si];
3340            if *is_deleted && !assignments_for_src.is_empty() {
3341                // Pick the last one (by path) as the rename target.
3342                // Git tends to pick the rename as the last alphabetically.
3343                let rename_ai = assignments_for_src
3344                    .iter()
3345                    .max_by_key(|(ai, _score)| added[*ai].path().to_string())
3346                    .map(|(ai, _)| *ai);
3347
3348                for &(ai, score) in assignments_for_src {
3349                    let (ref src_path, _, _) = sources[si];
3350                    let add = &added[ai];
3351                    let src_mode = source_tree_entries
3352                        .iter()
3353                        .find(|(p, _, _)| p == src_path)
3354                        .map(|(_, m, _)| m.clone())
3355                        .unwrap_or_else(|| add.old_mode.clone());
3356
3357                    let is_rename = Some(ai) == rename_ai;
3358                    result_entries.push(DiffEntry {
3359                        status: if is_rename {
3360                            DiffStatus::Renamed
3361                        } else {
3362                            DiffStatus::Copied
3363                        },
3364                        old_path: Some(src_path.clone()),
3365                        new_path: add.new_path.clone(),
3366                        old_mode: src_mode,
3367                        new_mode: add.new_mode.clone(),
3368                        old_oid: sources[si].1,
3369                        new_oid: add.new_oid,
3370                        score: Some(score),
3371                    });
3372                    used_added2[ai] = true;
3373                }
3374                renamed_deleted.insert(si);
3375            } else {
3376                // Non-deleted source: all assignments are copies.
3377                for &(ai, score) in assignments_for_src {
3378                    let (ref src_path, _, _) = sources[si];
3379                    let add = &added[ai];
3380                    let src_mode = source_tree_entries
3381                        .iter()
3382                        .find(|(p, _, _)| p == src_path)
3383                        .map(|(_, m, _)| m.clone())
3384                        .unwrap_or_else(|| add.old_mode.clone());
3385
3386                    result_entries.push(DiffEntry {
3387                        status: DiffStatus::Copied,
3388                        old_path: Some(src_path.clone()),
3389                        new_path: add.new_path.clone(),
3390                        old_mode: src_mode,
3391                        new_mode: add.new_mode.clone(),
3392                        old_oid: sources[si].1,
3393                        new_oid: add.new_oid,
3394                        score: Some(score),
3395                    });
3396                    used_added2[ai] = true;
3397                }
3398            }
3399        }
3400    }
3401
3402    // Keep deleted entries that weren't consumed by a rename.
3403    for entry in deleted.into_iter() {
3404        if let Some(ref path) = entry.old_path {
3405            if let Some(&si) = deleted_source_idx.get(path) {
3406                if renamed_deleted.contains(&si) {
3407                    // This deletion was consumed by a rename; skip it.
3408                    continue;
3409                }
3410            }
3411        }
3412        result_entries.push(entry);
3413    }
3414
3415    let mut result = others;
3416    result.extend(result_entries);
3417    // Keep unmatched added entries.
3418    for (i, entry) in added.into_iter().enumerate() {
3419        if !used_added2[i] {
3420            result.push(entry);
3421        }
3422    }
3423
3424    let mut final_result = Vec::with_capacity(result.len());
3425    for e in result {
3426        if let Some(c) = modified_as_copy_from_sources(
3427            odb,
3428            work_root,
3429            &e,
3430            threshold,
3431            &sources,
3432            &source_contents,
3433            source_tree_entries,
3434        ) {
3435            final_result.push(c);
3436        } else {
3437            final_result.push(e);
3438        }
3439    }
3440
3441    final_result.sort_by(|a, b| a.path().cmp(b.path()));
3442    final_result
3443}
3444
3445/// Apply Git-style rename and optional copy detection for index↔worktree diffs.
3446///
3447/// When `copies` is true (Git `diff.renames` / `status.renames` set to `copy`/`copies`),
3448/// runs copy detection, which also pairs deleted sources with one rename and any additional
3449/// destinations as copies.
3450///
3451/// # Errors
3452///
3453/// Propagates errors from reading the `head_tree` object from `odb`.
3454pub fn status_apply_rename_copy_detection(
3455    odb: &Odb,
3456    unstaged_raw: Vec<DiffEntry>,
3457    threshold: u32,
3458    copies: bool,
3459    head_tree: Option<&ObjectId>,
3460) -> Result<Vec<DiffEntry>> {
3461    if !copies {
3462        return Ok(detect_renames(odb, None, unstaged_raw, threshold));
3463    }
3464    let source_tree_entries: Vec<(String, String, ObjectId)> = match head_tree {
3465        Some(oid) => flatten_tree(odb, oid, "")?
3466            .into_iter()
3467            .map(|e| (e.path, format_mode(e.mode), e.oid))
3468            .collect(),
3469        None => Vec::new(),
3470    };
3471    Ok(detect_copies(
3472        odb,
3473        None,
3474        unstaged_raw,
3475        threshold,
3476        false,
3477        &source_tree_entries,
3478    ))
3479}
3480
3481/// Format a rename pair using Git's compact path format.
3482///
3483/// Examples:
3484/// - `a/b/c` → `c/b/a` → `a/b/c => c/b/a`
3485/// - `c/b/a` → `c/d/e` → `c/{b/a => d/e}`
3486/// - `c/d/e` → `d/e` → `{c/d => d}/e`
3487/// - `d/e` → `d/f/e` → `d/{ => f}/e`
3488pub fn format_rename_path(old: &str, new: &str) -> String {
3489    let ob = old.as_bytes();
3490    let nb = new.as_bytes();
3491
3492    // Find common prefix length, snapped to '/' boundary.
3493    let pfx = {
3494        let mut last_sep = 0usize;
3495        let min_len = ob.len().min(nb.len());
3496        for i in 0..min_len {
3497            if ob[i] != nb[i] {
3498                break;
3499            }
3500            if ob[i] == b'/' {
3501                last_sep = i + 1;
3502            }
3503        }
3504        last_sep
3505    };
3506
3507    // Find common suffix length, snapped to '/' boundary.
3508    let mut sfx = {
3509        let mut last_sep = 0usize;
3510        let min_len = ob.len().min(nb.len());
3511        for i in 0..min_len {
3512            let oi = ob.len() - 1 - i;
3513            let ni = nb.len() - 1 - i;
3514            if ob[oi] != nb[ni] {
3515                break;
3516            }
3517            if ob[oi] == b'/' {
3518                last_sep = i + 1;
3519            }
3520        }
3521        last_sep
3522    };
3523
3524    // Suffix starts at this position in each string.
3525    let mut sfx_at_old = ob.len() - sfx;
3526    let mut sfx_at_new = nb.len() - sfx;
3527
3528    // If prefix and suffix overlap in both strings (both middles empty),
3529    // reduce the suffix so that at least the longer string has a non-empty middle.
3530    while pfx > sfx_at_old && pfx > sfx_at_new && sfx > 0 {
3531        // Reduce suffix by snapping to the next smaller '/' boundary.
3532        let suffix_bytes = &ob[sfx_at_old..];
3533        let mut new_sfx = 0;
3534        // Find the next '/' after sfx_at_old (i.e., reduce suffix).
3535        for (i, &b) in suffix_bytes.iter().enumerate().skip(1) {
3536            if b == b'/' {
3537                new_sfx = sfx - i;
3538                break;
3539            }
3540        }
3541        if new_sfx == 0 || new_sfx >= sfx {
3542            sfx_at_old = ob.len();
3543            sfx_at_new = nb.len();
3544            break;
3545        }
3546        sfx = new_sfx;
3547        sfx_at_old = ob.len() - sfx;
3548        sfx_at_new = nb.len() - sfx;
3549    }
3550
3551    // When prefix and suffix overlap in the shorter string, they share
3552    // the '/' boundary character. In the output format, the shared '/'
3553    // appears in both positions (e.g. "d/{ => f}/e" for d/e → d/f/e).
3554    // Compute the middle parts. When prefix and suffix overlap in a
3555    // string, the middle for that string is empty. The shared '/' shows
3556    // in both prefix (trailing) and suffix (leading) positions.
3557    let prefix = &old[..pfx];
3558    let suffix = &old[sfx_at_old..];
3559    let old_mid = if pfx <= sfx_at_old {
3560        &old[pfx..sfx_at_old]
3561    } else {
3562        ""
3563    };
3564    let new_mid = if pfx <= sfx_at_new {
3565        &new[pfx..sfx_at_new]
3566    } else {
3567        ""
3568    };
3569
3570    if prefix.is_empty() && suffix.is_empty() {
3571        return format!("{old} => {new}");
3572    }
3573
3574    format!("{prefix}{{{old_mid} => {new_mid}}}{suffix}")
3575}
3576
3577/// Check if two entries share the same filename (basename).
3578fn same_basename(del: &DiffEntry, add: &DiffEntry) -> bool {
3579    let old = del.old_path.as_deref().unwrap_or("");
3580    let new = add.new_path.as_deref().unwrap_or("");
3581    let old_base = old.rsplit('/').next().unwrap_or(old);
3582    let new_base = new.rsplit('/').next().unwrap_or(new);
3583    old_base == new_base && !old_base.is_empty()
3584}
3585
3586/// Compute a similarity percentage (0–100) between two byte slices.
3587///
3588/// Uses Git's approach: count the bytes that are "shared" (appear in
3589/// equal lines), then compute `score = shared_bytes * 2 * 100 / (src_size + dst_size)`.
3590fn compute_similarity(old: &[u8], new: &[u8]) -> u32 {
3591    // Normalize CRLF → LF before comparing so that files differing
3592    // only in line endings are detected as renames.
3593    let old_norm = crate::crlf::crlf_to_lf(old);
3594    let new_norm = crate::crlf::crlf_to_lf(new);
3595
3596    let src_size = old_norm.len();
3597    let dst_size = new_norm.len();
3598
3599    if src_size == 0 && dst_size == 0 {
3600        return 100;
3601    }
3602    let total = src_size + dst_size;
3603    if total == 0 {
3604        return 100;
3605    }
3606
3607    // Use line-level diff to find shared content, then count bytes.
3608    use similar::{ChangeTag, TextDiff};
3609    let old_str = String::from_utf8_lossy(&old_norm);
3610    let new_str = String::from_utf8_lossy(&new_norm);
3611    let diff = TextDiff::from_lines(&old_str as &str, &new_str as &str);
3612
3613    let mut shared_bytes = 0usize;
3614    for change in diff.iter_all_changes() {
3615        if change.tag() == ChangeTag::Equal {
3616            // Count bytes in the matching line (including newline).
3617            shared_bytes += change.value().len();
3618        }
3619    }
3620
3621    // Git: score = copied * MAX_SCORE / max(src_size, dst_size)
3622    // We normalize to 0-100.
3623    let max_size = src_size.max(dst_size);
3624
3625    ((shared_bytes * 100) / max_size).min(100) as u32
3626}
3627
3628/// Compute rename/copy similarity percentage (0–100) between two byte slices.
3629///
3630/// This uses the same scoring logic as internal rename detection.
3631#[must_use]
3632pub fn rename_similarity_score(old: &[u8], new: &[u8]) -> u32 {
3633    compute_similarity(old, new)
3634}
3635
3636// ── Output formatting ───────────────────────────────────────────────
3637
3638/// Format a diff entry in Git's raw diff format.
3639///
3640/// Example: `:100644 100644 abc1234... def5678... M\tfile.txt`
3641pub fn format_raw(entry: &DiffEntry) -> String {
3642    let path = match entry.status {
3643        DiffStatus::Renamed | DiffStatus::Copied => {
3644            format!(
3645                "{}\t{}",
3646                entry.old_path.as_deref().unwrap_or(""),
3647                entry.new_path.as_deref().unwrap_or("")
3648            )
3649        }
3650        _ => entry.path().to_owned(),
3651    };
3652
3653    let status_str = match (entry.status, entry.score) {
3654        (DiffStatus::Renamed, Some(s)) => format!("R{:03}", s),
3655        (DiffStatus::Copied, Some(s)) => format!("C{:03}", s),
3656        _ => entry.status.letter().to_string(),
3657    };
3658
3659    let (old_hex, new_hex) = raw_oid_hex_pair(&entry.old_oid, &entry.new_oid);
3660    format!(
3661        ":{} {} {} {} {}\t{}",
3662        entry.old_mode, entry.new_mode, old_hex, new_hex, status_str, path
3663    )
3664}
3665
3666/// Render a diff entry's `(old, new)` OIDs as full hex for `--raw` output,
3667/// widening any null OID to the repository's hash width.
3668///
3669/// Diff entries store null OIDs at SHA-1 width regardless of the repository
3670/// algorithm, but `git diff --raw` prints them at the real hash width (64
3671/// zeros in a SHA-256 repo). A diff entry never has both sides null, so the
3672/// width is taken from whichever side carries a real object.
3673fn raw_oid_hex_pair(old: &ObjectId, new: &ObjectId) -> (String, String) {
3674    let width = if !old.is_zero() {
3675        old.algo().hex_len()
3676    } else if !new.is_zero() {
3677        new.algo().hex_len()
3678    } else {
3679        old.algo().hex_len()
3680    };
3681    let render = |oid: &ObjectId| {
3682        if oid.is_zero() {
3683            "0".repeat(width)
3684        } else {
3685            oid.to_hex()
3686        }
3687    };
3688    (render(old), render(new))
3689}
3690
3691/// Format a diff entry with abbreviated OIDs.
3692pub fn format_raw_abbrev(entry: &DiffEntry, abbrev_len: usize) -> String {
3693    let ellipsis = if std::env::var("GIT_PRINT_SHA1_ELLIPSIS").ok().as_deref() == Some("yes") {
3694        "..."
3695    } else {
3696        ""
3697    };
3698    let old_hex = format!("{}", entry.old_oid);
3699    let new_hex = format!("{}", entry.new_oid);
3700    let old_abbrev = &old_hex[..abbrev_len.min(old_hex.len())];
3701    let new_abbrev = &new_hex[..abbrev_len.min(new_hex.len())];
3702
3703    // Renames/copies carry a similarity score and a `<old>\t<new>` path pair.
3704    let path = match entry.status {
3705        DiffStatus::Renamed | DiffStatus::Copied => format!(
3706            "{}\t{}",
3707            entry.old_path.as_deref().unwrap_or(""),
3708            entry.new_path.as_deref().unwrap_or("")
3709        ),
3710        _ => entry.path().to_owned(),
3711    };
3712    let status_str = match (entry.status, entry.score) {
3713        (DiffStatus::Renamed, Some(s)) => format!("R{s:03}"),
3714        (DiffStatus::Copied, Some(s)) => format!("C{s:03}"),
3715        _ => entry.status.letter().to_string(),
3716    };
3717
3718    format!(
3719        ":{} {} {}{} {}{} {}\t{}",
3720        entry.old_mode,
3721        entry.new_mode,
3722        old_abbrev,
3723        ellipsis,
3724        new_abbrev,
3725        ellipsis,
3726        status_str,
3727        path
3728    )
3729}
3730
3731/// Generate a unified diff patch for two blobs.
3732///
3733/// # Parameters
3734///
3735/// - `old_content` — the old file content (empty for added files).
3736/// - `new_content` — the new file content (empty for deleted files).
3737/// - `old_path` — display path for the old side.
3738/// - `new_path` — display path for the new side.
3739/// - `context_lines` — number of context lines around changes (default: 3).
3740/// - Inter-hunk context defaults to `0` (see [`unified_diff_with_prefix`]).
3741///
3742/// # Returns
3743///
3744/// The unified diff as a string.
3745pub fn unified_diff(
3746    old_content: &str,
3747    new_content: &str,
3748    old_path: &str,
3749    new_path: &str,
3750    context_lines: usize,
3751    indent_heuristic: bool,
3752    quote_path_fully: bool,
3753) -> String {
3754    unified_diff_with_prefix(
3755        old_content,
3756        new_content,
3757        old_path,
3758        new_path,
3759        context_lines,
3760        0,
3761        "a/",
3762        "b/",
3763        indent_heuristic,
3764        quote_path_fully,
3765    )
3766}
3767
3768/// Same as `unified_diff` but with configurable source/destination prefixes.
3769///
3770/// `inter_hunk_context` is Git's `--inter-hunk-context`: adjacent hunks merge when
3771/// the unchanged gap between them is at most `2 * context_lines + inter_hunk_context` lines.
3772#[allow(clippy::too_many_arguments)] // Mirrors Git-style unified diff parameters.
3773pub fn unified_diff_with_prefix(
3774    old_content: &str,
3775    new_content: &str,
3776    old_path: &str,
3777    new_path: &str,
3778    context_lines: usize,
3779    inter_hunk_context: usize,
3780    src_prefix: &str,
3781    dst_prefix: &str,
3782    indent_heuristic: bool,
3783    quote_path_fully: bool,
3784) -> String {
3785    unified_diff_with_prefix_and_funcname(
3786        old_content,
3787        new_content,
3788        old_path,
3789        new_path,
3790        context_lines,
3791        inter_hunk_context,
3792        src_prefix,
3793        dst_prefix,
3794        None,
3795        indent_heuristic,
3796        quote_path_fully,
3797    )
3798}
3799
3800/// Same as [`unified_diff_with_prefix`] with optional custom hunk-header
3801/// function-name matching.
3802#[allow(clippy::too_many_arguments)]
3803pub fn unified_diff_with_prefix_and_funcname(
3804    old_content: &str,
3805    new_content: &str,
3806    old_path: &str,
3807    new_path: &str,
3808    context_lines: usize,
3809    inter_hunk_context: usize,
3810    src_prefix: &str,
3811    dst_prefix: &str,
3812    funcname_matcher: Option<&FuncnameMatcher>,
3813    indent_heuristic: bool,
3814    quote_path_fully: bool,
3815) -> String {
3816    unified_diff_with_prefix_and_funcname_and_algorithm(
3817        old_content,
3818        new_content,
3819        old_path,
3820        new_path,
3821        context_lines,
3822        inter_hunk_context,
3823        src_prefix,
3824        dst_prefix,
3825        funcname_matcher,
3826        similar::Algorithm::Myers,
3827        false,
3828        false,
3829        indent_heuristic,
3830        quote_path_fully,
3831    )
3832}
3833
3834/// Same as [`unified_diff_with_prefix_and_funcname`] but allows callers to
3835/// choose the line diff algorithm used for hunk generation.
3836///
3837/// When `function_context` is true (`git diff -W`), hunks are expanded to
3838/// whole logical functions using the same rules as Git's `XDL_EMIT_FUNCCONTEXT`.
3839#[allow(clippy::too_many_arguments)]
3840pub fn unified_diff_with_prefix_and_funcname_and_algorithm(
3841    old_content: &str,
3842    new_content: &str,
3843    old_path: &str,
3844    new_path: &str,
3845    context_lines: usize,
3846    inter_hunk_context: usize,
3847    src_prefix: &str,
3848    dst_prefix: &str,
3849    funcname_matcher: Option<&FuncnameMatcher>,
3850    algorithm: similar::Algorithm,
3851    function_context: bool,
3852    use_git_histogram: bool,
3853    indent_heuristic: bool,
3854    quote_path_fully: bool,
3855) -> String {
3856    // `--function-context` (`-W`) expansion must apply regardless of the line
3857    // algorithm; the histogram body printer below does not do it, so route `-W`
3858    // through the function-context emitter first (t4015 #136).
3859    if function_context {
3860        return unified_diff_with_function_context(
3861            old_content,
3862            new_content,
3863            old_path,
3864            new_path,
3865            context_lines,
3866            inter_hunk_context,
3867            src_prefix,
3868            dst_prefix,
3869            funcname_matcher,
3870            algorithm,
3871            indent_heuristic,
3872            quote_path_fully,
3873        );
3874    }
3875
3876    if use_git_histogram {
3877        return unified_diff_histogram_with_prefix_and_funcname(
3878            old_content,
3879            new_content,
3880            old_path,
3881            new_path,
3882            context_lines,
3883            inter_hunk_context,
3884            src_prefix,
3885            dst_prefix,
3886            funcname_matcher,
3887            quote_path_fully,
3888        );
3889    }
3890
3891    use crate::quote_path::format_diff_path_with_prefix;
3892    use similar::{udiff::UnifiedDiffHunk, TextDiff};
3893
3894    let diff = TextDiff::configure()
3895        .algorithm(algorithm)
3896        .diff_lines(old_content, new_content);
3897    let compacted_ops = diff_indent_heuristic::diff_lines_ops_compacted(
3898        old_content,
3899        new_content,
3900        algorithm,
3901        indent_heuristic,
3902    );
3903
3904    let mut output = String::new();
3905    if old_path == "/dev/null" {
3906        output.push_str("--- /dev/null\n");
3907    } else if src_prefix.is_empty() {
3908        // Callers (e.g. `diff-tree`, `diff-index`) may pass a fully formatted token
3909        // (already includes `a/` and any C-style quoting).
3910        output.push_str(&format!("--- {old_path}\n"));
3911    } else {
3912        output.push_str("--- ");
3913        output.push_str(&format_diff_path_with_prefix(
3914            src_prefix,
3915            old_path,
3916            quote_path_fully,
3917        ));
3918        output.push('\n');
3919    }
3920    if new_path == "/dev/null" {
3921        output.push_str("+++ /dev/null\n");
3922    } else if dst_prefix.is_empty() {
3923        output.push_str(&format!("+++ {new_path}\n"));
3924    } else {
3925        output.push_str("+++ ");
3926        output.push_str(&format_diff_path_with_prefix(
3927            dst_prefix,
3928            new_path,
3929            quote_path_fully,
3930        ));
3931        output.push('\n');
3932    }
3933
3934    let old_lines: Vec<&str> = old_content.lines().collect();
3935
3936    // Git's xdiff merges adjacent changes while the gap between them in the old file is at most
3937    // `2 * context_lines + inter_hunk_context` (see `xdl_get_hunk` in xemit.c).
3938    // `similar::group_diff_ops` couples the split threshold and the displayed edge context to a
3939    // single radius (split at `> 2n`), which over-merges when the gap limit is odd
3940    // (t4032: `-U0 --inter-hunk-context=1` with 2 common lines must stay 2 hunks).
3941    let max_common_gap = context_lines
3942        .saturating_mul(2)
3943        .saturating_add(inter_hunk_context);
3944    let op_groups = group_diff_ops_gap(compacted_ops, context_lines, max_common_gap);
3945
3946    for ops in op_groups {
3947        if ops.is_empty() {
3948            continue;
3949        }
3950        let hunk = UnifiedDiffHunk::new(ops, &diff, true);
3951        let hunk_str = format!("{hunk}");
3952        // The similar crate outputs @@ -a,b +c,d @@\n but Git adds
3953        // function context after the closing @@. Extract the hunk header
3954        // and add function context.
3955        if let Some(first_newline) = hunk_str.find('\n') {
3956            let header_line = &hunk_str[..first_newline];
3957            let rest = &hunk_str[first_newline..];
3958
3959            // Parse the old start line from the @@ header
3960            if let Some(func_ctx) =
3961                extract_function_context(header_line, &old_lines, funcname_matcher)
3962            {
3963                output.push_str(header_line);
3964                output.push(' ');
3965                output.push_str(&func_ctx);
3966                output.push_str(rest);
3967            } else {
3968                output.push_str(&hunk_str);
3969            }
3970        } else {
3971            output.push_str(&hunk_str);
3972        }
3973    }
3974
3975    output
3976}
3977
3978/// Group diff ops into hunks like Git's `xdl_get_hunk`: two changes merge into one hunk while
3979/// the run of unchanged lines between them is at most `max_common_gap`
3980/// (`2 * context + inter_hunk_context`), and each hunk keeps at most `context` unchanged lines
3981/// at its edges. Unlike `similar::group_diff_ops`, the split threshold is decoupled from the
3982/// edge context so odd gap limits group exactly like Git.
3983fn group_diff_ops_gap(
3984    mut ops: Vec<similar::DiffOp>,
3985    context: usize,
3986    max_common_gap: usize,
3987) -> Vec<Vec<similar::DiffOp>> {
3988    use similar::DiffOp;
3989    if ops.is_empty() {
3990        return vec![];
3991    }
3992
3993    let mut pending_group = Vec::new();
3994    let mut rv = Vec::new();
3995
3996    if let Some(DiffOp::Equal {
3997        old_index,
3998        new_index,
3999        len,
4000    }) = ops.first_mut()
4001    {
4002        let offset = (*len).saturating_sub(context);
4003        *old_index += offset;
4004        *new_index += offset;
4005        *len -= offset;
4006    }
4007
4008    if let Some(DiffOp::Equal { len, .. }) = ops.last_mut() {
4009        *len -= (*len).saturating_sub(context);
4010    }
4011
4012    for op in ops.into_iter() {
4013        if let DiffOp::Equal {
4014            old_index,
4015            new_index,
4016            len,
4017        } = op
4018        {
4019            // End the current group and start a new one whenever the unchanged
4020            // run is too long to fuse the surrounding changes.
4021            if len > max_common_gap {
4022                pending_group.push(DiffOp::Equal {
4023                    old_index,
4024                    new_index,
4025                    len: context,
4026                });
4027                rv.push(pending_group);
4028                let offset = len.saturating_sub(context);
4029                pending_group = vec![DiffOp::Equal {
4030                    old_index: old_index + offset,
4031                    new_index: new_index + offset,
4032                    len: len - offset,
4033                }];
4034                continue;
4035            }
4036        }
4037        pending_group.push(op);
4038    }
4039
4040    match &pending_group[..] {
4041        &[] | &[similar::DiffOp::Equal { .. }] => {}
4042        _ => rv.push(pending_group),
4043    }
4044
4045    rv
4046}
4047
4048/// `git diff -W`: expand each hunk to include full function bodies (see Git `xemit.c`).
4049fn unified_diff_with_function_context(
4050    old_content: &str,
4051    new_content: &str,
4052    old_path: &str,
4053    new_path: &str,
4054    context_lines: usize,
4055    inter_hunk_context: usize,
4056    src_prefix: &str,
4057    dst_prefix: &str,
4058    funcname_matcher: Option<&FuncnameMatcher>,
4059    algorithm: similar::Algorithm,
4060    indent_heuristic: bool,
4061    quote_path_fully: bool,
4062) -> String {
4063    use crate::quote_path::format_diff_path_with_prefix;
4064    use similar::{udiff::UnifiedDiffHunk, TextDiff};
4065
4066    let diff = TextDiff::configure()
4067        .algorithm(algorithm)
4068        .diff_lines(old_content, new_content);
4069
4070    let old_lines: Vec<&str> = old_content.lines().collect();
4071    let new_lines: Vec<&str> = new_content.lines().collect();
4072    let n_old = old_lines.len();
4073    let n_new = new_lines.len();
4074
4075    // Group changes the way Git's xdl_get_hunk does: merge while the unchanged
4076    // gap is at most `2*context + inter_hunk_context`. `similar::group_diff_ops`
4077    // splits only at gap > `2*radius`, which over-merges changes in *different*
4078    // functions (e.g. a leading insertion and a body change) into one hunk and
4079    // then over-expands the function context (t4015 #136). Use the gap-correct
4080    // grouping (same as the non-function-context path).
4081    let max_common_gap = context_lines
4082        .saturating_mul(2)
4083        .saturating_add(inter_hunk_context);
4084    let all_ops = diff.ops().to_vec();
4085    let op_groups = group_diff_ops_gap(all_ops.clone(), context_lines, max_common_gap);
4086
4087    let mut ranges: Vec<(usize, usize, usize, usize)> = Vec::new();
4088
4089    for ops in op_groups {
4090        if ops.is_empty() {
4091            continue;
4092        }
4093        let i1_anchor = func_context_old_anchor(&ops, n_old);
4094        let i1_end = hunk_old_change_end_exclusive(&ops);
4095        let skip_preimage_pull =
4096            append_with_whole_function_added(&ops, n_old, n_new, &new_lines, funcname_matcher);
4097        let hunk = UnifiedDiffHunk::new(ops, &diff, true);
4098        let hunk_str = format!("{hunk}");
4099        let header_line = hunk_str
4100            .lines()
4101            .next()
4102            .unwrap_or("")
4103            .trim_end_matches(['\r', '\n']);
4104        let Some((base_s1, _base_e1, _base_s2, _base_e2)) =
4105            parse_unified_hunk_header_ranges(header_line)
4106        else {
4107            continue;
4108        };
4109
4110        let ctx = context_lines;
4111        let (s1, e1, s2, e2) = if skip_preimage_pull {
4112            let s = n_old.saturating_sub(ctx);
4113            let s2 = map_old_line_to_new(&all_ops, s, n_new).min(n_new);
4114            (s, n_old, s2, n_new)
4115        } else {
4116            let mut s1 = base_s1.saturating_sub(ctx);
4117            let mut s2 = map_old_line_to_new(&all_ops, s1, n_new).min(n_new);
4118
4119            let base_pre_s1 = i1_anchor.saturating_sub(ctx);
4120            if base_pre_s1 < s1 {
4121                s1 = base_pre_s1;
4122                s2 = map_old_line_to_new(&all_ops, s1, n_new).min(n_new);
4123            }
4124
4125            let fs1 = expand_func_pre_start(s1, i1_anchor, n_old, &old_lines, funcname_matcher);
4126            if fs1 < s1 {
4127                s1 = fs1;
4128                s2 = map_old_line_to_new(&all_ops, s1, n_new).min(n_new);
4129            }
4130
4131            // `i1_end` is the exclusive end of the changed region; its post-image
4132            // context is `ctx` lines (Git's `xche->i1 + xche->chg1 + lctx`). The
4133            // hunk's `base_e1` already includes the group's trailing context, so
4134            // use the change end + ctx directly to avoid double-counting context
4135            // (which over-extended the hunk and merged separate functions — t4015 #136).
4136            let mut e1 = (i1_end + ctx).min(n_old);
4137            let mut e2 = map_old_line_to_new(&all_ops, e1, n_new).min(n_new);
4138            let fe1 = expand_func_post_end(e1, i1_end, n_old, &old_lines, funcname_matcher);
4139            if fe1 > e1 {
4140                e1 = fe1;
4141                e2 = map_old_line_to_new(&all_ops, e1, n_new).min(n_new);
4142            }
4143            (s1, e1, s2, e2)
4144        };
4145
4146        ranges.push((s1, e1, s2, e2));
4147    }
4148
4149    // Merge ranges whose function-context expansion made them overlap on the old
4150    // side (Git emits a single hunk in that case).
4151    ranges.sort_by_key(|r| (r.0, r.2));
4152    let mut merged: Vec<(usize, usize, usize, usize)> = Vec::with_capacity(ranges.len());
4153    for (s1, e1, s2, e2) in ranges {
4154        if let Some(last) = merged.last_mut() {
4155            if s1 < last.1 {
4156                last.1 = last.1.max(e1);
4157                last.3 = last.3.max(e2);
4158                continue;
4159            }
4160        }
4161        merged.push((s1, e1, s2, e2));
4162    }
4163    let ranges = merged;
4164
4165    let mut output = String::new();
4166    if old_path == "/dev/null" {
4167        output.push_str("--- /dev/null\n");
4168    } else if src_prefix.is_empty() {
4169        output.push_str(&format!("--- {old_path}\n"));
4170    } else {
4171        output.push_str("--- ");
4172        output.push_str(&format_diff_path_with_prefix(
4173            src_prefix,
4174            old_path,
4175            quote_path_fully,
4176        ));
4177        output.push('\n');
4178    }
4179    if new_path == "/dev/null" {
4180        output.push_str("+++ /dev/null\n");
4181    } else if dst_prefix.is_empty() {
4182        output.push_str(&format!("+++ {new_path}\n"));
4183    } else {
4184        output.push_str("+++ ");
4185        output.push_str(&format_diff_path_with_prefix(
4186            dst_prefix,
4187            new_path,
4188            quote_path_fully,
4189        ));
4190        output.push('\n');
4191    }
4192
4193    for (s1, e1, s2, e2) in ranges {
4194        if s1 >= e1 && s2 >= e2 {
4195            continue;
4196        }
4197        let old_seg =
4198            line_slice_for_diff_with_eof_nl(&old_lines, s1, e1, old_content.ends_with('\n'));
4199        let new_seg =
4200            line_slice_for_diff_with_eof_nl(&new_lines, s2, e2, new_content.ends_with('\n'));
4201        let inner_ctx = old_seg.lines().count().max(new_seg.lines().count()).max(1);
4202        let piece = unified_diff_with_prefix_and_funcname_and_algorithm(
4203            &old_seg,
4204            &new_seg,
4205            old_path,
4206            new_path,
4207            inner_ctx,
4208            0,
4209            src_prefix,
4210            dst_prefix,
4211            funcname_matcher,
4212            algorithm,
4213            false,
4214            false,
4215            indent_heuristic,
4216            quote_path_fully,
4217        );
4218        let shifted = shift_unified_hunk_headers_to_full_file(&piece, s1, s2);
4219        let with_func =
4220            enrich_unified_hunk_headers_funcname(&shifted, &old_lines, funcname_matcher);
4221        for line in with_func.lines() {
4222            if line.starts_with("--- ") || line.starts_with("+++ ") {
4223                continue;
4224            }
4225            output.push_str(line);
4226            output.push('\n');
4227        }
4228    }
4229
4230    output
4231}
4232
4233/// `piece` is a unified diff for a slice of the file; hunk headers use 1-based
4234/// coordinates relative to that slice. Shift them by `delta_old` / `delta_new`
4235/// (0-based offsets of the slice in the full file) so the combined patch applies
4236/// to the whole file.
4237fn shift_unified_hunk_headers_to_full_file(
4238    patch: &str,
4239    delta_old: usize,
4240    delta_new: usize,
4241) -> String {
4242    if delta_old == 0 && delta_new == 0 {
4243        return patch.to_owned();
4244    }
4245    let mut out = String::with_capacity(patch.len());
4246    for line in patch.lines() {
4247        if let Some(shifted) = shift_one_unified_hunk_header(line, delta_old, delta_new) {
4248            out.push_str(&shifted);
4249        } else {
4250            out.push_str(line);
4251        }
4252        out.push('\n');
4253    }
4254    out
4255}
4256
4257fn shift_one_unified_hunk_header(line: &str, delta_old: usize, delta_new: usize) -> Option<String> {
4258    let rest = line.strip_prefix("@@ ")?;
4259    let (old_chunk, after_plus) = rest.split_once(" +")?;
4260    let old_spec = old_chunk.strip_prefix('-')?;
4261    let (new_spec, suffix) = after_plus.split_once(" @@")?;
4262    let shifted_old = shift_unified_range_spec(old_spec, delta_old)?;
4263    let shifted_new = shift_unified_range_spec(new_spec, delta_new)?;
4264    Some(format!("@@ -{shifted_old} +{shifted_new} @@{suffix}"))
4265}
4266
4267fn shift_unified_range_spec(spec: &str, delta: usize) -> Option<String> {
4268    let spec = spec.trim();
4269    if let Some((start_s, count_s)) = spec.split_once(',') {
4270        let start: usize = start_s.parse().ok()?;
4271        let count: usize = count_s.parse().ok()?;
4272        Some(format!("{},{}", start.saturating_add(delta), count))
4273    } else {
4274        let start: usize = spec.parse().ok()?;
4275        Some(format!("{}", start.saturating_add(delta)))
4276    }
4277}
4278
4279/// Re-attach `@@ ... @@ <funcname>` using full-file line indices (inner diffs use slices).
4280fn enrich_unified_hunk_headers_funcname(
4281    patch: &str,
4282    full_old_lines: &[&str],
4283    funcname_matcher: Option<&FuncnameMatcher>,
4284) -> String {
4285    let mut out = String::with_capacity(patch.len());
4286    for line in patch.lines() {
4287        if let Some(fixed) = enrich_one_hunk_header_funcname(line, full_old_lines, funcname_matcher)
4288        {
4289            out.push_str(&fixed);
4290        } else {
4291            out.push_str(line);
4292        }
4293        out.push('\n');
4294    }
4295    out
4296}
4297
4298fn enrich_one_hunk_header_funcname(
4299    line: &str,
4300    full_old_lines: &[&str],
4301    funcname_matcher: Option<&FuncnameMatcher>,
4302) -> Option<String> {
4303    let after_at = line.strip_prefix("@@ ")?;
4304    let idx = after_at.find(" @@")?;
4305    let mid = after_at[..idx].trim();
4306    let tail = after_at[idx + 3..].trim_start();
4307    let header_for_parse = format!("@@ {mid} @@");
4308    let func = extract_function_context(&header_for_parse, full_old_lines, funcname_matcher);
4309    Some(if let Some(f) = func {
4310        format!("@@ {mid} @@ {f}")
4311    } else if !tail.is_empty() {
4312        format!("@@ {mid} @@ {tail}")
4313    } else {
4314        format!("@@ {mid} @@")
4315    })
4316}
4317
4318fn line_slice_for_diff_with_eof_nl(
4319    lines: &[&str],
4320    start: usize,
4321    end: usize,
4322    full_file_ends_with_newline: bool,
4323) -> String {
4324    if start >= end {
4325        return String::new();
4326    }
4327    let mut s = lines[start..end].join("\n");
4328    let slice_is_suffix_of_file = end == lines.len();
4329    let need_trailing_nl = if slice_is_suffix_of_file {
4330        full_file_ends_with_newline
4331    } else {
4332        true
4333    };
4334    if need_trailing_nl && !s.ends_with('\n') {
4335        s.push('\n');
4336    }
4337    s
4338}
4339
4340/// Map a 0-based old line index to the corresponding 0-based new line index using the full-file
4341/// diff ops (Git aligns context across deletions/insertions).
4342fn map_old_line_to_new(ops: &[similar::DiffOp], old_line: usize, n_new: usize) -> usize {
4343    use similar::DiffOp;
4344    let mut n = 0usize;
4345    for op in ops {
4346        match *op {
4347            DiffOp::Equal {
4348                old_index,
4349                new_index,
4350                len,
4351            } => {
4352                if old_index + len <= old_line {
4353                    n = new_index + len;
4354                    continue;
4355                }
4356                if old_index < old_line {
4357                    let take = old_line - old_index;
4358                    return (new_index + take).min(n_new);
4359                }
4360                return new_index.min(n_new);
4361            }
4362            DiffOp::Delete {
4363                old_index,
4364                old_len,
4365                new_index,
4366            } => {
4367                if old_index + old_len <= old_line {
4368                    n = new_index;
4369                    continue;
4370                }
4371                if old_index < old_line {
4372                    return new_index.min(n_new);
4373                }
4374            }
4375            DiffOp::Insert {
4376                old_index,
4377                new_index,
4378                new_len,
4379            } => {
4380                if old_index < old_line {
4381                    n = new_index + new_len;
4382                    continue;
4383                }
4384                if old_index == old_line {
4385                    // `old_line` is an exclusive end or insertion point aligned with this insert
4386                    // (e.g. EOF append maps to after the inserted block).
4387                    return (new_index + new_len).min(n_new);
4388                }
4389                return new_index.min(n_new);
4390            }
4391            DiffOp::Replace {
4392                old_index,
4393                old_len,
4394                new_index,
4395                new_len,
4396            } => {
4397                if old_index + old_len <= old_line {
4398                    n = new_index + new_len;
4399                    continue;
4400                }
4401                if old_index < old_line {
4402                    let into_old = old_line - old_index;
4403                    let mapped = new_index + into_old.min(new_len);
4404                    return mapped.min(n_new);
4405                }
4406                return new_index.min(n_new);
4407            }
4408        }
4409    }
4410    n.min(n_new)
4411}
4412
4413/// Parse `@@ -old +new @@` into 0-based half-open ranges in each file.
4414fn parse_unified_hunk_header_ranges(header: &str) -> Option<(usize, usize, usize, usize)> {
4415    let rest = header.strip_prefix("@@ ")?;
4416    let (old_tok, rest2) = rest.split_once(" +")?;
4417    let old_tok = old_tok.strip_prefix('-')?;
4418    let new_tok = rest2.split_once(" @@").map(|(a, _)| a)?;
4419
4420    fn parse_side(spec: &str) -> Option<(usize, usize)> {
4421        let spec = spec.trim();
4422        let (start_one_based, count) = if let Some((a, b)) = spec.split_once(',') {
4423            (a.parse::<usize>().ok()?, b.parse::<usize>().ok()?)
4424        } else {
4425            let s = spec.parse::<usize>().ok()?;
4426            (s, 1usize)
4427        };
4428        let s0 = start_one_based.saturating_sub(1);
4429        let e0 = s0.saturating_add(count);
4430        Some((s0, e0))
4431    }
4432
4433    let (os, oe) = parse_side(old_tok)?;
4434    let (ns, ne) = parse_side(new_tok)?;
4435    Some((os, oe, ns, ne))
4436}
4437
4438/// Git `xemit.c`: when a hunk only inserts at EOF (first inserted line is `new_index == n_old`)
4439/// and the added text already contains a funcname line, do not pull extra context from the preimage.
4440fn append_with_whole_function_added(
4441    ops: &[similar::DiffOp],
4442    n_old: usize,
4443    n_new: usize,
4444    new_lines: &[&str],
4445    matcher: Option<&FuncnameMatcher>,
4446) -> bool {
4447    use similar::DiffOp;
4448    if n_old == 0 {
4449        return false;
4450    }
4451    let mut only_ins_or_eq = true;
4452    let mut min_new_ins = usize::MAX;
4453    for op in ops {
4454        match *op {
4455            DiffOp::Equal { .. } => {}
4456            DiffOp::Insert {
4457                new_index, new_len, ..
4458            } => {
4459                min_new_ins = min_new_ins.min(new_index);
4460                if new_len == 0 {
4461                    only_ins_or_eq = false;
4462                }
4463            }
4464            DiffOp::Delete { .. } | DiffOp::Replace { .. } => {
4465                only_ins_or_eq = false;
4466            }
4467        }
4468    }
4469    let mut insert_at_eof = false;
4470    for op in ops {
4471        if let DiffOp::Insert { old_index, .. } = *op {
4472            if old_index == n_old {
4473                insert_at_eof = true;
4474                break;
4475            }
4476        }
4477    }
4478    let append_at_eof = min_new_ins == n_old || insert_at_eof;
4479    if !only_ins_or_eq || !append_at_eof || min_new_ins == usize::MAX {
4480        return false;
4481    }
4482    // Git only skips preimage pull when the inserted block is clearly a new logical
4483    // function (see `xemit.c` walking `xdf2` for `is_func_rec`). A loose "any line
4484    // looks like a function" check would match `return` / `printf` and break `-W`
4485    // hunks that still need preimage context (t4051 `extended`).
4486    let mut j = min_new_ins;
4487    while j < n_new {
4488        let line = new_lines[j];
4489        if line.trim().is_empty() {
4490            j += 1;
4491            continue;
4492        }
4493        if let Some(m) = matcher {
4494            if m.match_line(line).is_some() {
4495                return true;
4496            }
4497        } else if inserted_block_starts_with_c_like_function_definition(line) {
4498            return true;
4499        }
4500        j += 1;
4501    }
4502    false
4503}
4504
4505fn inserted_block_starts_with_c_like_function_definition(line: &str) -> bool {
4506    let t = line.trim_start();
4507    let Some(open_paren) = t.find('(') else {
4508        return false;
4509    };
4510    let head = &t[..open_paren];
4511    let tokens: Vec<&str> = head.split_whitespace().collect();
4512    if tokens.len() < 2 {
4513        // `printf(...)`, `return (`, etc. — not `return_type name(`.
4514        return false;
4515    }
4516    let nameish = tokens.last().copied().unwrap_or("");
4517    let name = nameish.trim_end_matches(['*', '&']);
4518    if name.is_empty() || !name.chars().all(|c| c.is_ascii_alphanumeric() || c == '_') {
4519        return false;
4520    }
4521    let type_or_modifier = |tok: &str| {
4522        matches!(
4523            tok,
4524            "static"
4525                | "extern"
4526                | "inline"
4527                | "void"
4528                | "int"
4529                | "char"
4530                | "short"
4531                | "long"
4532                | "float"
4533                | "double"
4534                | "unsigned"
4535                | "signed"
4536                | "struct"
4537                | "enum"
4538                | "union"
4539                | "const"
4540                | "volatile"
4541                | "typedef"
4542        )
4543    };
4544    tokens[..tokens.len() - 1]
4545        .iter()
4546        .any(|tok| type_or_modifier(tok))
4547}
4548
4549fn hunk_old_change_end_exclusive(ops: &[similar::DiffOp]) -> usize {
4550    use similar::DiffOp;
4551    let mut max_o = 0usize;
4552    for op in ops {
4553        match *op {
4554            DiffOp::Delete {
4555                old_index, old_len, ..
4556            } => {
4557                max_o = max_o.max(old_index + old_len);
4558            }
4559            DiffOp::Replace {
4560                old_index, old_len, ..
4561            } => {
4562                max_o = max_o.max(old_index + old_len);
4563            }
4564            DiffOp::Insert { old_index, .. } => {
4565                // Pure insertions do not consume old lines; Git's post-context anchor is the
4566                // insertion point (`old_index`), not 0 (t4051 `extended`).
4567                max_o = max_o.max(old_index);
4568            }
4569            DiffOp::Equal { .. } => {}
4570        }
4571    }
4572    max_o
4573}
4574
4575fn func_context_old_anchor(ops: &[similar::DiffOp], n_old: usize) -> usize {
4576    use similar::DiffOp;
4577    let mut has_delete_or_replace = false;
4578    let mut min_del = usize::MAX;
4579    let mut min_ins_old = usize::MAX;
4580
4581    for op in ops {
4582        match *op {
4583            DiffOp::Delete {
4584                old_index, old_len, ..
4585            } => {
4586                has_delete_or_replace = true;
4587                min_del = min_del.min(old_index);
4588                min_del = min_del.min(old_index + old_len.saturating_sub(1));
4589            }
4590            DiffOp::Replace {
4591                old_index, old_len, ..
4592            } => {
4593                has_delete_or_replace = true;
4594                min_del = min_del.min(old_index);
4595                min_del = min_del.min(old_index + old_len.saturating_sub(1));
4596            }
4597            DiffOp::Insert { old_index, .. } => {
4598                min_ins_old = min_ins_old.min(old_index);
4599            }
4600            DiffOp::Equal { .. } => {}
4601        }
4602    }
4603
4604    let mut i1 = if has_delete_or_replace {
4605        min_del
4606    } else if min_ins_old != usize::MAX {
4607        min_ins_old
4608    } else {
4609        0
4610    };
4611
4612    let pure_insert = ops
4613        .iter()
4614        .all(|op| matches!(op, DiffOp::Insert { .. } | DiffOp::Equal { .. }))
4615        && ops.iter().any(|op| matches!(op, DiffOp::Insert { .. }));
4616
4617    if pure_insert && i1 >= n_old && n_old > 0 {
4618        i1 = n_old - 1;
4619    }
4620
4621    i1.min(n_old.saturating_sub(1))
4622}
4623
4624fn expand_func_pre_start(
4625    s1: usize,
4626    i1: usize,
4627    n_old: usize,
4628    old_lines: &[&str],
4629    matcher: Option<&FuncnameMatcher>,
4630) -> usize {
4631    if n_old == 0 {
4632        return s1;
4633    }
4634    let i1 = i1.min(n_old.saturating_sub(1));
4635    let mut fs1 = get_func_line_backward(old_lines, i1, matcher).unwrap_or(i1);
4636    while fs1 > 0
4637        && !is_line_empty_for_func_context(old_lines[fs1 - 1])
4638        && !is_func_line(old_lines[fs1 - 1], matcher)
4639    {
4640        fs1 -= 1;
4641    }
4642    s1.min(fs1)
4643}
4644
4645fn expand_func_post_end(
4646    e1: usize,
4647    i1_end: usize,
4648    n_old: usize,
4649    old_lines: &[&str],
4650    matcher: Option<&FuncnameMatcher>,
4651) -> usize {
4652    let from = i1_end.min(n_old);
4653    let fe1 = get_func_line_forward(old_lines, from, matcher).unwrap_or(n_old);
4654    let mut fe1_adj = fe1;
4655    while fe1_adj > 0 && is_line_empty_for_func_context(old_lines[fe1_adj - 1]) {
4656        fe1_adj -= 1;
4657    }
4658    e1.max(fe1_adj).min(n_old)
4659}
4660
4661fn is_line_empty_for_func_context(line: &str) -> bool {
4662    line.chars().all(|c| c.is_whitespace())
4663}
4664
4665fn is_func_line(line: &str, matcher: Option<&FuncnameMatcher>) -> bool {
4666    if let Some(m) = matcher {
4667        return m.match_line(line).is_some();
4668    }
4669    let t = line.trim_end_matches(['\n', '\r']);
4670    if t.is_empty() {
4671        return false;
4672    }
4673    let b = t.as_bytes()[0];
4674    b.is_ascii_alphabetic() || b == b'_' || b == b'$'
4675}
4676
4677fn get_func_line_backward(
4678    old_lines: &[&str],
4679    start: usize,
4680    matcher: Option<&FuncnameMatcher>,
4681) -> Option<usize> {
4682    let mut l = start.min(old_lines.len().saturating_sub(1));
4683    if old_lines.is_empty() {
4684        return None;
4685    }
4686    loop {
4687        if is_func_line(old_lines[l], matcher) {
4688            return Some(l);
4689        }
4690        if l == 0 {
4691            break;
4692        }
4693        l -= 1;
4694    }
4695    None
4696}
4697
4698fn get_func_line_forward(
4699    old_lines: &[&str],
4700    start: usize,
4701    matcher: Option<&FuncnameMatcher>,
4702) -> Option<usize> {
4703    let mut l = start;
4704    while l < old_lines.len() {
4705        if is_func_line(old_lines[l], matcher) {
4706            return Some(l);
4707        }
4708        l += 1;
4709    }
4710    None
4711}
4712
4713/// Compute a unified diff with anchored lines.
4714///
4715/// Anchored lines that appear exactly once in both old and new content are
4716/// forced to match, splitting the diff into segments around those anchor points.
4717/// This produces diffs where the anchored text stays as context and surrounding
4718/// lines are shown as additions/removals.
4719///
4720/// Segment diffs use `algorithm`. When `use_git_histogram` is true, histogram uses imara-diff
4721/// (Git-compatible); otherwise `algorithm` is passed to `similar`.
4722pub fn anchored_unified_diff(
4723    old_content: &str,
4724    new_content: &str,
4725    old_path: &str,
4726    new_path: &str,
4727    context_lines: usize,
4728    anchors: &[String],
4729    algorithm: similar::Algorithm,
4730    use_git_histogram: bool,
4731    indent_heuristic: bool,
4732    quote_path_fully: bool,
4733) -> String {
4734    use crate::quote_path::format_diff_path_with_prefix;
4735    use similar::TextDiff;
4736
4737    let old_lines: Vec<&str> = old_content.lines().collect();
4738    let new_lines: Vec<&str> = new_content.lines().collect();
4739
4740    // Find anchored lines that appear exactly once in both old and new
4741    let mut anchor_pairs: Vec<(usize, usize)> = Vec::new(); // (old_idx, new_idx)
4742
4743    for anchor in anchors {
4744        let anchor_str = anchor.as_str();
4745
4746        // Count occurrences in old
4747        let old_positions: Vec<usize> = old_lines
4748            .iter()
4749            .enumerate()
4750            .filter(|(_, l)| l.trim_end() == anchor_str)
4751            .map(|(i, _)| i)
4752            .collect();
4753
4754        // Count occurrences in new
4755        let new_positions: Vec<usize> = new_lines
4756            .iter()
4757            .enumerate()
4758            .filter(|(_, l)| l.trim_end() == anchor_str)
4759            .map(|(i, _)| i)
4760            .collect();
4761
4762        // Only anchor if unique in both
4763        if old_positions.len() == 1 && new_positions.len() == 1 {
4764            anchor_pairs.push((old_positions[0], new_positions[0]));
4765        }
4766    }
4767
4768    // If no valid anchors, fall back to normal diff
4769    if anchor_pairs.is_empty() {
4770        return unified_diff_with_prefix_and_funcname_and_algorithm(
4771            old_content,
4772            new_content,
4773            old_path,
4774            new_path,
4775            context_lines,
4776            0,
4777            "a/",
4778            "b/",
4779            None,
4780            algorithm,
4781            false,
4782            use_git_histogram,
4783            indent_heuristic,
4784            quote_path_fully,
4785        );
4786    }
4787
4788    // Sort anchor pairs by their position in the old file
4789    anchor_pairs.sort_by_key(|&(old_idx, _)| old_idx);
4790
4791    // Filter to only keep pairs where new positions are also increasing
4792    // (longest increasing subsequence of new positions)
4793    let mut filtered: Vec<(usize, usize)> = Vec::new();
4794    for &pair in &anchor_pairs {
4795        if filtered.is_empty() || filtered.last().is_some_and(|last| pair.1 > last.1) {
4796            filtered.push(pair);
4797        }
4798    }
4799    let anchor_pairs = filtered;
4800
4801    // Build a modified version of old/new where we diff segments between anchors.
4802    // We'll construct the diff by processing segments:
4803    // - Before first anchor
4804    // - Between consecutive anchors
4805    // - After last anchor
4806    // Each anchor line itself is a fixed context match.
4807
4808    // Collect all diff operations
4809    struct LineDiffOp {
4810        tag: char, // ' ', '+', '-'
4811        line: String,
4812    }
4813
4814    let append_segment_diff =
4815        |ops: &mut Vec<LineDiffOp>, old_seg_input: &str, new_seg_input: &str| {
4816            use similar::ChangeTag;
4817            let old_ls: Vec<&str> = old_seg_input.lines().collect();
4818            let new_ls: Vec<&str> = new_seg_input.lines().collect();
4819            if old_ls.is_empty() && new_ls.is_empty() {
4820                return;
4821            }
4822            let seg_diff = TextDiff::configure()
4823                .algorithm(algorithm)
4824                .diff_slices(&old_ls, &new_ls);
4825            let raw = seg_diff.ops().to_vec();
4826            let compacted = diff_indent_heuristic::apply_change_compact_to_ops(
4827                &raw,
4828                &old_ls,
4829                &new_ls,
4830                indent_heuristic,
4831            );
4832            for op in &compacted {
4833                for ch in op.iter_changes(&old_ls, &new_ls) {
4834                    let t = match ch.tag() {
4835                        ChangeTag::Equal => ' ',
4836                        ChangeTag::Delete => '-',
4837                        ChangeTag::Insert => '+',
4838                    };
4839                    ops.push(LineDiffOp {
4840                        tag: t,
4841                        line: ch.value().to_string(),
4842                    });
4843                }
4844            }
4845        };
4846
4847    let mut ops: Vec<LineDiffOp> = Vec::new();
4848    let mut old_pos = 0usize;
4849    let mut new_pos = 0usize;
4850
4851    for &(old_anchor, new_anchor) in &anchor_pairs {
4852        // Diff the segment before this anchor
4853        let old_segment: Vec<&str> = old_lines[old_pos..old_anchor].to_vec();
4854        let new_segment: Vec<&str> = new_lines[new_pos..new_anchor].to_vec();
4855
4856        let old_seg_text = old_segment.join("\n");
4857        let new_seg_text = new_segment.join("\n");
4858
4859        if !old_seg_text.is_empty() || !new_seg_text.is_empty() {
4860            let old_seg_input = if old_seg_text.is_empty() {
4861                String::new()
4862            } else {
4863                format!("{}\n", old_seg_text)
4864            };
4865            let new_seg_input = if new_seg_text.is_empty() {
4866                String::new()
4867            } else {
4868                format!("{}\n", new_seg_text)
4869            };
4870            append_segment_diff(&mut ops, &old_seg_input, &new_seg_input);
4871        }
4872
4873        // The anchor line itself is always context
4874        ops.push(LineDiffOp {
4875            tag: ' ',
4876            line: old_lines[old_anchor].to_string(),
4877        });
4878
4879        old_pos = old_anchor + 1;
4880        new_pos = new_anchor + 1;
4881    }
4882
4883    // Diff the remaining segment after the last anchor
4884    let old_segment: Vec<&str> = old_lines[old_pos..].to_vec();
4885    let new_segment: Vec<&str> = new_lines[new_pos..].to_vec();
4886    let old_seg_text = old_segment.join("\n");
4887    let new_seg_text = new_segment.join("\n");
4888
4889    if !old_seg_text.is_empty() || !new_seg_text.is_empty() {
4890        let old_seg_input = if old_seg_text.is_empty() {
4891            String::new()
4892        } else {
4893            format!("{}\n", old_seg_text)
4894        };
4895        let new_seg_input = if new_seg_text.is_empty() {
4896            String::new()
4897        } else {
4898            format!("{}\n", new_seg_text)
4899        };
4900        append_segment_diff(&mut ops, &old_seg_input, &new_seg_input);
4901    }
4902
4903    // Now format as unified diff with hunks
4904    let mut output = String::new();
4905    if old_path == "/dev/null" {
4906        output.push_str("--- /dev/null\n");
4907    } else {
4908        output.push_str("--- ");
4909        output.push_str(&format_diff_path_with_prefix(
4910            "a/",
4911            old_path,
4912            quote_path_fully,
4913        ));
4914        output.push('\n');
4915    }
4916    if new_path == "/dev/null" {
4917        output.push_str("+++ /dev/null\n");
4918    } else {
4919        output.push_str("+++ ");
4920        output.push_str(&format_diff_path_with_prefix(
4921            "b/",
4922            new_path,
4923            quote_path_fully,
4924        ));
4925        output.push('\n');
4926    }
4927
4928    // Group ops into hunks with context
4929    let total_ops = ops.len();
4930    if total_ops == 0 {
4931        return output;
4932    }
4933
4934    // Find ranges of changes
4935    let mut hunks: Vec<(usize, usize)> = Vec::new(); // (start, end) indices into ops
4936    let mut i = 0;
4937    while i < total_ops {
4938        if ops[i].tag != ' ' {
4939            let start = i.saturating_sub(context_lines);
4940            let mut end = i;
4941            // Extend to include consecutive changes and their context
4942            while end < total_ops {
4943                if ops[end].tag != ' ' {
4944                    end += 1;
4945                    continue;
4946                }
4947                // Check if there's another change within context_lines
4948                let mut next_change = end;
4949                while next_change < total_ops && ops[next_change].tag == ' ' {
4950                    next_change += 1;
4951                }
4952                if next_change < total_ops && next_change - end <= context_lines * 2 {
4953                    end = next_change + 1;
4954                } else {
4955                    end = (end + context_lines).min(total_ops);
4956                    break;
4957                }
4958            }
4959            // Merge with previous hunk if overlapping
4960            if let Some(last) = hunks.last_mut() {
4961                if start <= last.1 {
4962                    last.1 = end;
4963                } else {
4964                    hunks.push((start, end));
4965                }
4966            } else {
4967                hunks.push((start, end));
4968            }
4969            i = end;
4970        } else {
4971            i += 1;
4972        }
4973    }
4974
4975    // Output each hunk
4976    for (start, end) in hunks {
4977        // Count old/new lines in this hunk
4978        let mut old_start = 1usize;
4979        let mut new_start = 1usize;
4980        // Calculate line numbers by counting ops before this hunk
4981        for op in &ops[..start] {
4982            match op.tag {
4983                ' ' => {
4984                    old_start += 1;
4985                    new_start += 1;
4986                }
4987                '-' => {
4988                    old_start += 1;
4989                }
4990                '+' => {
4991                    new_start += 1;
4992                }
4993                _ => {}
4994            }
4995        }
4996        let mut old_count = 0usize;
4997        let mut new_count = 0usize;
4998        for op in &ops[start..end] {
4999            match op.tag {
5000                ' ' => {
5001                    old_count += 1;
5002                    new_count += 1;
5003                }
5004                '-' => {
5005                    old_count += 1;
5006                }
5007                '+' => {
5008                    new_count += 1;
5009                }
5010                _ => {}
5011            }
5012        }
5013
5014        output.push_str(&format!(
5015            "@@ -{},{} +{},{} @@\n",
5016            old_start, old_count, new_start, new_count
5017        ));
5018        for op in &ops[start..end] {
5019            output.push(op.tag);
5020            output.push_str(&op.line);
5021            output.push('\n');
5022        }
5023    }
5024
5025    output
5026}
5027
5028/// Extract function context for a hunk header.
5029///
5030/// Given a hunk header like `@@ -8,7 +8,7 @@`, find the last line
5031/// before line 8 in the old content that looks like a function header
5032/// (starts with a non-whitespace character, like Git's default).
5033fn extract_function_context(
5034    header: &str,
5035    old_lines: &[&str],
5036    funcname_matcher: Option<&FuncnameMatcher>,
5037) -> Option<String> {
5038    // Parse the old start line number from "@@ -<start>,<count> ..."
5039    let at_pos = header.find("-")?;
5040    let rest = &header[at_pos + 1..];
5041    let comma_or_space = rest.find([',', ' '])?;
5042    let start_str = &rest[..comma_or_space];
5043    let start_line: usize = start_str.parse().ok()?;
5044
5045    // Parse the old line count; "@@ -<start>,<count> ..." (no comma means count 1).
5046    // Only look for the comma inside the old-range token itself — searching the
5047    // whole remainder would pick up the comma from the new side (e.g. "+0,0").
5048    let old_token_end = rest.find([' ', '\t']).unwrap_or(rest.len());
5049    let old_token = &rest[..old_token_end];
5050    let old_count: usize = if let Some(comma) = old_token.find(',') {
5051        old_token[comma + 1..].parse().unwrap_or(1)
5052    } else {
5053        1
5054    };
5055
5056    if start_line == 0 {
5057        return None;
5058    }
5059
5060    // Look backwards for a line that matches the funcname pattern. start_line is
5061    // 1-indexed. For a normal hunk the first changed pre-image line is
5062    // old_lines[start_line-1], so we search lines strictly before it
5063    // (old_lines[0..start_line-1]). For a pure insertion (old count 0) the
5064    // content is inserted *after* old line start_line, so Git's function search
5065    // begins at that line itself: search old_lines[0..start_line].
5066    let search_end = if old_count == 0 {
5067        start_line.min(old_lines.len())
5068    } else {
5069        if start_line <= 1 {
5070            return None;
5071        }
5072        (start_line - 1).min(old_lines.len())
5073    };
5074    let truncate = |text: &str| {
5075        if text.len() > 80 {
5076            let mut end = 80;
5077            while end > 0 && !text.is_char_boundary(end) {
5078                end -= 1;
5079            }
5080            text[..end].to_owned()
5081        } else {
5082            text.to_owned()
5083        }
5084    };
5085
5086    for i in (0..search_end).rev() {
5087        let line = old_lines[i];
5088        if line.is_empty() {
5089            continue;
5090        }
5091        if let Some(matcher) = funcname_matcher {
5092            if let Some(matched) = matcher.match_line(line) {
5093                return Some(truncate(&matched));
5094            }
5095            continue;
5096        }
5097
5098        let first = line.as_bytes()[0];
5099        if first.is_ascii_alphabetic() || first == b'_' || first == b'$' {
5100            return Some(truncate(line.trim_end_matches(char::is_whitespace)));
5101        }
5102    }
5103    None
5104}
5105
5106/// Generate diff stat output (file name + insertions/deletions).
5107///
5108/// Returns a single line like: ` file.txt | 5 ++---`
5109pub fn format_stat_line(
5110    path: &str,
5111    insertions: usize,
5112    deletions: usize,
5113    max_path_len: usize,
5114) -> String {
5115    format_stat_line_width(path, insertions, deletions, max_path_len, 0)
5116}
5117
5118pub fn format_stat_line_width(
5119    path: &str,
5120    insertions: usize,
5121    deletions: usize,
5122    max_path_len: usize,
5123    count_width: usize,
5124) -> String {
5125    let total = insertions + deletions;
5126    let plus = "+".repeat(insertions.min(50));
5127    let minus = "-".repeat(deletions.min(50));
5128    let cw = if count_width > 0 {
5129        count_width
5130    } else {
5131        format!("{}", total).len()
5132    };
5133    let bar = format!("{}{}", plus, minus);
5134    if bar.is_empty() {
5135        format!(
5136            " {:<width$} | {:>cw$}",
5137            path,
5138            total,
5139            width = max_path_len,
5140            cw = cw
5141        )
5142    } else {
5143        format!(
5144            " {:<width$} | {:>cw$} {}",
5145            path,
5146            total,
5147            bar,
5148            width = max_path_len,
5149            cw = cw
5150        )
5151    }
5152}
5153
5154/// Normalise one line like Git's `-b` / `--ignore-space-change`.
5155#[must_use]
5156pub fn normalize_ignore_space_change_line(line: &str) -> String {
5157    let mut result = String::with_capacity(line.len());
5158    let mut in_space = false;
5159    for c in line.chars() {
5160        if c.is_whitespace() {
5161            if !in_space {
5162                result.push(' ');
5163                in_space = true;
5164            }
5165        } else {
5166            result.push(c);
5167            in_space = false;
5168        }
5169    }
5170    while result.ends_with(' ') {
5171        result.pop();
5172    }
5173    result
5174}
5175
5176/// Normalise text like Git's `-b` / `--ignore-space-change`: on each line, collapse runs of
5177/// whitespace to a single ASCII space and trim trailing spaces.
5178///
5179/// Line breaks are preserved by splitting on [`str::lines`] and rejoining with `\n` (same approach
5180/// as the porcelain `diff` whitespace handling in `grit`).
5181#[must_use]
5182pub fn normalize_ignore_space_change(content: &str) -> String {
5183    content
5184        .lines()
5185        .map(normalize_ignore_space_change_line)
5186        .collect::<Vec<_>>()
5187        .join("\n")
5188}
5189
5190/// Count insertions and deletions between two strings.
5191///
5192/// Returns `(insertions, deletions)`.
5193pub fn count_changes(old_content: &str, new_content: &str) -> (usize, usize) {
5194    count_changes_with_algorithm(old_content, new_content, similar::Algorithm::Myers, false)
5195}
5196
5197/// Count insertions and deletions using the given line-diff algorithm.
5198///
5199/// Git's `--stat` / `--numstat` follow the configured diff algorithm; this mirrors that by
5200/// running [`similar::TextDiff`] with an explicit [`similar::Algorithm`].
5201#[must_use]
5202pub fn count_changes_with_algorithm(
5203    old_content: &str,
5204    new_content: &str,
5205    algorithm: similar::Algorithm,
5206    use_git_histogram: bool,
5207) -> (usize, usize) {
5208    if use_git_histogram {
5209        use imara_diff::{Algorithm, Diff, InternedInput};
5210        let input = InternedInput::new(old_content, new_content);
5211        let mut d = Diff::compute(Algorithm::Histogram, &input);
5212        d.postprocess_lines(&input);
5213        return (d.count_additions() as usize, d.count_removals() as usize);
5214    }
5215
5216    use similar::{ChangeTag, TextDiff};
5217
5218    let diff = TextDiff::configure()
5219        .algorithm(algorithm)
5220        .diff_lines(old_content, new_content);
5221    let mut ins = 0;
5222    let mut del = 0;
5223
5224    for change in diff.iter_all_changes() {
5225        match change.tag() {
5226            ChangeTag::Insert => ins += 1,
5227            ChangeTag::Delete => del += 1,
5228            ChangeTag::Equal => {}
5229        }
5230    }
5231
5232    (ins, del)
5233}
5234
5235/// Line count for diffstat/`--numstat`, matching Git's `count_lines()` in `diff.c`.
5236///
5237/// Counts newline-terminated lines; a final line without trailing newline still counts as one line.
5238/// An empty buffer yields `0`.
5239#[must_use]
5240pub fn count_git_lines(data: &[u8]) -> usize {
5241    if data.is_empty() {
5242        return 0;
5243    }
5244    let mut count = 0usize;
5245    let mut nl_just_seen = false;
5246    for &ch in data {
5247        if ch == b'\n' {
5248            count += 1;
5249            nl_just_seen = true;
5250        } else {
5251            nl_just_seen = false;
5252        }
5253    }
5254    if !nl_just_seen {
5255        count += 1;
5256    }
5257    count
5258}
5259
5260/// Internal maximum diff score used by Git rename/break heuristics (`MAX_SCORE` in `diffcore.h`).
5261pub const GIT_DIFF_MAX_SCORE: u64 = 60_000;
5262const DIFF_MAX_SCORE: u64 = GIT_DIFF_MAX_SCORE;
5263const DIFF_MINIMUM_BREAK_SIZE: usize = 400;
5264const DIFF_DEFAULT_BREAK_SCORE: u64 = 30_000;
5265/// Default break threshold (`DEFAULT_BREAK_SCORE` in `diffcore.h`), internal 0–[`GIT_DIFF_MAX_SCORE`] scale.
5266pub const GIT_DIFF_DEFAULT_BREAK_SCORE: u64 = DIFF_DEFAULT_BREAK_SCORE;
5267/// Default merge threshold after a break (`DEFAULT_MERGE_SCORE` in `diffcore.h`): pairs broken for
5268/// rename/copy but not consumed are merged back when deletion-weight is below this (60% by default).
5269pub const GIT_DIFF_DEFAULT_MERGE_SCORE_AFTER_BREAK: u64 = 36_000;
5270const DIFF_HASHBASE: u32 = 107_927;
5271
5272#[derive(Clone, Copy, Default)]
5273struct SpanSlot {
5274    hashval: u32,
5275    cnt: u32,
5276}
5277
5278struct SpanHashTop {
5279    alloc_log2: u8,
5280    free_slots: i32,
5281    data: Vec<SpanSlot>,
5282}
5283
5284impl SpanHashTop {
5285    fn new(initial_log2: u8) -> Self {
5286        let cap = 1usize << initial_log2;
5287        Self {
5288            alloc_log2: initial_log2,
5289            free_slots: initial_free(initial_log2),
5290            data: vec![SpanSlot::default(); cap],
5291        }
5292    }
5293
5294    fn len(&self) -> usize {
5295        1usize << self.alloc_log2
5296    }
5297
5298    fn add_span(&mut self, hashval: u32, cnt: u32) {
5299        loop {
5300            let lim = self.len();
5301            let mut bucket = (hashval as usize) & (lim - 1);
5302            loop {
5303                let h = &mut self.data[bucket];
5304                if h.cnt == 0 {
5305                    h.hashval = hashval;
5306                    h.cnt = cnt;
5307                    self.free_slots -= 1;
5308                    if self.free_slots < 0 {
5309                        self.rehash();
5310                        break;
5311                    }
5312                    return;
5313                }
5314                if h.hashval == hashval {
5315                    h.cnt = h.cnt.saturating_add(cnt);
5316                    return;
5317                }
5318                bucket += 1;
5319                if bucket >= lim {
5320                    bucket = 0;
5321                }
5322            }
5323        }
5324    }
5325
5326    fn rehash(&mut self) {
5327        let old = std::mem::take(&mut self.data);
5328        let old_log = self.alloc_log2;
5329        self.alloc_log2 = old_log.saturating_add(1);
5330        let new_len = 1usize << self.alloc_log2;
5331        self.free_slots = initial_free(self.alloc_log2);
5332        self.data = vec![SpanSlot::default(); new_len];
5333        let old_sz = 1usize << old_log;
5334        for o in old.iter().take(old_sz) {
5335            let o = *o;
5336            if o.cnt == 0 {
5337                continue;
5338            }
5339            self.add_span_after_rehash(o.hashval, o.cnt);
5340        }
5341    }
5342
5343    fn add_span_after_rehash(&mut self, hashval: u32, cnt: u32) {
5344        loop {
5345            let lim = self.len();
5346            let mut bucket = (hashval as usize) & (lim - 1);
5347            loop {
5348                let h = &mut self.data[bucket];
5349                if h.cnt == 0 {
5350                    h.hashval = hashval;
5351                    h.cnt = cnt;
5352                    self.free_slots -= 1;
5353                    if self.free_slots < 0 {
5354                        self.rehash();
5355                        break;
5356                    }
5357                    return;
5358                }
5359                if h.hashval == hashval {
5360                    h.cnt = h.cnt.saturating_add(cnt);
5361                    return;
5362                }
5363                bucket += 1;
5364                if bucket >= lim {
5365                    bucket = 0;
5366                }
5367            }
5368        }
5369    }
5370
5371    fn sort_by_hashval(&mut self) {
5372        let sz = self.len();
5373        self.data[..sz].sort_by(|a, b| {
5374            if a.cnt == 0 {
5375                return std::cmp::Ordering::Greater;
5376            }
5377            if b.cnt == 0 {
5378                return std::cmp::Ordering::Less;
5379            }
5380            a.hashval.cmp(&b.hashval)
5381        });
5382    }
5383}
5384
5385fn initial_free(sz_log2: u8) -> i32 {
5386    let sz = sz_log2 as i32;
5387    ((1i32 << sz_log2) * (sz - 3) / sz).max(0)
5388}
5389
5390fn hash_blob_spans(buf: &[u8], is_text: bool) -> SpanHashTop {
5391    let mut hash = SpanHashTop::new(9);
5392    let mut n = 0u32;
5393    let mut accum1: u32 = 0;
5394    let mut accum2: u32 = 0;
5395    let mut i = 0usize;
5396    while i < buf.len() {
5397        let c = buf[i] as u32;
5398        let old_1 = accum1;
5399        i += 1;
5400
5401        if is_text && c == b'\r' as u32 && i < buf.len() && buf[i] == b'\n' {
5402            continue;
5403        }
5404
5405        accum1 = accum1.wrapping_shl(7) ^ accum2.wrapping_shr(25);
5406        accum2 = accum2.wrapping_shl(7) ^ old_1.wrapping_shr(25);
5407        accum1 = accum1.wrapping_add(c);
5408        n += 1;
5409        if n < 64 && c != b'\n' as u32 {
5410            continue;
5411        }
5412        let hashval = (accum1.wrapping_add(accum2.wrapping_mul(0x61))) % DIFF_HASHBASE;
5413        hash.add_span(hashval, n);
5414        n = 0;
5415        accum1 = 0;
5416        accum2 = 0;
5417    }
5418    if n > 0 {
5419        let hashval = (accum1.wrapping_add(accum2.wrapping_mul(0x61))) % DIFF_HASHBASE;
5420        hash.add_span(hashval, n);
5421    }
5422    hash.sort_by_hashval();
5423    hash
5424}
5425
5426/// Approximate copied vs added material between two blobs (Git `diffcore_count_changes`).
5427///
5428/// Returns `(copied_bytes_from_src, literal_added_bytes_in_dst)` matching Git's
5429/// `diffcore_count_changes` semantics (used for `--dirstat=changes` damage).
5430#[must_use]
5431pub fn diffcore_count_changes(old: &[u8], new: &[u8]) -> (u64, u64) {
5432    let src_is_text = !crate::merge_file::is_binary(old);
5433    let dst_is_text = !crate::merge_file::is_binary(new);
5434    let src_count = hash_blob_spans(old, src_is_text);
5435    let dst_count = hash_blob_spans(new, dst_is_text);
5436    let mut sc: u64 = 0;
5437    let mut la: u64 = 0;
5438    let mut si = 0usize;
5439    let mut di = 0usize;
5440    let src_len = src_count.len();
5441    let dst_len = dst_count.len();
5442    loop {
5443        if si >= src_len || src_count.data[si].cnt == 0 {
5444            break;
5445        }
5446        let s_hash = src_count.data[si].hashval;
5447        let s_cnt = u64::from(src_count.data[si].cnt);
5448        while di < dst_len && dst_count.data[di].cnt != 0 && dst_count.data[di].hashval < s_hash {
5449            la += u64::from(dst_count.data[di].cnt);
5450            di += 1;
5451        }
5452        let mut dst_cnt = 0u64;
5453        if di < dst_len && dst_count.data[di].cnt != 0 && dst_count.data[di].hashval == s_hash {
5454            dst_cnt = u64::from(dst_count.data[di].cnt);
5455            di += 1;
5456        }
5457        if s_cnt < dst_cnt {
5458            la += dst_cnt - s_cnt;
5459            sc += s_cnt;
5460        } else {
5461            sc += dst_cnt;
5462        }
5463        si += 1;
5464    }
5465    while di < dst_len && dst_count.data[di].cnt != 0 {
5466        la += u64::from(dst_count.data[di].cnt);
5467        di += 1;
5468    }
5469    (sc, la)
5470}
5471
5472/// Whether this modified blob pair should use Git's "complete rewrite" diffstat path when
5473/// `--break-rewrites` is in effect (`should_break` in `diffcore-break.c`).
5474#[must_use]
5475pub fn should_break_rewrite_for_stat(old: &[u8], new: &[u8]) -> bool {
5476    should_break_rewrite_inner(old, new, DIFF_DEFAULT_BREAK_SCORE)
5477}
5478
5479/// Whether an in-place blob edit should be split into delete+create for rename/copy (`should_break`
5480/// in `diffcore-break.c`). `break_score` is on the internal 0–[`GIT_DIFF_MAX_SCORE`] scale (default
5481/// [`DIFF_DEFAULT_BREAK_SCORE`]).
5482#[must_use]
5483pub fn should_break_rewrite_pair(old: &[u8], new: &[u8], break_score: u64) -> bool {
5484    should_break_rewrite_inner(old, new, break_score)
5485}
5486
5487/// Parse a single Git `parse_rename_score` token (`50`, `50%`, decimal forms) into internal
5488/// 0–[`GIT_DIFF_MAX_SCORE`] units.
5489pub fn parse_diff_rename_score_token(arg: &str) -> Option<u64> {
5490    let mut num: u64 = 0;
5491    let mut scale: u64 = 1;
5492    let mut dot = false;
5493    let mut saw_digit = false;
5494    for ch in arg.chars() {
5495        if !dot && ch == '.' {
5496            scale = 1;
5497            dot = true;
5498            continue;
5499        }
5500        if ch == '%' {
5501            scale = if dot { scale.saturating_mul(100) } else { 100 };
5502            break;
5503        }
5504        if ch.is_ascii_digit() {
5505            saw_digit = true;
5506            if scale < 100_000 {
5507                scale = scale.saturating_mul(10);
5508                num = num.saturating_mul(10) + u64::from(ch as u8 - b'0');
5509            }
5510        } else {
5511            break;
5512        }
5513    }
5514    if !saw_digit {
5515        return None;
5516    }
5517    Some(if num >= scale {
5518        GIT_DIFF_MAX_SCORE
5519    } else {
5520        GIT_DIFF_MAX_SCORE * num / scale
5521    })
5522}
5523
5524/// Git `merge_score` from `diffcore-break.c` when a pair is considered broken: how much of the
5525/// source blob was removed (0–[`DIFF_MAX_SCORE`] scale). Used for `dissimilarity index` metadata.
5526#[must_use]
5527pub fn rewrite_merge_score(old: &[u8], new: &[u8]) -> Option<u64> {
5528    if old.is_empty() {
5529        return None;
5530    }
5531    let max_size = old.len().max(new.len());
5532    if max_size < DIFF_MINIMUM_BREAK_SIZE {
5533        return None;
5534    }
5535    let (src_copied, _) = diffcore_count_changes(old, new);
5536    let src_copied = src_copied.min(old.len() as u64);
5537    let src_removed = (old.len() as u64).saturating_sub(src_copied);
5538    Some(src_removed * DIFF_MAX_SCORE / old.len() as u64)
5539}
5540
5541/// Percentage shown in `dissimilarity index N%` for a rewrite (`similarity_index` in Git's diff.c).
5542#[must_use]
5543pub fn rewrite_dissimilarity_index_percent(old: &[u8], new: &[u8]) -> Option<u32> {
5544    let score = rewrite_merge_score(old, new)?;
5545    Some((score * 100 / DIFF_MAX_SCORE).min(100) as u32)
5546}
5547
5548fn should_break_rewrite_inner(src: &[u8], dst: &[u8], break_score: u64) -> bool {
5549    if src.is_empty() {
5550        return false;
5551    }
5552    let max_size = src.len().max(dst.len());
5553    if max_size < DIFF_MINIMUM_BREAK_SIZE {
5554        return false;
5555    }
5556    let (src_copied, literal_added) = diffcore_count_changes(src, dst);
5557    let src_copied = src_copied.min(src.len() as u64);
5558    let mut literal_added = literal_added;
5559    let dst_len = dst.len() as u64;
5560    if src_copied < dst_len && literal_added + src_copied > dst_len {
5561        literal_added = dst_len.saturating_sub(src_copied);
5562    }
5563    let src_removed = (src.len() as u64).saturating_sub(src_copied);
5564    let merge_score = src_removed * DIFF_MAX_SCORE / src.len() as u64;
5565    if merge_score > break_score {
5566        return true;
5567    }
5568    let delta_size = src_removed.saturating_add(literal_added);
5569    if delta_size * DIFF_MAX_SCORE / (max_size as u64) < break_score {
5570        return false;
5571    }
5572    let s = src.len() as u64;
5573    if (s * break_score < src_removed * DIFF_MAX_SCORE)
5574        && (literal_added * 20 < src_removed)
5575        && (literal_added * 20 < src_copied)
5576    {
5577        return false;
5578    }
5579    true
5580}
5581
5582// ── Helpers ─────────────────────────────────────────────────────────
5583
5584/// Flatten a tree object recursively into a sorted list of (path, mode, oid).
5585struct FlatEntry {
5586    path: String,
5587    mode: u32,
5588    oid: ObjectId,
5589}
5590
5591fn flatten_tree(odb: &Odb, tree_oid: &ObjectId, prefix: &str) -> Result<Vec<FlatEntry>> {
5592    let entries = read_tree(odb, tree_oid)?;
5593    let mut result = Vec::new();
5594
5595    for entry in entries {
5596        let name_str = String::from_utf8_lossy(&entry.name);
5597        let path = format_path(prefix, &name_str);
5598        if is_tree_mode(entry.mode) {
5599            let nested = flatten_tree(odb, &entry.oid, &path)?;
5600            result.extend(nested);
5601        } else {
5602            result.push(FlatEntry {
5603                path,
5604                mode: entry.mode,
5605                oid: entry.oid,
5606            });
5607        }
5608    }
5609
5610    Ok(result)
5611}
5612
5613/// Paths present in `HEAD`'s tree with mode and blob/commit OID (for status porcelain v2).
5614pub fn head_path_states(
5615    odb: &Odb,
5616    head_tree: Option<&ObjectId>,
5617) -> Result<std::collections::BTreeMap<String, (u32, ObjectId)>> {
5618    let mut m = std::collections::BTreeMap::new();
5619    let Some(t) = head_tree else {
5620        return Ok(m);
5621    };
5622    for fe in flatten_tree(odb, t, "")? {
5623        m.insert(fe.path, (fe.mode, fe.oid));
5624    }
5625    Ok(m)
5626}
5627
5628/// Whether a mode represents a tree (directory).
5629fn is_tree_mode(mode: u32) -> bool {
5630    mode == 0o040000
5631}
5632
5633/// Build a path with an optional prefix.
5634fn format_path(prefix: &str, name: &str) -> String {
5635    if prefix.is_empty() {
5636        name.to_owned()
5637    } else {
5638        format!("{prefix}/{name}")
5639    }
5640}
5641
5642/// Format a numeric mode as a zero-padded octal string.
5643pub fn format_mode(mode: u32) -> String {
5644    format!("{mode:06o}")
5645}
5646
5647/// Read the HEAD commit OID from a submodule checkout directory.
5648///
5649/// Returns `None` if the path is missing, not a submodule checkout, or has no resolvable HEAD.
5650#[must_use]
5651pub fn read_submodule_head_for_checkout(sub_dir: &Path) -> Option<ObjectId> {
5652    read_submodule_head(sub_dir)
5653}
5654
5655/// First line of a commit's message for `git diff --submodule=log` output.
5656///
5657/// Honors `encoding` in the commit object (Latin-1 vs UTF-8) using the same
5658/// rules as Git's submodule summary.
5659#[must_use]
5660pub fn submodule_commit_subject_line(c: &CommitData) -> String {
5661    let enc = c.encoding.as_deref().unwrap_or("UTF-8");
5662    let is_latin1 = enc.eq_ignore_ascii_case("ISO8859-1")
5663        || enc.eq_ignore_ascii_case("ISO-8859-1")
5664        || enc.eq_ignore_ascii_case("LATIN1")
5665        || enc.eq_ignore_ascii_case("ISO-8859-15");
5666    if let Some(raw) = c.raw_message.as_deref() {
5667        let line = raw.split(|b| *b == b'\n').next().unwrap_or(raw);
5668        if is_latin1 {
5669            return line
5670                .iter()
5671                .map(|&b| b as char)
5672                .collect::<String>()
5673                .trim()
5674                .to_owned();
5675        }
5676        return String::from_utf8_lossy(line).trim().to_string();
5677    }
5678    c.message.lines().next().unwrap_or("").trim().to_owned()
5679}
5680
5681/// True when `sub_dir` is an empty directory (or missing), i.e. the placeholder left by
5682/// `git apply --index` before `git submodule update`.
5683fn submodule_worktree_is_unpopulated_placeholder(sub_dir: &Path) -> bool {
5684    match fs::read_dir(sub_dir) {
5685        Ok(mut it) => it.next().is_none(),
5686        Err(e) if e.kind() == std::io::ErrorKind::NotFound => true,
5687        Err(_) => false,
5688    }
5689}
5690
5691fn read_submodule_head(sub_dir: &Path) -> Option<ObjectId> {
5692    read_submodule_head_oid(sub_dir)
5693}
5694
5695/// Resolve the embedded git directory for a submodule work tree (`sub_dir/.git`).
5696#[must_use]
5697pub fn submodule_embedded_git_dir(sub_dir: &Path) -> Option<PathBuf> {
5698    let gitfile = sub_dir.join(".git");
5699    if gitfile.is_file() {
5700        let content = fs::read_to_string(&gitfile).ok()?;
5701        let gitdir = content
5702            .lines()
5703            .find_map(|l| l.strip_prefix("gitdir: "))?
5704            .trim();
5705        Some(if Path::new(gitdir).is_absolute() {
5706            PathBuf::from(gitdir)
5707        } else {
5708            sub_dir.join(gitdir)
5709        })
5710    } else if gitfile.is_dir() {
5711        Some(gitfile)
5712    } else {
5713        None
5714    }
5715}
5716
5717/// Walk upward from `sub_dir` to find the nearest containing Git work tree.
5718fn find_superproject_git(sub_dir: &Path) -> Option<(PathBuf, PathBuf)> {
5719    let mut cur = sub_dir.parent()?;
5720    loop {
5721        let git_path = cur.join(".git");
5722        if git_path.exists() {
5723            let gd = if git_path.is_file() {
5724                let content = fs::read_to_string(&git_path).ok()?;
5725                let line = content
5726                    .lines()
5727                    .find_map(|l| l.strip_prefix("gitdir: "))?
5728                    .trim();
5729                if Path::new(line).is_absolute() {
5730                    PathBuf::from(line)
5731                } else {
5732                    cur.join(line)
5733                }
5734            } else {
5735                git_path
5736            };
5737            return Some((cur.to_path_buf(), gd));
5738        }
5739        cur = cur.parent()?;
5740    }
5741}
5742
5743/// Read the HEAD commit OID from a submodule working tree directory.
5744///
5745/// Handles both embedded `.git` directories and `gitdir:` gitfiles pointing at
5746/// `.git/modules/...` (or other locations). Returns `None` if the path is not
5747/// a checkout or has no resolvable HEAD.
5748pub fn read_submodule_head_oid(sub_dir: &Path) -> Option<ObjectId> {
5749    // Submodule `.git` may be a gitfile pointing at `.git/modules/<name>` in another superproject
5750    // after `cp -R`. Prefer the current superproject's module dir when present.
5751    let mut git_dir = submodule_embedded_git_dir(sub_dir)?;
5752    if let Some((super_wt, super_git_dir)) = find_superproject_git(sub_dir) {
5753        let rel = sub_dir.strip_prefix(&super_wt).ok()?;
5754        let rel_str = rel.to_string_lossy().replace('\\', "/");
5755        let local_mod = super_git_dir
5756            .join("modules")
5757            .join(rel_str.trim_start_matches('/'));
5758        if local_mod.join("HEAD").exists() {
5759            let sg = super_git_dir.canonicalize().unwrap_or(super_git_dir);
5760            let cur = git_dir.canonicalize().unwrap_or_else(|_| git_dir.clone());
5761            if !cur.starts_with(&sg) {
5762                git_dir = local_mod;
5763            }
5764        }
5765    }
5766    let head_content = fs::read_to_string(git_dir.join("HEAD")).ok()?;
5767    let head_trimmed = head_content.trim();
5768    if head_trimmed.starts_with("ref: ") {
5769        // Use the full ref resolver so packed-refs and worktrees match Git. If `HEAD` is a stale
5770        // symref (e.g. still `refs/heads/master` while only `main` exists), fall back like
5771        // `resolve_gitlink_ref` / `git add` on embedded repos (`t6437-submodule-merge`).
5772        match crate::refs::resolve_ref(&git_dir, "HEAD") {
5773            Ok(oid) => Some(oid),
5774            Err(_) => {
5775                let mut found = None;
5776                for branch in ["main", "master"] {
5777                    let p = git_dir.join("refs/heads").join(branch);
5778                    if let Ok(s) = fs::read_to_string(&p) {
5779                        if let Ok(o) = ObjectId::from_hex(s.trim()) {
5780                            found = Some(o);
5781                            break;
5782                        }
5783                    }
5784                }
5785                found
5786            }
5787        }
5788    } else {
5789        ObjectId::from_hex(head_trimmed).ok()
5790    }
5791}
5792
5793/// True when a populated submodule checkout is *broken*: its `HEAD` resolves to a commit OID, but
5794/// that commit object cannot be read from the submodule's own object database.
5795///
5796/// This mirrors Git's [`is_submodule_modified`], which shells out to `git status --porcelain=2`
5797/// inside the submodule; when the submodule's object store is corrupt (e.g. `rm -r .git/objects`),
5798/// that inner status fails and Git aborts the surrounding `status`/`diff`/`fetch`. We detect the
5799/// same condition in-process so the superproject operation can return a fatal error rather than
5800/// silently treating the submodule as clean (t5526 "fetching submodule into a broken repository").
5801///
5802/// Returns `false` when the submodule is not checked out, has no embedded git dir, or has an
5803/// unresolvable HEAD (those are handled separately by the unpopulated/placeholder logic).
5804#[must_use]
5805pub fn submodule_head_object_broken(sub_dir: &Path) -> bool {
5806    let Some(sub_git_dir) = submodule_embedded_git_dir(sub_dir) else {
5807        return false;
5808    };
5809    let Some(head_oid) = read_submodule_head_oid(sub_dir) else {
5810        return false;
5811    };
5812    let odb = Odb::new(&sub_git_dir.join("objects"));
5813    match odb.read(&head_oid) {
5814        // HEAD object present but not a commit would be a different kind of corruption; only the
5815        // missing-object case is the broken-repo scenario Git guards against here.
5816        Ok(obj) => parse_commit(&obj.data).is_err(),
5817        Err(_) => true,
5818    }
5819}
5820
5821/// True when a checked-out submodule at `rel_path` has modified or untracked content relative to
5822/// the gitlink `recorded_oid` stored in the superproject (used for `git diff <tree>` parity).
5823fn submodule_has_dirty_worktree_for_super_diff(
5824    super_worktree: &Path,
5825    rel_path: &str,
5826    recorded_oid: &ObjectId,
5827) -> bool {
5828    let flags = submodule_porcelain_flags(super_worktree, rel_path, *recorded_oid);
5829    flags.modified || flags.untracked
5830}
5831
5832/// Submodule dirty bits aligned with Git's `DIRTY_SUBMODULE_*` / porcelain v2 `S???` token.
5833#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
5834pub struct SubmodulePorcelainFlags {
5835    /// Submodule checkout HEAD differs from the gitlink OID recorded in the parent index.
5836    pub new_commits: bool,
5837    /// The submodule has its own staged or unstaged changes (`DIRTY_SUBMODULE_MODIFIED`).
5838    pub modified: bool,
5839    /// The submodule work tree contains paths not in its index (`DIRTY_SUBMODULE_UNTRACKED`).
5840    pub untracked: bool,
5841}
5842
5843/// Inspect a checked-out submodule at `rel_path` (relative to `super_worktree`) and return
5844/// flags used for `git status --porcelain=v2` submodule tokens.
5845///
5846/// `recorded_oid` is the gitlink OID stored in the **parent** index (stage 0). When the
5847/// submodule is not checked out or cannot be opened, returns [`Default::default()`].
5848pub fn submodule_porcelain_flags(
5849    super_worktree: &Path,
5850    rel_path: &str,
5851    recorded_oid: ObjectId,
5852) -> SubmodulePorcelainFlags {
5853    let sub_dir = super_worktree.join(rel_path);
5854    let Some(sub_git_dir) = submodule_embedded_git_dir(&sub_dir) else {
5855        return SubmodulePorcelainFlags::default();
5856    };
5857    let Some(sub_head) = read_submodule_head_oid(&sub_dir) else {
5858        return SubmodulePorcelainFlags::default();
5859    };
5860
5861    let new_commits = sub_head != recorded_oid;
5862
5863    let index_path = sub_git_dir.join("index");
5864    let sub_index = match crate::index::Index::load(&index_path) {
5865        Ok(ix) => ix,
5866        Err(_) => {
5867            return SubmodulePorcelainFlags {
5868                new_commits,
5869                ..Default::default()
5870            }
5871        }
5872    };
5873
5874    let tracked: std::collections::BTreeSet<String> = sub_index
5875        .entries
5876        .iter()
5877        .filter(|e| e.stage() == 0)
5878        .map(|e| String::from_utf8_lossy(&e.path).into_owned())
5879        .collect();
5880    let untracked = submodule_dir_has_untracked_inner(&sub_dir, &sub_dir, &tracked, &sub_index);
5881
5882    let objects_dir = sub_git_dir.join("objects");
5883    let odb = Odb::new(&objects_dir);
5884
5885    let sub_head_tree = (|| -> Option<ObjectId> {
5886        let h = fs::read_to_string(sub_git_dir.join("HEAD")).ok()?;
5887        let h_str = h.trim();
5888        let commit_oid = if let Some(r) = h_str.strip_prefix("ref: ") {
5889            let oid_hex = fs::read_to_string(sub_git_dir.join(r)).ok()?;
5890            ObjectId::from_hex(oid_hex.trim()).ok()?
5891        } else {
5892            ObjectId::from_hex(h_str).ok()?
5893        };
5894        let obj = odb.read(&commit_oid).ok()?;
5895        let commit = parse_commit(&obj.data).ok()?;
5896        Some(commit.tree)
5897    })();
5898
5899    let staged_dirty = sub_head_tree
5900        .as_ref()
5901        .map(|t| diff_index_to_tree(&odb, &sub_index, Some(t), false).map(|v| !v.is_empty()))
5902        .unwrap_or(Ok(false));
5903    let staged_dirty = staged_dirty.unwrap_or(false);
5904
5905    let unstaged_dirty = diff_index_to_worktree(&odb, &sub_index, &sub_dir, false, true)
5906        .map(|v| !v.is_empty())
5907        .unwrap_or(false);
5908
5909    let mut modified = staged_dirty || unstaged_dirty;
5910
5911    // Nested submodule has its own index: OR `modified` from immediate gitlink children so a
5912    // dirty nested checkout (e.g. staged `file` under `sub1/sub2`) marks the parent gitlink as
5913    // modified in the superproject (t7506). Do **not** OR `untracked` — untracked-only inside a
5914    // nested submodule must stay `S..U` on the parent, not `S.U` / `S.M.`.
5915    for e in &sub_index.entries {
5916        if e.stage() != 0 || e.mode != 0o160000 {
5917            continue;
5918        }
5919        let child = String::from_utf8_lossy(&e.path).into_owned();
5920        let full_rel = if rel_path.is_empty() {
5921            child
5922        } else {
5923            format!("{rel_path}/{child}")
5924        };
5925        let nested = submodule_porcelain_flags(super_worktree, &full_rel, e.oid);
5926        modified |= nested.modified;
5927    }
5928
5929    SubmodulePorcelainFlags {
5930        new_commits,
5931        modified,
5932        untracked,
5933    }
5934}
5935
5936fn submodule_dir_has_untracked_inner(
5937    dir: &Path,
5938    root: &Path,
5939    tracked: &std::collections::BTreeSet<String>,
5940    owning_index: &Index,
5941) -> bool {
5942    let entries = match fs::read_dir(dir) {
5943        Ok(e) => e,
5944        Err(_) => return false,
5945    };
5946    let mut sorted: Vec<_> = entries.filter_map(|e| e.ok()).collect();
5947    sorted.sort_by_key(|e| e.file_name());
5948
5949    for entry in sorted {
5950        let name = entry.file_name().to_string_lossy().to_string();
5951        if name == ".git" {
5952            continue;
5953        }
5954        let path = entry.path();
5955        let rel = path
5956            .strip_prefix(root)
5957            .map(|p| p.to_string_lossy().to_string())
5958            .unwrap_or_else(|_| name.clone());
5959
5960        let is_dir = entry.file_type().map(|ft| ft.is_dir()).unwrap_or(false);
5961        if is_dir {
5962            let is_gitlink = owning_index
5963                .get(rel.as_bytes(), 0)
5964                .is_some_and(|e| e.mode == 0o160000);
5965            if is_gitlink {
5966                let Some(nested_git) = submodule_embedded_git_dir(&path) else {
5967                    continue;
5968                };
5969                let nested_index_path = nested_git.join("index");
5970                let Ok(nested_ix) = crate::index::Index::load(&nested_index_path) else {
5971                    continue;
5972                };
5973                let nested_tracked: std::collections::BTreeSet<String> = nested_ix
5974                    .entries
5975                    .iter()
5976                    .filter(|e| e.stage() == 0)
5977                    .map(|e| String::from_utf8_lossy(&e.path).into_owned())
5978                    .collect();
5979                if submodule_dir_has_untracked_inner(&path, &path, &nested_tracked, &nested_ix) {
5980                    return true;
5981                }
5982            } else if submodule_dir_has_untracked_inner(&path, root, tracked, owning_index) {
5983                return true;
5984            }
5985        } else if !tracked.contains(&rel) {
5986            return true;
5987        }
5988    }
5989    false
5990}
5991
5992/// Reorder diff entries by a `-O<orderfile>` (`diff.orderFile`): entries are sorted by the index of
5993/// the first orderfile glob pattern that matches their path; unmatched entries sort last (stable).
5994pub fn apply_orderfile_entries(
5995    entries: Vec<DiffEntry>,
5996    order_path: &str,
5997    cwd: &Path,
5998) -> Result<Vec<DiffEntry>> {
5999    apply_orderfile(entries, order_path, cwd)
6000}
6001
6002fn apply_orderfile(
6003    mut entries: Vec<DiffEntry>,
6004    order_path: &str,
6005    cwd: &Path,
6006) -> Result<Vec<DiffEntry>> {
6007    let patterns = read_orderfile_patterns(order_path, cwd)?;
6008    let sort_key = |entry: &DiffEntry| -> usize {
6009        let path = entry
6010            .new_path
6011            .as_ref()
6012            .or(entry.old_path.as_ref())
6013            .cloned()
6014            .unwrap_or_default();
6015        for (i, pat) in patterns.iter().enumerate() {
6016            if orderfile_pattern_matches(pat, &path) {
6017                return i;
6018            }
6019        }
6020        patterns.len()
6021    };
6022    entries.sort_by_key(|e| sort_key(e));
6023    Ok(entries)
6024}
6025
6026/// Read non-empty, non-comment glob patterns (one per line) from a `-O<orderfile>` file.
6027pub fn read_orderfile_patterns(order_path: &str, cwd: &Path) -> Result<Vec<String>> {
6028    let path = Path::new(order_path);
6029    let resolved = if path.is_absolute() {
6030        path.to_path_buf()
6031    } else {
6032        cwd.join(path)
6033    };
6034    let _meta = std::fs::metadata(&resolved)
6035        .map_err(|e| Error::Message(format!("could not read orderfile {order_path}: {e}")))?;
6036    let mut f = std::fs::File::open(&resolved)
6037        .map_err(|e| Error::Message(format!("could not read orderfile {order_path}: {e}")))?;
6038    let mut content = String::new();
6039    std::io::Read::read_to_string(&mut f, &mut content)
6040        .map_err(|e| Error::Message(format!("could not read orderfile {order_path}: {e}")))?;
6041    Ok(content
6042        .lines()
6043        .map(|l| l.trim().to_string())
6044        .filter(|l| !l.is_empty() && !l.starts_with('#'))
6045        .collect())
6046}
6047
6048/// Reorder diff entries for `git diff` `--rotate-to` / `--skip-to` (changed paths only).
6049pub fn apply_rotate_skip_entries(
6050    mut entries: Vec<DiffEntry>,
6051    rotate_to: Option<&str>,
6052    skip_to: Option<&str>,
6053) -> Result<Vec<DiffEntry>> {
6054    let Some(needle) = rotate_to.or(skip_to) else {
6055        return Ok(entries);
6056    };
6057    let needle = needle.trim();
6058    if needle.is_empty() {
6059        return Ok(entries);
6060    }
6061    let idx = entries
6062        .iter()
6063        .position(|e| e.path() == needle)
6064        .ok_or_else(|| Error::Message(format!("fatal: No such path '{needle}' in the diff")))?;
6065    if rotate_to.is_some() {
6066        entries.rotate_left(idx);
6067    }
6068    if let Some(skip) = skip_to.filter(|s| !s.trim().is_empty()) {
6069        let pos = entries
6070            .iter()
6071            .position(|e| e.path() == skip)
6072            .ok_or_else(|| Error::Message(format!("fatal: No such path '{skip}' in the diff")))?;
6073        entries.drain(..pos);
6074    }
6075    Ok(entries)
6076}
6077
6078/// `git log` rotate/skip: reorder using the **commit tree** path order (all blobs), then keep only
6079/// paths present in `entries` — matches Git's `diff --rotate-to` with history walks.
6080pub fn apply_rotate_skip_log_entries(
6081    odb: &Odb,
6082    commit_tree: &ObjectId,
6083    entries: Vec<DiffEntry>,
6084    rotate_to: Option<&str>,
6085    skip_to: Option<&str>,
6086) -> Result<Vec<DiffEntry>> {
6087    // Without --rotate-to/--skip-to this is a no-op (the ordered-paths helper
6088    // returns the entries untouched), so skip the full-tree walk that
6089    // `all_blob_paths_in_tree_order` performs — per displayed commit it read
6090    // every subtree of the commit's tree just to discard the result.
6091    fn trimmed(s: Option<&str>) -> Option<&str> {
6092        s.map(str::trim).filter(|t| !t.is_empty())
6093    }
6094    if trimmed(rotate_to).is_none() && trimmed(skip_to).is_none() {
6095        return Ok(entries);
6096    }
6097    let tree_paths = crate::merge_diff::all_blob_paths_in_tree_order(odb, commit_tree);
6098    apply_rotate_skip_ordered_paths(&tree_paths, entries, rotate_to, skip_to)
6099}
6100
6101fn apply_rotate_skip_ordered_paths(
6102    tree_paths: &[String],
6103    entries: Vec<DiffEntry>,
6104    rotate_to: Option<&str>,
6105    skip_to: Option<&str>,
6106) -> Result<Vec<DiffEntry>> {
6107    let rotate = rotate_to.and_then(|s| {
6108        let t = s.trim();
6109        (!t.is_empty()).then_some(t)
6110    });
6111    let skip = skip_to.and_then(|s| {
6112        let t = s.trim();
6113        (!t.is_empty()).then_some(t)
6114    });
6115    if rotate.is_none() && skip.is_none() {
6116        return Ok(entries);
6117    }
6118
6119    use std::collections::HashMap;
6120    let mut by_path: HashMap<String, DiffEntry> = HashMap::new();
6121    for e in entries {
6122        by_path.insert(e.path().to_string(), e);
6123    }
6124
6125    // `git log --skip-to`: only list changed paths from the skip point onward (unmodified paths
6126    // in the tree-order suffix are omitted). `--rotate-to` still lists every changed file in order.
6127    if rotate.is_none() {
6128        let Some(skip_path) = skip else {
6129            return Ok(by_path.into_values().collect());
6130        };
6131        let idx = tree_paths
6132            .iter()
6133            .position(|p| p == skip_path)
6134            .ok_or_else(|| {
6135                Error::Message(format!("fatal: No such path '{skip_path}' in the diff"))
6136            })?;
6137        let mut out = Vec::new();
6138        for p in tree_paths.iter().skip(idx) {
6139            if let Some(e) = by_path.remove(p) {
6140                out.push(e);
6141            }
6142        }
6143        return Ok(out);
6144    }
6145
6146    let Some(needle) = rotate else {
6147        return Ok(by_path.into_values().collect());
6148    };
6149    let idx = tree_paths
6150        .iter()
6151        .position(|p| p == needle)
6152        .ok_or_else(|| Error::Message(format!("fatal: No such path '{needle}' in the diff")))?;
6153    let mut order: Vec<String> = tree_paths.to_vec();
6154    order.rotate_left(idx);
6155    if let Some(skip_path) = skip {
6156        let pos = order.iter().position(|p| p == skip_path).ok_or_else(|| {
6157            Error::Message(format!("fatal: No such path '{skip_path}' in the diff"))
6158        })?;
6159        order.drain(..pos);
6160    }
6161    let mut out = Vec::new();
6162    for p in order {
6163        if let Some(e) = by_path.remove(&p) {
6164            out.push(e);
6165        }
6166    }
6167    Ok(out)
6168}
6169
6170/// Check if an orderfile pattern matches a path (matches the basename or the full path).
6171/// Supports basic glob patterns: `*` matches any sequence, `?` matches one char.
6172pub fn orderfile_pattern_matches(pattern: &str, path: &str) -> bool {
6173    let name = path.rsplit('/').next().unwrap_or(path);
6174    orderfile_glob_match(pattern, name) || orderfile_glob_match(pattern, path)
6175}
6176
6177/// Basic glob matching (supports `*` and `?`).
6178fn orderfile_glob_match(pattern: &str, text: &str) -> bool {
6179    let mut pi = 0;
6180    let mut ti = 0;
6181    let pb = pattern.as_bytes();
6182    let tb = text.as_bytes();
6183    let mut star_pi = usize::MAX;
6184    let mut star_ti = 0;
6185
6186    while ti < tb.len() {
6187        if pi < pb.len() && (pb[pi] == b'?' || pb[pi] == tb[ti]) {
6188            pi += 1;
6189            ti += 1;
6190        } else if pi < pb.len() && pb[pi] == b'*' {
6191            star_pi = pi;
6192            star_ti = ti;
6193            pi += 1;
6194        } else if star_pi != usize::MAX {
6195            pi = star_pi + 1;
6196            star_ti += 1;
6197            ti = star_ti;
6198        } else {
6199            return false;
6200        }
6201    }
6202    while pi < pb.len() && pb[pi] == b'*' {
6203        pi += 1;
6204    }
6205    pi == pb.len()
6206}
grit_lib/diff.rs

grit_lib/
diff.rs