grit_lib/
diff.rs

1//! Diff machinery — compare trees, index entries, and working tree files.
2//!
3//! # Overview
4//!
5//! This module provides the core diffing infrastructure shared by `diff`,
6//! `diff-index`, `status`, `log`, `show`, `commit`, and `merge`.
7//!
8//! ## Levels of comparison
9//!
10//! 1. **Tree-to-tree** — compare two tree objects (e.g. for `log`/`show`).
11//! 2. **Tree-to-index** — compare a tree (usually HEAD) against the index
12//!    (staged changes, used by `diff --cached` and `status`).
13//! 3. **Index-to-worktree** — compare index against the working directory
14//!    (unstaged changes, used by `diff` and `status`).
15//!
16//! ## Content diff
17//!
18//! Line-level diffing uses the `similar` crate (Myers, patience, minimal) and,
19//! for Git's `histogram` algorithm, `imara-diff` for output compatible with upstream Git.
20//! Output formats: unified patch, raw (`:old-mode new-mode ...`), stat,
21//! numstat.
22
23use std::fs;
24use std::os::unix::fs::MetadataExt;
25use std::path::{Path, PathBuf};
26
27use crate::config::ConfigSet;
28use crate::diff_indent_heuristic;
29use crate::error::{Error, Result};
30use crate::index::{Index, IndexEntry};
31use crate::objects::{parse_commit, parse_tree, CommitData, ObjectId, ObjectKind, TreeEntry};
32use crate::odb::Odb;
33use crate::userdiff::FuncnameMatcher;
34
35/// Splits imara-diff unified body (concatenated hunks) into per-hunk slices for post-processing.
36fn imara_unified_hunk_slices(body: &str) -> Vec<&str> {
37    let mut starts: Vec<usize> = Vec::new();
38    if body.starts_with("@@") {
39        starts.push(0);
40    }
41    for (idx, _) in body.match_indices("\n@@ ") {
42        starts.push(idx + 1);
43    }
44    starts.push(body.len());
45    starts.windows(2).map(|w| &body[w[0]..w[1]]).collect()
46}
47
48/// Effective context length for imara's hunk merger so it approximates Git's
49/// `2 * U + inter_hunk_context` unchanged-line merge threshold.
50fn imara_context_len_for_git(context_lines: usize, inter_hunk_context: usize) -> u32 {
51    (2usize
52        .saturating_mul(context_lines)
53        .saturating_add(inter_hunk_context))
54    .div_ceil(2)
55    .min(u32::MAX as usize) as u32
56}
57
58fn histogram_unified_body_raw(
59    old_content: &str,
60    new_content: &str,
61    context_lines: usize,
62    inter_hunk_context: usize,
63) -> String {
64    use imara_diff::{Algorithm, BasicLineDiffPrinter, Diff, InternedInput, UnifiedDiffConfig};
65
66    let input = InternedInput::new(old_content, new_content);
67    let mut diff = Diff::compute(Algorithm::Histogram, &input);
68    diff.postprocess_lines(&input);
69    let mut config = UnifiedDiffConfig::default();
70    config.context_len(imara_context_len_for_git(context_lines, inter_hunk_context));
71    let printer = BasicLineDiffPrinter(&input.interner);
72    diff.unified_diff(&printer, config, &input).to_string()
73}
74
75/// Unified diff hunks for Git's histogram algorithm (no `---` / `+++` lines).
76///
77/// Used by `--no-index` when whitespace normalization is off so the patch matches upstream Git.
78#[must_use]
79pub fn unified_diff_histogram_hunks_only(
80    old_content: &str,
81    new_content: &str,
82    context_lines: usize,
83    inter_hunk_context: usize,
84) -> String {
85    histogram_unified_body_raw(old_content, new_content, context_lines, inter_hunk_context)
86}
87
88/// Full unified diff (`---` / `+++` / hunks) using Git's histogram algorithm.
89#[must_use]
90pub fn unified_diff_histogram_with_prefix_and_funcname(
91    old_content: &str,
92    new_content: &str,
93    old_path: &str,
94    new_path: &str,
95    context_lines: usize,
96    inter_hunk_context: usize,
97    src_prefix: &str,
98    dst_prefix: &str,
99    funcname_matcher: Option<&FuncnameMatcher>,
100    quote_path_fully: bool,
101) -> String {
102    use crate::quote_path::format_diff_path_with_prefix;
103
104    let body =
105        histogram_unified_body_raw(old_content, new_content, context_lines, inter_hunk_context);
106
107    let mut output = String::new();
108    if old_path == "/dev/null" {
109        output.push_str("--- /dev/null\n");
110    } else if src_prefix.is_empty() {
111        output.push_str(&format!("--- {old_path}\n"));
112    } else {
113        output.push_str("--- ");
114        output.push_str(&format_diff_path_with_prefix(
115            src_prefix,
116            old_path,
117            quote_path_fully,
118        ));
119        output.push('\n');
120    }
121    if new_path == "/dev/null" {
122        output.push_str("+++ /dev/null\n");
123    } else if dst_prefix.is_empty() {
124        output.push_str(&format!("+++ {new_path}\n"));
125    } else {
126        output.push_str("+++ ");
127        output.push_str(&format_diff_path_with_prefix(
128            dst_prefix,
129            new_path,
130            quote_path_fully,
131        ));
132        output.push('\n');
133    }
134
135    let old_lines: Vec<&str> = old_content.lines().collect();
136    for hunk_str in imara_unified_hunk_slices(&body) {
137        if hunk_str.is_empty() {
138            continue;
139        }
140        if let Some(first_newline) = hunk_str.find('\n') {
141            let header_line = &hunk_str[..first_newline];
142            let rest = &hunk_str[first_newline..];
143            if let Some(func_ctx) =
144                extract_function_context(header_line, &old_lines, funcname_matcher)
145            {
146                output.push_str(header_line);
147                output.push(' ');
148                output.push_str(&func_ctx);
149                output.push_str(rest);
150            } else {
151                output.push_str(hunk_str);
152            }
153        } else {
154            output.push_str(hunk_str);
155        }
156    }
157
158    output
159}
160
161/// `diff.indentHeuristic` from config (Git defaults to true when unset).
162#[must_use]
163pub fn indent_heuristic_from_config(config: &ConfigSet) -> bool {
164    match config.get_bool("diff.indentHeuristic") {
165        Some(Ok(b)) => b,
166        Some(Err(_)) | None => true,
167    }
168}
169
170/// Resolve indent heuristic: `--no-indent-heuristic` and `--indent-heuristic` override config.
171#[must_use]
172pub fn resolve_indent_heuristic(
173    config: &ConfigSet,
174    cli_indent_heuristic: bool,
175    cli_no_indent_heuristic: bool,
176) -> bool {
177    if cli_no_indent_heuristic {
178        false
179    } else if cli_indent_heuristic {
180        true
181    } else {
182        indent_heuristic_from_config(config)
183    }
184}
185
186/// Parse `--indent-heuristic` / `--no-indent-heuristic` from a plumbing argv slice (last occurrence wins).
187#[must_use]
188pub fn parse_indent_heuristic_cli_flags(argv: &[String]) -> (bool, bool) {
189    let mut indent_heuristic = false;
190    let mut no_indent_heuristic = false;
191    for a in argv {
192        match a.as_str() {
193            "--indent-heuristic" => {
194                indent_heuristic = true;
195                no_indent_heuristic = false;
196            }
197            "--no-indent-heuristic" => {
198                no_indent_heuristic = true;
199                indent_heuristic = false;
200            }
201            _ => {}
202        }
203    }
204    (indent_heuristic, no_indent_heuristic)
205}
206
207/// Line-diff ops for string slices after Git `xdl_change_compact` (and optional indent heuristic).
208#[must_use]
209pub fn diff_slice_ops_compacted(
210    old_lines: &[&str],
211    new_lines: &[&str],
212    algorithm: similar::Algorithm,
213    indent_heuristic: bool,
214) -> Vec<similar::DiffOp> {
215    diff_indent_heuristic::diff_slice_ops_compacted(
216        old_lines,
217        new_lines,
218        algorithm,
219        indent_heuristic,
220    )
221}
222
223/// Map each line in `new_joined` to its origin in `old_joined` after Git-style compaction (for blame).
224#[must_use]
225pub fn map_new_to_old_lines_compacted(
226    old_joined: &str,
227    new_joined: &str,
228    algorithm: similar::Algorithm,
229    indent_heuristic: bool,
230    new_line_count: usize,
231) -> Vec<Option<usize>> {
232    let ops = diff_indent_heuristic::diff_lines_ops_compacted(
233        old_joined,
234        new_joined,
235        algorithm,
236        indent_heuristic,
237    );
238    diff_indent_heuristic::map_new_to_old_from_ops(&ops, new_line_count)
239}
240
241/// The kind of change between two sides of a diff.
242#[derive(Debug, Clone, Copy, PartialEq, Eq)]
243pub enum DiffStatus {
244    /// File was added.
245    Added,
246    /// File was deleted.
247    Deleted,
248    /// File was modified (content or mode change).
249    Modified,
250    /// File was renamed (with optional content change).
251    Renamed,
252    /// File was copied.
253    Copied,
254    /// File type changed (e.g. regular → symlink).
255    TypeChanged,
256    /// Unmerged (conflict).
257    Unmerged,
258}
259
260impl DiffStatus {
261    /// Single-character status letter used in raw diff output.
262    #[must_use]
263    pub fn letter(&self) -> char {
264        match self {
265            Self::Added => 'A',
266            Self::Deleted => 'D',
267            Self::Modified => 'M',
268            Self::Renamed => 'R',
269            Self::Copied => 'C',
270            Self::TypeChanged => 'T',
271            Self::Unmerged => 'U',
272        }
273    }
274}
275
276/// A single diff entry representing one changed path.
277#[derive(Debug, Clone, PartialEq, Eq)]
278pub struct DiffEntry {
279    /// The status of this change.
280    pub status: DiffStatus,
281    /// Path in the "old" side (None for Added).
282    pub old_path: Option<String>,
283    /// Path in the "new" side (None for Deleted).
284    pub new_path: Option<String>,
285    /// Old file mode (as octal string, e.g. "100644").
286    pub old_mode: String,
287    /// New file mode.
288    pub new_mode: String,
289    /// Old object ID (zero OID for Added).
290    pub old_oid: ObjectId,
291    /// New object ID (zero OID for Deleted).
292    pub new_oid: ObjectId,
293    /// Similarity score (0–100) for renames/copies.
294    pub score: Option<u32>,
295}
296
297impl DiffEntry {
298    /// The primary path for display (new_path for adds, old_path for deletes).
299    #[must_use]
300    pub fn path(&self) -> &str {
301        self.new_path
302            .as_deref()
303            .or(self.old_path.as_deref())
304            .unwrap_or("")
305    }
306
307    /// Return a human-oriented path display for this entry.
308    ///
309    /// For renames and copies this returns `old -> new`; for all other entry
310    /// kinds this returns the primary path.
311    #[must_use]
312    pub fn display_path(&self) -> String {
313        match self.status {
314            DiffStatus::Renamed | DiffStatus::Copied => {
315                let old = self.old_path.as_deref().unwrap_or("");
316                let new = self.new_path.as_deref().unwrap_or("");
317                if old.is_empty() || new.is_empty() {
318                    self.path().to_owned()
319                } else {
320                    format!("{old} -> {new}")
321                }
322            }
323            _ => self.path().to_owned(),
324        }
325    }
326}
327
328/// The zero (null) object ID used for "no object" in diff output.
329pub const ZERO_OID: &str = "0000000000000000000000000000000000000000";
330
331/// Return the zero ObjectId.
332#[must_use]
333pub fn zero_oid() -> ObjectId {
334    ObjectId::from_bytes(&[0u8; 20]).unwrap_or_else(|_| {
335        // This should never fail since we pass exactly 20 bytes
336        panic!("internal error: failed to create zero OID");
337    })
338}
339
340/// Return the ObjectId for the empty blob object.
341#[must_use]
342pub fn empty_blob_oid() -> ObjectId {
343    ObjectId::from_hex("e69de29bb2d1d6434b8b29ae775ad8c2e48c5391").unwrap_or_else(|_| {
344        // This should never fail since the object ID literal is valid.
345        panic!("internal error: failed to create empty blob OID");
346    })
347}
348
349// ── Tree-to-tree diff ───────────────────────────────────────────────
350
351/// Compare two trees and return the list of changed entries.
352///
353/// # Parameters
354///
355/// - `odb` — object database to read tree objects from.
356/// - `old_tree_oid` — OID of the old tree (or `None` for comparison against empty).
357/// - `new_tree_oid` — OID of the new tree (or `None` for comparison against empty).
358/// - `prefix` — path prefix for nested tree recursion (empty string for root).
359///
360/// # Errors
361///
362/// Returns errors from object database reads.
363pub fn diff_trees(
364    odb: &Odb,
365    old_tree_oid: Option<&ObjectId>,
366    new_tree_oid: Option<&ObjectId>,
367    prefix: &str,
368) -> Result<Vec<DiffEntry>> {
369    diff_trees_opts(odb, old_tree_oid, new_tree_oid, prefix, false)
370}
371
372/// Like `diff_trees` but with `show_trees` flag: when true, emit entries for
373/// tree objects themselves in addition to their recursive contents (the `-t`
374/// flag of `diff-tree`).
375pub fn diff_trees_show_tree_entries(
376    odb: &Odb,
377    old_tree_oid: Option<&ObjectId>,
378    new_tree_oid: Option<&ObjectId>,
379    prefix: &str,
380) -> Result<Vec<DiffEntry>> {
381    diff_trees_opts(odb, old_tree_oid, new_tree_oid, prefix, true)
382}
383
384fn diff_trees_opts(
385    odb: &Odb,
386    old_tree_oid: Option<&ObjectId>,
387    new_tree_oid: Option<&ObjectId>,
388    prefix: &str,
389    show_trees: bool,
390) -> Result<Vec<DiffEntry>> {
391    let old_entries = match old_tree_oid {
392        Some(oid) => read_tree(odb, oid)?,
393        None => Vec::new(),
394    };
395    let new_entries = match new_tree_oid {
396        Some(oid) => read_tree(odb, oid)?,
397        None => Vec::new(),
398    };
399
400    let mut result = Vec::new();
401    diff_tree_entries_opts(
402        odb,
403        &old_entries,
404        &new_entries,
405        prefix,
406        show_trees,
407        &mut result,
408    )?;
409    Ok(result)
410}
411
412/// Read and parse a tree object from the ODB.
413fn read_tree(odb: &Odb, oid: &ObjectId) -> Result<Vec<TreeEntry>> {
414    let obj = odb.read(oid)?;
415    if obj.kind != ObjectKind::Tree {
416        return Err(Error::CorruptObject(format!(
417            "expected tree, got {}",
418            obj.kind.as_str()
419        )));
420    }
421    parse_tree(&obj.data)
422}
423
424/// Compare two sorted lists of tree entries, recursing into subtrees.
425fn diff_tree_entries_opts(
426    odb: &Odb,
427    old: &[TreeEntry],
428    new: &[TreeEntry],
429    prefix: &str,
430    show_trees: bool,
431    result: &mut Vec<DiffEntry>,
432) -> Result<()> {
433    let mut oi = 0;
434    let mut ni = 0;
435
436    while oi < old.len() || ni < new.len() {
437        match (old.get(oi), new.get(ni)) {
438            (Some(o), Some(n)) => {
439                let cmp = crate::objects::tree_entry_cmp(
440                    &o.name,
441                    is_tree_mode(o.mode),
442                    &n.name,
443                    is_tree_mode(n.mode),
444                );
445                match cmp {
446                    std::cmp::Ordering::Less => {
447                        // Old entry not in new → deleted
448                        emit_deleted_opts(odb, o, prefix, show_trees, result)?;
449                        oi += 1;
450                    }
451                    std::cmp::Ordering::Greater => {
452                        // New entry not in old → added
453                        emit_added_opts(odb, n, prefix, show_trees, result)?;
454                        ni += 1;
455                    }
456                    std::cmp::Ordering::Equal => {
457                        // Both present — check for changes
458                        if o.oid != n.oid || o.mode != n.mode {
459                            let name_str = String::from_utf8_lossy(&o.name);
460                            let path = format_path(prefix, &name_str);
461                            if is_tree_mode(o.mode) && is_tree_mode(n.mode) {
462                                // Both are trees
463                                if show_trees {
464                                    result.push(DiffEntry {
465                                        status: DiffStatus::Modified,
466                                        old_path: Some(path.clone()),
467                                        new_path: Some(path.clone()),
468                                        old_mode: format_mode(o.mode),
469                                        new_mode: format_mode(n.mode),
470                                        old_oid: o.oid,
471                                        new_oid: n.oid,
472                                        score: None,
473                                    });
474                                }
475                                // Recurse
476                                let nested = diff_trees_opts(
477                                    odb,
478                                    Some(&o.oid),
479                                    Some(&n.oid),
480                                    &path,
481                                    show_trees,
482                                )?;
483                                result.extend(nested);
484                            } else if is_tree_mode(o.mode) && !is_tree_mode(n.mode) {
485                                // Tree → blob: delete tree contents, add blob
486                                emit_deleted_opts(odb, o, prefix, show_trees, result)?;
487                                emit_added_opts(odb, n, prefix, show_trees, result)?;
488                            } else if !is_tree_mode(o.mode) && is_tree_mode(n.mode) {
489                                // Blob → tree: delete blob, add tree contents
490                                emit_deleted_opts(odb, o, prefix, show_trees, result)?;
491                                emit_added_opts(odb, n, prefix, show_trees, result)?;
492                            } else {
493                                // Both blobs — modified.
494                                // A mode-only change (e.g. chmod) is Modified.
495                                // TypeChanged is only for actual type changes (blob ↔ symlink).
496                                let old_type = o.mode & 0o170000;
497                                let new_type = n.mode & 0o170000;
498                                result.push(DiffEntry {
499                                    status: if old_type != new_type {
500                                        DiffStatus::TypeChanged
501                                    } else {
502                                        DiffStatus::Modified
503                                    },
504                                    old_path: Some(path.clone()),
505                                    new_path: Some(path),
506                                    old_mode: format_mode(o.mode),
507                                    new_mode: format_mode(n.mode),
508                                    old_oid: o.oid,
509                                    new_oid: n.oid,
510                                    score: None,
511                                });
512                            }
513                        }
514                        oi += 1;
515                        ni += 1;
516                    }
517                }
518            }
519            (Some(o), None) => {
520                emit_deleted_opts(odb, o, prefix, show_trees, result)?;
521                oi += 1;
522            }
523            (None, Some(n)) => {
524                emit_added_opts(odb, n, prefix, show_trees, result)?;
525                ni += 1;
526            }
527            (None, None) => break,
528        }
529    }
530
531    Ok(())
532}
533
534fn emit_deleted_opts(
535    odb: &Odb,
536    entry: &TreeEntry,
537    prefix: &str,
538    show_trees: bool,
539    result: &mut Vec<DiffEntry>,
540) -> Result<()> {
541    let name_str = String::from_utf8_lossy(&entry.name);
542    let path = format_path(prefix, &name_str);
543    if is_tree_mode(entry.mode) {
544        if show_trees {
545            result.push(DiffEntry {
546                status: DiffStatus::Deleted,
547                old_path: Some(path.clone()),
548                new_path: None,
549                old_mode: format_mode(entry.mode),
550                new_mode: "000000".to_owned(),
551                old_oid: entry.oid,
552                new_oid: zero_oid(),
553                score: None,
554            });
555        }
556        // Recurse into deleted tree
557        let nested = diff_trees_opts(odb, Some(&entry.oid), None, &path, show_trees)?;
558        result.extend(nested);
559    } else {
560        result.push(DiffEntry {
561            status: DiffStatus::Deleted,
562            old_path: Some(path.clone()),
563            new_path: None,
564            old_mode: format_mode(entry.mode),
565            new_mode: "000000".to_owned(),
566            old_oid: entry.oid,
567            new_oid: zero_oid(),
568            score: None,
569        });
570    }
571    Ok(())
572}
573
574fn emit_added_opts(
575    odb: &Odb,
576    entry: &TreeEntry,
577    prefix: &str,
578    show_trees: bool,
579    result: &mut Vec<DiffEntry>,
580) -> Result<()> {
581    let name_str = String::from_utf8_lossy(&entry.name);
582    let path = format_path(prefix, &name_str);
583    if is_tree_mode(entry.mode) {
584        if show_trees {
585            result.push(DiffEntry {
586                status: DiffStatus::Added,
587                old_path: None,
588                new_path: Some(path.clone()),
589                old_mode: "000000".to_owned(),
590                new_mode: format_mode(entry.mode),
591                old_oid: zero_oid(),
592                new_oid: entry.oid,
593                score: None,
594            });
595        }
596        // Recurse into added tree
597        let nested = diff_trees_opts(odb, None, Some(&entry.oid), &path, show_trees)?;
598        result.extend(nested);
599    } else {
600        result.push(DiffEntry {
601            status: DiffStatus::Added,
602            old_path: None,
603            new_path: Some(path),
604            old_mode: "000000".to_owned(),
605            new_mode: format_mode(entry.mode),
606            old_oid: zero_oid(),
607            new_oid: entry.oid,
608            score: None,
609        });
610    }
611    Ok(())
612}
613
614// ── Index-to-tree diff (staged changes) ─────────────────────────────
615
616/// Compare the index against a tree (usually HEAD's tree).
617///
618/// This shows "staged" changes — what would be committed.
619///
620/// # Parameters
621///
622/// - `odb` — object database.
623/// - `index` — the current index.
624/// - `tree_oid` — the tree to compare against (e.g. HEAD's tree), or `None`
625///   for comparison against an empty tree (initial commit).
626///
627/// # Errors
628///
629/// Returns errors from ODB reads.
630///
631/// When `ignore_submodules` is true, gitlink (`160000`) paths are omitted from the diff, matching
632/// Git's `require_clean_work_tree(..., ignore_submodules=1)` used by `git rebase` / `git pull`.
633pub fn diff_index_to_tree(
634    odb: &Odb,
635    index: &Index,
636    tree_oid: Option<&ObjectId>,
637    ignore_submodules: bool,
638) -> Result<Vec<DiffEntry>> {
639    // Flatten the tree into a sorted list of (path, mode, oid)
640    let tree_entries = match tree_oid {
641        Some(oid) => flatten_tree(odb, oid, "")?,
642        None => Vec::new(),
643    };
644
645    // Build maps keyed by path
646    let mut tree_map: std::collections::BTreeMap<&str, &FlatEntry> =
647        std::collections::BTreeMap::new();
648    for entry in &tree_entries {
649        tree_map.insert(&entry.path, entry);
650    }
651
652    let mut result = Vec::new();
653    let mut stage0_paths = std::collections::BTreeSet::new();
654    let mut unmerged_modes: std::collections::BTreeMap<String, (u8, u32)> =
655        std::collections::BTreeMap::new();
656
657    // Check index entries against tree
658    for ie in &index.entries {
659        let path = String::from_utf8_lossy(&ie.path).to_string();
660        if ie.stage() == 0 && ie.intent_to_add() {
661            // Intent-to-add entries are not "staged" for diff-index / status
662            // (matches Git: `git diff --cached` is empty for `-N` paths).
663            continue;
664        }
665        if ie.stage() != 0 {
666            let rank = match ie.stage() {
667                2 => 0u8,
668                3 => 1u8,
669                1 => 2u8,
670                _ => 3u8,
671            };
672            match unmerged_modes.get(&path) {
673                Some((existing_rank, _)) if *existing_rank <= rank => {}
674                _ => {
675                    unmerged_modes.insert(path, (rank, ie.mode));
676                }
677            }
678            continue;
679        }
680        if ignore_submodules && ie.mode == 0o160000 {
681            let _ = tree_map.remove(path.as_str());
682            stage0_paths.insert(path.clone());
683            continue;
684        }
685        stage0_paths.insert(path.clone());
686        match tree_map.remove(path.as_str()) {
687            Some(te) => {
688                // Present in both — check for differences
689                if te.oid != ie.oid || te.mode != ie.mode {
690                    result.push(DiffEntry {
691                        status: DiffStatus::Modified,
692                        old_path: Some(path.clone()),
693                        new_path: Some(path),
694                        old_mode: format_mode(te.mode),
695                        new_mode: format_mode(ie.mode),
696                        old_oid: te.oid,
697                        new_oid: ie.oid,
698                        score: None,
699                    });
700                }
701            }
702            None => {
703                // In index but not tree → added
704                result.push(DiffEntry {
705                    status: DiffStatus::Added,
706                    old_path: None,
707                    new_path: Some(path),
708                    old_mode: "000000".to_owned(),
709                    new_mode: format_mode(ie.mode),
710                    old_oid: zero_oid(),
711                    new_oid: ie.oid,
712                    score: None,
713                });
714            }
715        }
716    }
717
718    for (path, (_, mode)) in &unmerged_modes {
719        if stage0_paths.contains(path) {
720            continue;
721        }
722        tree_map.remove(path.as_str());
723        result.push(DiffEntry {
724            status: DiffStatus::Unmerged,
725            old_path: Some(path.clone()),
726            new_path: Some(path.clone()),
727            old_mode: "000000".to_owned(),
728            new_mode: format_mode(*mode),
729            old_oid: zero_oid(),
730            new_oid: zero_oid(),
731            score: None,
732        });
733    }
734
735    // Remaining tree entries not in index → deleted
736    for (path, te) in tree_map {
737        if ignore_submodules && te.mode == 0o160000 {
738            continue;
739        }
740        result.push(DiffEntry {
741            status: DiffStatus::Deleted,
742            old_path: Some(path.to_owned()),
743            new_path: None,
744            old_mode: format_mode(te.mode),
745            new_mode: "000000".to_owned(),
746            old_oid: te.oid,
747            new_oid: zero_oid(),
748            score: None,
749        });
750    }
751
752    result.sort_by(|a, b| a.path().cmp(b.path()));
753    Ok(result)
754}
755
756// ── Index-to-worktree diff (unstaged changes) ───────────────────────
757
758/// Compare the index against the working tree.
759///
760/// This shows "unstaged" changes — modifications not yet staged.
761///
762/// Entries with [`IndexEntry::assume_unchanged`] or [`IndexEntry::skip_worktree`] are treated as
763/// matching the work tree without examining the filesystem (Git `CE_VALID` / skip-worktree).
764///
765/// # Parameters
766///
767/// - `odb` — object database (for hashing worktree files).
768/// - `index` — the current index.
769/// - `work_tree` — path to the working tree root.
770/// - `ignore_submodule_untracked` — when true, gitlink entries are not dirty solely from untracked
771///   files inside the submodule (matches `git status -uno`).
772/// - `simplify_gitlinks` — when true, nested gitlink entries only compare the submodule checkout
773///   HEAD to the recorded OID (ignore dirty work trees inside nested submodules). Used when
774///   computing `submodule_porcelain_flags` so untracked files under a nested submodule do not set
775///   the parent submodule's `modified` bit (Git `DIRTY_SUBMODULE_MODIFIED`; t7506).
776///
777/// # Errors
778///
779/// Returns errors from I/O or hashing.
780pub fn diff_index_to_worktree(
781    odb: &Odb,
782    index: &Index,
783    work_tree: &Path,
784    ignore_submodule_untracked: bool,
785    simplify_gitlinks: bool,
786) -> Result<Vec<DiffEntry>> {
787    diff_index_to_worktree_with_options(
788        odb,
789        index,
790        work_tree,
791        DiffIndexToWorktreeOptions {
792            ignore_submodule_untracked,
793            simplify_gitlinks,
794            ..DiffIndexToWorktreeOptions::default()
795        },
796    )
797}
798
799/// Additional inputs for [`diff_index_to_worktree_with_options`].
800#[derive(Debug, Clone, Copy, Default)]
801pub struct DiffIndexToWorktreeOptions {
802    /// Optional index mtime pair `(sec, nsec)` sampled when the index was read.
803    ///
804    /// When provided, entries with matching stat data are still considered dirty candidates if
805    /// their recorded mtime is "racy" (at or after this timestamp), matching Git's
806    /// `is_racy_timestamp` behavior.
807    pub index_mtime: Option<(u32, u32)>,
808    /// When true, gitlink entries are not dirty solely from untracked files inside the submodule.
809    pub ignore_submodule_untracked: bool,
810    /// When true, nested gitlink entries only compare the submodule checkout HEAD to the recorded OID.
811    pub simplify_gitlinks: bool,
812}
813
814/// Compare the index against the working tree with optional racy-timestamp context.
815///
816/// This variant enables a stat-trust fast path: if an entry's stat tuple matches and the mode is
817/// unchanged, the worktree blob hash is skipped unless the entry is racy relative to the supplied
818/// index mtime.
819///
820/// # Parameters
821///
822/// - `odb` — object database (for hashing worktree files).
823/// - `index` — the current index.
824/// - `work_tree` — path to the working tree root.
825/// - `options` — optional context for racy timestamp checks.
826///
827/// # Errors
828///
829/// Returns errors from I/O or hashing.
830pub fn diff_index_to_worktree_with_options(
831    odb: &Odb,
832    index: &Index,
833    work_tree: &Path,
834    options: DiffIndexToWorktreeOptions,
835) -> Result<Vec<DiffEntry>> {
836    use crate::config::ConfigSet;
837    use crate::crlf;
838
839    let ignore_submodule_untracked = options.ignore_submodule_untracked;
840    let simplify_gitlinks = options.simplify_gitlinks;
841
842    let git_dir = work_tree.join(".git");
843    let config = ConfigSet::load(Some(&git_dir), true).unwrap_or_else(|_| ConfigSet::new());
844    let conv = crlf::ConversionConfig::from_config(&config);
845    let attrs = crlf::load_gitattributes(work_tree);
846
847    let mut result = Vec::new();
848    let mut unmerged_base: std::collections::BTreeMap<String, (u8, &IndexEntry)> =
849        std::collections::BTreeMap::new();
850
851    for ie in &index.entries {
852        if ie.stage() != 0 {
853            let path = String::from_utf8_lossy(&ie.path).to_string();
854            let rank = match ie.stage() {
855                2 => 0u8,
856                3 => 1u8,
857                1 => 2u8,
858                _ => 3u8,
859            };
860            match unmerged_base.get(&path) {
861                Some((existing_rank, _)) if *existing_rank <= rank => {}
862                _ => {
863                    unmerged_base.insert(path, (rank, ie));
864                }
865            }
866            continue;
867        }
868        // Sparse checkout: paths outside the cone are not expected on disk; `assume_unchanged`
869        // is treated as clean without reading the filesystem (wt-status.c).
870        if ie.skip_worktree() || ie.assume_unchanged() {
871            continue;
872        }
873        // Use str slice directly to avoid allocation for path joining;
874        // only allocate String if we need it for DiffEntry output.
875        let path_str_ref = std::str::from_utf8(&ie.path).unwrap_or("");
876        let is_intent_to_add = ie.intent_to_add();
877
878        // Gitlink entries (submodules): Git's `diff-index` reports `M` when the recorded
879        // commit differs from the submodule checkout **or** when the submodule work tree is
880        // dirty (staged/unstaged/untracked) even if HEAD still matches the gitlink. For the
881        // latter case the "new" OID column is the null OID (see `git diff-index` / t7506).
882        if ie.mode == 0o160000 {
883            let sub_dir = work_tree.join(path_str_ref);
884            let sub_head_oid = read_submodule_head_oid(&sub_dir);
885            let ref_matches = match sub_head_oid {
886                Some(oid) => oid == ie.oid,
887                None => submodule_worktree_is_unpopulated_placeholder(&sub_dir),
888            };
889            if simplify_gitlinks {
890                if !ref_matches {
891                    let path_owned = path_str_ref.to_owned();
892                    let new_oid = sub_head_oid.unwrap_or_else(zero_oid);
893                    result.push(DiffEntry {
894                        status: DiffStatus::Modified,
895                        old_path: Some(path_owned.clone()),
896                        new_path: Some(path_owned),
897                        old_mode: format_mode(ie.mode),
898                        new_mode: format_mode(ie.mode),
899                        old_oid: ie.oid,
900                        new_oid,
901                        score: None,
902                    });
903                }
904                continue;
905            }
906            let mut flags = submodule_porcelain_flags(work_tree, path_str_ref, ie.oid);
907            if ignore_submodule_untracked {
908                flags.untracked = false;
909            }
910            let inner_dirty = flags.modified || flags.untracked;
911            if !ref_matches || inner_dirty {
912                let path_owned = path_str_ref.to_owned();
913                let new_oid = if !ref_matches {
914                    sub_head_oid.unwrap_or_else(zero_oid)
915                } else {
916                    zero_oid()
917                };
918                result.push(DiffEntry {
919                    status: DiffStatus::Modified,
920                    old_path: Some(path_owned.clone()),
921                    new_path: Some(path_owned),
922                    old_mode: format_mode(ie.mode),
923                    new_mode: format_mode(ie.mode),
924                    old_oid: ie.oid,
925                    new_oid,
926                    score: None,
927                });
928            }
929            continue;
930        }
931
932        let file_path = work_tree.join(path_str_ref);
933
934        if is_intent_to_add {
935            match fs::symlink_metadata(&file_path) {
936                Ok(meta) => {
937                    let file_attrs = crlf::get_file_attrs(&attrs, path_str_ref, false, &config);
938                    let worktree_oid = hash_worktree_file(
939                        odb,
940                        &file_path,
941                        &meta,
942                        &conv,
943                        &file_attrs,
944                        path_str_ref,
945                        None,
946                    )?;
947                    let worktree_mode = mode_from_metadata(&meta);
948                    result.push(DiffEntry {
949                        status: DiffStatus::Added,
950                        old_path: None,
951                        new_path: Some(path_str_ref.to_owned()),
952                        old_mode: "000000".to_owned(),
953                        new_mode: format_mode(worktree_mode),
954                        // `ita_invisible_in_index`: null OID on the index side for patch output
955                        // (`index 0000000..`, t2203); index entry still stores the empty blob.
956                        old_oid: zero_oid(),
957                        new_oid: worktree_oid,
958                        score: None,
959                    });
960                }
961                Err(e)
962                    if e.kind() == std::io::ErrorKind::NotFound
963                        || e.raw_os_error() == Some(20) /* ENOTDIR */ =>
964                {
965                    result.push(DiffEntry {
966                        status: DiffStatus::Deleted,
967                        old_path: Some(path_str_ref.to_owned()),
968                        new_path: None,
969                        old_mode: format_mode(ie.mode),
970                        new_mode: "000000".to_owned(),
971                        old_oid: ie.oid,
972                        new_oid: zero_oid(),
973                        score: None,
974                    });
975                }
976                Err(e) => return Err(Error::Io(e)),
977            }
978            continue;
979        }
980
981        // If any parent component of the path is a symlink, the file is effectively
982        // deleted from the working tree (a symlink replaced a directory).
983        if has_symlink_in_path(work_tree, path_str_ref) {
984            result.push(DiffEntry {
985                status: DiffStatus::Deleted,
986                old_path: Some(path_str_ref.to_owned()),
987                new_path: None,
988                old_mode: format_mode(ie.mode),
989                new_mode: "000000".to_owned(),
990                old_oid: ie.oid,
991                new_oid: zero_oid(),
992                score: None,
993            });
994            continue;
995        }
996
997        match fs::symlink_metadata(&file_path) {
998            Ok(meta) if meta.is_dir() => {
999                // A directory exists where the index expects a file.
1000                // Treat as a type change: the indexed file is effectively deleted.
1001                result.push(DiffEntry {
1002                    status: DiffStatus::Deleted,
1003                    old_path: Some(path_str_ref.to_owned()),
1004                    new_path: None,
1005                    old_mode: format_mode(ie.mode),
1006                    new_mode: String::new(),
1007                    old_oid: ie.oid,
1008                    new_oid: zero_oid(),
1009                    score: None,
1010                });
1011            }
1012            Ok(meta) => {
1013                let worktree_mode = mode_from_metadata(&meta);
1014                let stat_same = stat_matches(ie, &meta);
1015                // Mode-only change: stat still matches the index entry but executable bit differs.
1016                if stat_same && worktree_mode != ie.mode {
1017                    let path_owned = path_str_ref.to_owned();
1018                    result.push(DiffEntry {
1019                        status: DiffStatus::Modified,
1020                        old_path: Some(path_owned.clone()),
1021                        new_path: Some(path_owned),
1022                        old_mode: format_mode(ie.mode),
1023                        new_mode: format_mode(worktree_mode),
1024                        old_oid: ie.oid,
1025                        new_oid: ie.oid,
1026                        score: None,
1027                    });
1028                    continue;
1029                }
1030
1031                // Fast path: unchanged stat + unchanged mode + non-racy timestamp means this entry
1032                // is clean without re-hashing blob data.
1033                if stat_same && worktree_mode == ie.mode && !entry_is_racy(ie, options.index_mtime) {
1034                    continue;
1035                }
1036
1037                // Hash the worktree blob for uncertain/racy entries.
1038                let file_attrs = crlf::get_file_attrs(&attrs, path_str_ref, false, &config);
1039                let worktree_oid = hash_worktree_file(
1040                    odb,
1041                    &file_path,
1042                    &meta,
1043                    &conv,
1044                    &file_attrs,
1045                    path_str_ref,
1046                    Some(ie),
1047                )?;
1048
1049                // If clean conversion disagrees with the index but raw bytes match the
1050                // blob (e.g. mixed line endings committed with autocrlf off), Git reports
1051                // no diff (t0020: touch + git diff --exit-code).
1052                let mut eff_oid = worktree_oid;
1053                if eff_oid != ie.oid {
1054                    if let Ok(raw) = fs::read(&file_path) {
1055                        let raw_oid = Odb::hash_object_data(ObjectKind::Blob, &raw);
1056                        if raw_oid == ie.oid {
1057                            eff_oid = ie.oid;
1058                        }
1059                    }
1060                }
1061
1062                if eff_oid != ie.oid || worktree_mode != ie.mode {
1063                    let path_owned = path_str_ref.to_owned();
1064                    result.push(DiffEntry {
1065                        status: DiffStatus::Modified,
1066                        old_path: Some(path_owned.clone()),
1067                        new_path: Some(path_owned),
1068                        old_mode: format_mode(ie.mode),
1069                        new_mode: format_mode(worktree_mode),
1070                        old_oid: ie.oid,
1071                        new_oid: eff_oid,
1072                    score: None,
1073                    });
1074                }
1075            }
1076            Err(e) if e.kind() == std::io::ErrorKind::NotFound
1077                || e.raw_os_error() == Some(20) /* ENOTDIR */ => {
1078                // File deleted from working tree (or parent replaced by a file)
1079                result.push(DiffEntry {
1080                    status: DiffStatus::Deleted,
1081                    old_path: Some(path_str_ref.to_owned()),
1082                    new_path: None,
1083                    old_mode: format_mode(ie.mode),
1084                    new_mode: "000000".to_owned(),
1085                    old_oid: ie.oid,
1086                    new_oid: zero_oid(),
1087                    score: None,
1088                });
1089            }
1090            Err(e) => return Err(Error::Io(e)),
1091        }
1092    }
1093
1094    for (path, (_, base_entry)) in unmerged_base {
1095        let file_path = work_tree.join(&path);
1096        let wt_meta = match fs::symlink_metadata(&file_path) {
1097            Ok(meta) => Some(meta),
1098            Err(e)
1099                if e.kind() == std::io::ErrorKind::NotFound
1100                    || e.raw_os_error() == Some(20) /* ENOTDIR */ =>
1101            {
1102                None
1103            }
1104            Err(e) => return Err(Error::Io(e)),
1105        };
1106
1107        let new_mode = wt_meta.as_ref().map_or_else(
1108            || "000000".to_owned(),
1109            |meta| format_mode(mode_from_metadata(meta)),
1110        );
1111        result.push(DiffEntry {
1112            status: DiffStatus::Unmerged,
1113            old_path: Some(path.clone()),
1114            new_path: Some(path.clone()),
1115            old_mode: "000000".to_owned(),
1116            new_mode,
1117            old_oid: zero_oid(),
1118            new_oid: zero_oid(),
1119            score: None,
1120        });
1121
1122        if let Some(meta) = wt_meta {
1123            let file_attrs = crlf::get_file_attrs(&attrs, &path, false, &config);
1124            let wt_oid = hash_worktree_file(
1125                odb,
1126                &file_path,
1127                &meta,
1128                &conv,
1129                &file_attrs,
1130                &path,
1131                Some(base_entry),
1132            )?;
1133            let wt_mode = mode_from_metadata(&meta);
1134            if wt_oid != base_entry.oid || wt_mode != base_entry.mode {
1135                result.push(DiffEntry {
1136                    status: DiffStatus::Modified,
1137                    old_path: Some(path.clone()),
1138                    new_path: Some(path),
1139                    old_mode: format_mode(base_entry.mode),
1140                    new_mode: format_mode(wt_mode),
1141                    old_oid: base_entry.oid,
1142                    new_oid: wt_oid,
1143                    score: None,
1144                });
1145            }
1146        }
1147    }
1148
1149    Ok(result)
1150}
1151
1152fn entry_is_racy(ie: &IndexEntry, index_mtime: Option<(u32, u32)>) -> bool {
1153    let Some((index_mtime_sec, index_mtime_nsec)) = index_mtime else {
1154        return false;
1155    };
1156    if index_mtime_sec == 0 {
1157        return false;
1158    }
1159    index_mtime_sec < ie.mtime_sec
1160        || (index_mtime_sec == ie.mtime_sec && index_mtime_nsec <= ie.mtime_nsec)
1161}
1162
1163/// Quick stat check: does the index entry's cached stat data match the file?
1164/// Returns true when the file at `ie`'s path differs from the index entry (mode or blob).
1165///
1166/// Used by commands such as `git mv` to detect "dirty" paths under sparse checkout.
1167/// Symlinks and submodules are compared in a Git-compatible way.
1168///
1169/// `ignore_submodule_untracked` mirrors [`diff_index_to_worktree`]'s same flag for gitlinks.
1170pub fn worktree_differs_from_index_entry(
1171    odb: &Odb,
1172    work_tree: &Path,
1173    ie: &IndexEntry,
1174    ignore_submodule_untracked: bool,
1175) -> Result<bool> {
1176    use crate::config::ConfigSet;
1177    use crate::crlf;
1178
1179    let path_str_ref = std::str::from_utf8(&ie.path).unwrap_or("");
1180    let file_path = work_tree.join(path_str_ref);
1181
1182    if ie.mode == 0o160000 {
1183        let sub_head_oid = read_submodule_head(&file_path);
1184        let ref_matches = match sub_head_oid {
1185            Some(oid) => oid == ie.oid,
1186            None => submodule_worktree_is_unpopulated_placeholder(&file_path),
1187        };
1188        let mut flags = submodule_porcelain_flags(work_tree, path_str_ref, ie.oid);
1189        if ignore_submodule_untracked {
1190            flags.untracked = false;
1191        }
1192        return Ok(!ref_matches || flags.modified || flags.untracked);
1193    }
1194
1195    let meta = match fs::symlink_metadata(&file_path) {
1196        Ok(m) => m,
1197        Err(e)
1198            if e.kind() == std::io::ErrorKind::NotFound
1199                || e.raw_os_error() == Some(20) /* ENOTDIR */ =>
1200        {
1201            return Ok(true);
1202        }
1203        Err(e) => return Err(Error::Io(e)),
1204    };
1205
1206    if meta.is_dir() {
1207        return Ok(true);
1208    }
1209
1210    let worktree_mode = mode_from_metadata(&meta);
1211    if worktree_mode != ie.mode {
1212        return Ok(true);
1213    }
1214
1215    let git_dir = work_tree.join(".git");
1216    let config = ConfigSet::load(Some(&git_dir), true).unwrap_or_else(|_| ConfigSet::new());
1217    let conv = crlf::ConversionConfig::from_config(&config);
1218    let attrs = crlf::load_gitattributes(work_tree);
1219    let file_attrs = crlf::get_file_attrs(&attrs, path_str_ref, false, &config);
1220    let worktree_oid = hash_worktree_file(
1221        odb,
1222        &file_path,
1223        &meta,
1224        &conv,
1225        &file_attrs,
1226        path_str_ref,
1227        Some(ie),
1228    )?;
1229
1230    let mut eff_oid = worktree_oid;
1231    if eff_oid != ie.oid {
1232        if let Ok(raw) = fs::read(&file_path) {
1233            let raw_oid = Odb::hash_object_data(ObjectKind::Blob, &raw);
1234            if raw_oid == ie.oid {
1235                eff_oid = ie.oid;
1236            }
1237        }
1238    }
1239
1240    Ok(eff_oid != ie.oid)
1241}
1242
1243pub fn stat_matches(ie: &IndexEntry, meta: &fs::Metadata) -> bool {
1244    // Compare size
1245    if meta.len() as u32 != ie.size {
1246        return false;
1247    }
1248    // Compare mtime (seconds + nanoseconds)
1249    if meta.mtime() as u32 != ie.mtime_sec {
1250        return false;
1251    }
1252    if meta.mtime_nsec() as u32 != ie.mtime_nsec {
1253        return false;
1254    }
1255    // Compare ctime (seconds + nanoseconds)
1256    if meta.ctime() as u32 != ie.ctime_sec {
1257        return false;
1258    }
1259    if meta.ctime_nsec() as u32 != ie.ctime_nsec {
1260        return false;
1261    }
1262    // Compare inode and device
1263    if meta.ino() as u32 != ie.ino {
1264        return false;
1265    }
1266    if meta.dev() as u32 != ie.dev {
1267        return false;
1268    }
1269    true
1270}
1271
1272/// Hash a working tree file as a blob to get its OID.
1273/// Check if any parent component of `rel_path` (relative to `work_tree`) is a symlink.
1274fn has_symlink_in_path(work_tree: &Path, rel_path: &str) -> bool {
1275    let mut check = work_tree.to_path_buf();
1276    let components: Vec<&str> = rel_path.split('/').collect();
1277    // Check all components except the last one (which is the file itself)
1278    for component in &components[..components.len().saturating_sub(1)] {
1279        check.push(component);
1280        match fs::symlink_metadata(&check) {
1281            Ok(meta) if meta.file_type().is_symlink() => return true,
1282            _ => {}
1283        }
1284    }
1285    false
1286}
1287
1288pub fn hash_worktree_file(
1289    odb: &Odb,
1290    path: &Path,
1291    meta: &fs::Metadata,
1292    conv: &crate::crlf::ConversionConfig,
1293    file_attrs: &crate::crlf::FileAttrs,
1294    rel_path: &str,
1295    index_entry: Option<&IndexEntry>,
1296) -> Result<ObjectId> {
1297    let prior_blob: Option<Vec<u8>> = index_entry
1298        .filter(|e| e.oid != zero_oid())
1299        .and_then(|e| odb.read(&e.oid).ok().map(|o| o.data));
1300    let data = if meta.file_type().is_symlink() {
1301        // For symlinks, hash the target path
1302        let target = fs::read_link(path)?;
1303        target.to_string_lossy().into_owned().into_bytes()
1304    } else if meta.is_dir() {
1305        // `read()` on a directory fails with EISDIR; unmerged paths may leave an empty
1306        // placeholder directory (e.g. t4027 combined submodule conflict).
1307        Vec::new()
1308    } else {
1309        let raw = fs::read(path)?;
1310        // Apply clean conversion (CRLF→LF) so hash matches index blob.
1311        // Do not run safecrlf here: diff/commit use this for hashing and must not print warnings.
1312        let opts = crate::crlf::ConvertToGitOpts {
1313            index_blob: prior_blob.as_deref(),
1314            renormalize: false,
1315            check_safecrlf: false,
1316        };
1317        crate::crlf::convert_to_git_with_opts(&raw, rel_path, conv, file_attrs, opts).unwrap_or(raw)
1318    };
1319
1320    Ok(Odb::hash_object_data(ObjectKind::Blob, &data))
1321}
1322
1323/// Derive a Git file mode from filesystem metadata.
1324pub fn mode_from_metadata(meta: &fs::Metadata) -> u32 {
1325    if meta.file_type().is_symlink() {
1326        0o120000
1327    } else if meta.mode() & 0o111 != 0 {
1328        0o100755
1329    } else {
1330        0o100644
1331    }
1332}
1333
1334/// Compare a tree against the working tree.
1335///
1336/// Shows changes from `tree_oid` to the current working directory state.
1337/// Files tracked in the index but not in the tree are shown as Added.
1338/// Files in the tree but missing from the working tree are shown as Deleted.
1339///
1340/// # Parameters
1341///
1342/// - `odb` — object database.
1343/// - `tree_oid` — the tree to compare against (`None` for empty tree).
1344/// - `work_tree` — path to the working tree root.
1345/// - `index` — current index (used to discover new tracked files not in tree).
1346///
1347/// # Errors
1348///
1349/// Returns errors from ODB reads or I/O.
1350pub fn diff_tree_to_worktree(
1351    odb: &Odb,
1352    tree_oid: Option<&ObjectId>,
1353    work_tree: &Path,
1354    index: &Index,
1355) -> Result<Vec<DiffEntry>> {
1356    use crate::config::ConfigSet;
1357    use crate::crlf;
1358
1359    let git_dir = work_tree.join(".git");
1360    let config = ConfigSet::load(Some(&git_dir), true).unwrap_or_else(|_| ConfigSet::new());
1361    let conv = crlf::ConversionConfig::from_config(&config);
1362    let attrs = crlf::load_gitattributes(work_tree);
1363
1364    // Flatten the tree into a BTreeMap keyed by path
1365    let tree_flat = match tree_oid {
1366        Some(oid) => flatten_tree(odb, oid, "")?,
1367        None => Vec::new(),
1368    };
1369    let tree_map: std::collections::BTreeMap<String, &FlatEntry> =
1370        tree_flat.iter().map(|e| (e.path.clone(), e)).collect();
1371
1372    // Build index lookup: path → &IndexEntry (stage 0 only)
1373    let mut index_entries: std::collections::BTreeMap<&[u8], &IndexEntry> =
1374        std::collections::BTreeMap::new();
1375    let mut index_paths: std::collections::BTreeSet<String> = std::collections::BTreeSet::new();
1376    let mut stage0_paths: std::collections::BTreeSet<Vec<u8>> = std::collections::BTreeSet::new();
1377    for ie in &index.entries {
1378        if ie.stage() != 0 {
1379            continue;
1380        }
1381        let path = String::from_utf8_lossy(&ie.path).to_string();
1382        index_entries.insert(&ie.path, ie);
1383        index_paths.insert(path);
1384        stage0_paths.insert(ie.path.clone());
1385    }
1386
1387    // Paths with only unmerged stages (1–3) and no stage 0 — `git diff <rev>` must still list them
1388    // so combined `diff --cc` conflict hunks can be emitted (`t4108-apply-threeway`).
1389    let mut unmerged_only_paths: std::collections::BTreeSet<String> =
1390        std::collections::BTreeSet::new();
1391    for ie in &index.entries {
1392        if !(1..=3).contains(&ie.stage()) {
1393            continue;
1394        }
1395        if stage0_paths.contains(&ie.path) {
1396            continue;
1397        }
1398        unmerged_only_paths.insert(String::from_utf8_lossy(&ie.path).into_owned());
1399    }
1400
1401    // Union of tree paths + index paths
1402    let mut all_paths: std::collections::BTreeSet<String> = std::collections::BTreeSet::new();
1403    all_paths.extend(tree_map.keys().cloned());
1404    all_paths.extend(index_paths.iter().cloned());
1405    all_paths.extend(unmerged_only_paths.iter().cloned());
1406
1407    let mut result = Vec::new();
1408
1409    for path in &all_paths {
1410        if index_entries
1411            .get(path.as_bytes())
1412            .is_some_and(|ie| ie.skip_worktree())
1413        {
1414            // Sparse checkout: `git diff <rev>` does not report tree↔worktree drift for
1415            // skip-worktree paths (they are outside the sparse cone). Matches t7012 stash flow.
1416            continue;
1417        }
1418
1419        let tree_entry = tree_map.get(path.as_str());
1420
1421        // Gitlink entries (submodules) — compare HEAD commit, not file content.
1422        let is_gitlink = tree_entry.is_some_and(|te| te.mode == 0o160000)
1423            || index_entries
1424                .get(path.as_bytes())
1425                .is_some_and(|ie| ie.mode == 0o160000);
1426        if is_gitlink {
1427            if let Some(te) = tree_entry {
1428                let sub_dir = work_tree.join(path);
1429                let sub_head = read_submodule_head_oid(&sub_dir);
1430                let index_oid = index_entries
1431                    .get(path.as_bytes())
1432                    .filter(|ie| ie.mode == 0o160000)
1433                    .map(|ie| ie.oid);
1434                let index_matches_tree = index_oid.is_some_and(|oid| oid == te.oid);
1435                let head_differs = sub_head.as_ref() != Some(&te.oid);
1436                let dirty_while_aligned = index_matches_tree
1437                    && !head_differs
1438                    && submodule_has_dirty_worktree_for_super_diff(work_tree, path, &te.oid);
1439                if head_differs || dirty_while_aligned {
1440                    // Raw `git diff <tree>` lines use a null OID on the worktree side when the
1441                    // checked-out submodule HEAD differs from the tree's gitlink; patch output still
1442                    // resolves the real commit from the submodule directory.
1443                    let new_oid = if head_differs { zero_oid() } else { te.oid };
1444                    result.push(DiffEntry {
1445                        status: DiffStatus::Modified,
1446                        old_path: Some(path.clone()),
1447                        new_path: Some(path.clone()),
1448                        old_mode: format_mode(te.mode),
1449                        new_mode: format_mode(te.mode),
1450                        old_oid: te.oid,
1451                        new_oid,
1452                        score: None,
1453                    });
1454                }
1455            }
1456            continue;
1457        }
1458
1459        let file_path = work_tree.join(path);
1460
1461        let wt_meta = match fs::symlink_metadata(&file_path) {
1462            Ok(m) => Some(m),
1463            Err(e) if e.kind() == std::io::ErrorKind::NotFound => None,
1464            Err(e) => return Err(Error::Io(e)),
1465        };
1466
1467        if unmerged_only_paths.contains(path) {
1468            if let (Some(te), Some(meta)) = (tree_entry, wt_meta.as_ref()) {
1469                let file_attrs = crlf::get_file_attrs(&attrs, path, false, &config);
1470                let wt_oid =
1471                    hash_worktree_file(odb, &file_path, meta, &conv, &file_attrs, path, None)?;
1472                let wt_mode = mode_from_metadata(meta);
1473                if wt_oid != te.oid || wt_mode != te.mode {
1474                    result.push(DiffEntry {
1475                        status: DiffStatus::Modified,
1476                        old_path: Some(path.clone()),
1477                        new_path: Some(path.clone()),
1478                        old_mode: format_mode(te.mode),
1479                        new_mode: format_mode(wt_mode),
1480                        old_oid: te.oid,
1481                        new_oid: wt_oid,
1482                        score: None,
1483                    });
1484                }
1485            }
1486            continue;
1487        }
1488
1489        match (tree_entry, wt_meta) {
1490            (Some(te), Some(ref meta)) => {
1491                let wt_mode = mode_from_metadata(meta);
1492                let Some(ie) = index_entries.get(path.as_bytes()) else {
1493                    continue;
1494                };
1495
1496                let index_matches_tree = ie.oid == te.oid && ie.mode == te.mode;
1497
1498                // Fully clean: index matches `HEAD`, worktree matches index, stat cache fresh.
1499                if index_matches_tree && wt_mode == te.mode && stat_matches(ie, meta) {
1500                    continue;
1501                }
1502
1503                let file_attrs = crlf::get_file_attrs(&attrs, path, false, &config);
1504                let idx_ent = index_entries.get(path.as_bytes()).copied();
1505
1506                // Staged mode (same blob as `HEAD`, different mode recorded in the index).
1507                if ie.oid == te.oid && ie.mode != te.mode {
1508                    result.push(DiffEntry {
1509                        status: DiffStatus::Modified,
1510                        old_path: Some(path.clone()),
1511                        new_path: Some(path.clone()),
1512                        old_mode: format_mode(te.mode),
1513                        new_mode: format_mode(ie.mode),
1514                        old_oid: te.oid,
1515                        new_oid: te.oid,
1516                        score: None,
1517                    });
1518                    continue;
1519                }
1520
1521                // Index still matches `HEAD`: only unstaged worktree drift (content and/or
1522                // worktree-only exec bit when `update-index` was not run — t4049 harness).
1523                if index_matches_tree {
1524                    let wt_oid = hash_worktree_file(
1525                        odb,
1526                        &file_path,
1527                        meta,
1528                        &conv,
1529                        &file_attrs,
1530                        path,
1531                        idx_ent,
1532                    )?;
1533                    let mut eff_oid = wt_oid;
1534                    if eff_oid != te.oid {
1535                        if let Ok(raw) = fs::read(&file_path) {
1536                            let raw_oid = Odb::hash_object_data(ObjectKind::Blob, &raw);
1537                            if raw_oid == te.oid {
1538                                eff_oid = te.oid;
1539                            }
1540                        }
1541                    }
1542                    if eff_oid != te.oid {
1543                        result.push(DiffEntry {
1544                            status: DiffStatus::Modified,
1545                            old_path: Some(path.clone()),
1546                            new_path: Some(path.clone()),
1547                            old_mode: format_mode(te.mode),
1548                            new_mode: format_mode(wt_mode),
1549                            old_oid: te.oid,
1550                            new_oid: eff_oid,
1551                            score: None,
1552                        });
1553                    } else if wt_mode != te.mode {
1554                        result.push(DiffEntry {
1555                            status: DiffStatus::Modified,
1556                            old_path: Some(path.clone()),
1557                            new_path: Some(path.clone()),
1558                            old_mode: format_mode(te.mode),
1559                            new_mode: format_mode(wt_mode),
1560                            old_oid: te.oid,
1561                            new_oid: te.oid,
1562                            score: None,
1563                        });
1564                    }
1565                    continue;
1566                }
1567
1568                // Staged content (and possibly mode): `git diff <rev>` is tree vs working tree.
1569                let wt_oid =
1570                    hash_worktree_file(odb, &file_path, meta, &conv, &file_attrs, path, idx_ent)?;
1571                let mut eff_oid = wt_oid;
1572                if eff_oid != te.oid {
1573                    if let Ok(raw) = fs::read(&file_path) {
1574                        let raw_oid = Odb::hash_object_data(ObjectKind::Blob, &raw);
1575                        if raw_oid == te.oid {
1576                            eff_oid = te.oid;
1577                        }
1578                    }
1579                }
1580                if eff_oid != te.oid || wt_mode != te.mode {
1581                    result.push(DiffEntry {
1582                        status: DiffStatus::Modified,
1583                        old_path: Some(path.clone()),
1584                        new_path: Some(path.clone()),
1585                        old_mode: format_mode(te.mode),
1586                        new_mode: format_mode(wt_mode),
1587                        old_oid: te.oid,
1588                        new_oid: eff_oid,
1589                        score: None,
1590                    });
1591                }
1592            }
1593            (Some(te), None) => {
1594                // In tree but missing from worktree
1595                result.push(DiffEntry {
1596                    status: DiffStatus::Deleted,
1597                    old_path: Some(path.clone()),
1598                    new_path: None,
1599                    old_mode: format_mode(te.mode),
1600                    new_mode: "000000".to_owned(),
1601                    old_oid: te.oid,
1602                    new_oid: zero_oid(),
1603                    score: None,
1604                });
1605            }
1606            (None, Some(ref meta)) => {
1607                // In index but not in tree, and exists in worktree
1608                let file_attrs = crlf::get_file_attrs(&attrs, path, false, &config);
1609                let wt_oid = hash_worktree_file(
1610                    odb,
1611                    &file_path,
1612                    meta,
1613                    &conv,
1614                    &file_attrs,
1615                    path,
1616                    index_entries.get(path.as_bytes()).copied(),
1617                )?;
1618                let wt_mode = mode_from_metadata(meta);
1619                result.push(DiffEntry {
1620                    status: DiffStatus::Added,
1621                    old_path: None,
1622                    new_path: Some(path.clone()),
1623                    old_mode: "000000".to_owned(),
1624                    new_mode: format_mode(wt_mode),
1625                    old_oid: zero_oid(),
1626                    new_oid: wt_oid,
1627                    score: None,
1628                });
1629            }
1630            (None, None) => {
1631                // Tracked in index but neither in tree nor worktree — skip
1632            }
1633        }
1634    }
1635
1636    result.sort_by(|a, b| a.path().cmp(b.path()));
1637    Ok(result)
1638}
1639
1640// ── Rename detection ────────────────────────────────────────────────
1641
1642fn read_added_entry_bytes(
1643    odb: &Odb,
1644    entry: &DiffEntry,
1645    work_root: Option<&Path>,
1646) -> Option<Vec<u8>> {
1647    if entry.new_oid != zero_oid() {
1648        return odb.read(&entry.new_oid).ok().map(|obj| obj.data);
1649    }
1650    let path = entry.new_path.as_deref()?;
1651    let root = work_root?;
1652    fs::read(root.join(path)).ok()
1653}
1654
1655fn modified_as_copy_from_sources(
1656    odb: &Odb,
1657    work_root: Option<&Path>,
1658    e: &DiffEntry,
1659    threshold: u32,
1660    sources: &[(String, ObjectId, bool)],
1661    source_contents: &[Option<Vec<u8>>],
1662    source_tree_entries: &[(String, String, ObjectId)],
1663) -> Option<DiffEntry> {
1664    fn regular_file_mode(mode: &str) -> bool {
1665        mode == "100644" || mode == "100755"
1666    }
1667
1668    if e.status != DiffStatus::Modified || !regular_file_mode(&e.new_mode) {
1669        return None;
1670    }
1671    let new_data = read_added_entry_bytes(odb, e, work_root)?;
1672    let new_oid_eff = if e.new_oid != zero_oid() {
1673        e.new_oid
1674    } else {
1675        Odb::hash_object_data(ObjectKind::Blob, &new_data)
1676    };
1677
1678    let mut best: Option<(usize, u32)> = None;
1679    for (si, (src_path, src_oid, is_deleted)) in sources.iter().enumerate() {
1680        if *is_deleted {
1681            continue;
1682        }
1683        if e.new_path.as_deref() == Some(src_path.as_str()) {
1684            continue;
1685        }
1686        let src_mode_str = source_tree_entries
1687            .iter()
1688            .find(|(p, _, _)| p == src_path)
1689            .map(|(_, m, _)| m.as_str())
1690            .unwrap_or("100644");
1691        if !regular_file_mode(src_mode_str) {
1692            continue;
1693        }
1694
1695        let score = if *src_oid == new_oid_eff {
1696            100
1697        } else {
1698            match (&source_contents[si], Some(new_data.as_slice())) {
1699                (Some(old_data), Some(nd)) => compute_similarity(old_data, nd),
1700                _ => 0,
1701            }
1702        };
1703        if score >= threshold {
1704            let replace = match best {
1705                None => true,
1706                Some((_, s)) => score > s,
1707            };
1708            if replace {
1709                best = Some((si, score));
1710            }
1711        }
1712    }
1713
1714    let (si, score) = best?;
1715    let (src_path, src_oid, _) = &sources[si];
1716    let src_mode = source_tree_entries
1717        .iter()
1718        .find(|(p, _, _)| p == src_path)
1719        .map(|(_, m, _)| m.clone())
1720        .unwrap_or_else(|| e.old_mode.clone());
1721
1722    Some(DiffEntry {
1723        status: DiffStatus::Copied,
1724        old_path: Some(src_path.clone()),
1725        new_path: e.new_path.clone(),
1726        old_mode: src_mode,
1727        new_mode: e.new_mode.clone(),
1728        old_oid: *src_oid,
1729        new_oid: e.new_oid,
1730        score: Some(score),
1731    })
1732}
1733
1734/// Detect renames by pairing Deleted and Added entries with similar content.
1735///
1736/// `threshold` is the minimum similarity percentage (0–100) for a pair to
1737/// be considered a rename (Git's default is 50%).  The function reads blob
1738/// content from the ODB to compute a line-level similarity score.
1739///
1740/// Exact-OID matches are always 100% similar regardless of content.
1741///
1742/// When `work_root` is set, added entries whose `new_oid` is the zero placeholder (as in
1743/// uncached `diff-index` when the work tree diverged from the index) load content from disk
1744/// under that root instead of the object database.
1745pub fn detect_renames(
1746    odb: &Odb,
1747    work_root: Option<&Path>,
1748    entries: Vec<DiffEntry>,
1749    threshold: u32,
1750) -> Vec<DiffEntry> {
1751    // Split entries into deleted, added, and others.
1752    let mut deleted: Vec<DiffEntry> = Vec::new();
1753    let mut added: Vec<DiffEntry> = Vec::new();
1754    let mut others: Vec<DiffEntry> = Vec::new();
1755
1756    for entry in entries {
1757        match entry.status {
1758            DiffStatus::Deleted => deleted.push(entry),
1759            DiffStatus::Added => added.push(entry),
1760            _ => others.push(entry),
1761        }
1762    }
1763
1764    if deleted.is_empty() || added.is_empty() {
1765        // Nothing to pair — return original order.
1766        let mut result = others;
1767        result.extend(deleted);
1768        result.extend(added);
1769        result.sort_by(|a, b| a.path().cmp(b.path()));
1770        return result;
1771    }
1772
1773    // Read content for all deleted blobs.
1774    let deleted_contents: Vec<Option<Vec<u8>>> = deleted
1775        .iter()
1776        .map(|d| odb.read(&d.old_oid).ok().map(|obj| obj.data))
1777        .collect();
1778
1779    // Read content for all added blobs.
1780    let added_contents: Vec<Option<Vec<u8>>> = added
1781        .iter()
1782        .map(|a| read_added_entry_bytes(odb, a, work_root))
1783        .collect();
1784
1785    // Build a matrix of similarity scores and find the best pairings.
1786    // We use a greedy approach: pick the highest-scoring pair first.
1787    let mut scores: Vec<(u32, usize, usize)> = Vec::new();
1788
1789    fn is_regularish_mode(mode: &str) -> bool {
1790        mode == "100644" || mode == "100755"
1791    }
1792
1793    for (di, del) in deleted.iter().enumerate() {
1794        for (ai, add) in added.iter().enumerate() {
1795            // Exact OID match → 100%
1796            if del.old_oid == add.new_oid {
1797                scores.push((100, di, ai));
1798                continue;
1799            }
1800
1801            // Do not use line similarity across file types (e.g. regular ↔ symlink); Git keeps these
1802            // as separate changes (`t4008-diff-break-rewrite` #7).
1803            if !is_regularish_mode(&del.old_mode) || !is_regularish_mode(&add.new_mode) {
1804                continue;
1805            }
1806
1807            let score = match (&deleted_contents[di], &added_contents[ai]) {
1808                (Some(old_data), Some(new_data)) => compute_similarity(old_data, new_data),
1809                _ => 0,
1810            };
1811
1812            if score >= threshold {
1813                scores.push((score, di, ai));
1814            }
1815        }
1816    }
1817
1818    // Sort: prefer same-basename pairs first, then by score descending.
1819    // This matches Git's behavior where basename matches are checked first.
1820    scores.sort_by(|a, b| {
1821        let a_same = same_basename(&deleted[a.1], &added[a.2]);
1822        let b_same = same_basename(&deleted[b.1], &added[b.2]);
1823        b_same.cmp(&a_same).then_with(|| b.0.cmp(&a.0))
1824    });
1825
1826    let mut used_deleted = vec![false; deleted.len()];
1827    let mut used_added = vec![false; added.len()];
1828    let mut renames: Vec<DiffEntry> = Vec::new();
1829
1830    for (score, di, ai) in &scores {
1831        if used_deleted[*di] || used_added[*ai] {
1832            continue;
1833        }
1834        used_deleted[*di] = true;
1835        used_added[*ai] = true;
1836
1837        let del = &deleted[*di];
1838        let add = &added[*ai];
1839
1840        renames.push(DiffEntry {
1841            status: DiffStatus::Renamed,
1842            old_path: del.old_path.clone(),
1843            new_path: add.new_path.clone(),
1844            old_mode: del.old_mode.clone(),
1845            new_mode: add.new_mode.clone(),
1846            old_oid: del.old_oid,
1847            new_oid: add.new_oid,
1848            score: Some(*score),
1849        });
1850    }
1851
1852    // Collect unmatched entries.
1853    let mut result = others;
1854    result.extend(renames);
1855    for (i, entry) in deleted.into_iter().enumerate() {
1856        if !used_deleted[i] {
1857            result.push(entry);
1858        }
1859    }
1860    for (i, entry) in added.into_iter().enumerate() {
1861        if !used_added[i] {
1862            result.push(entry);
1863        }
1864    }
1865
1866    result.sort_by(|a, b| a.path().cmp(b.path()));
1867    result
1868}
1869
1870/// Detect copies among diff entries.
1871///
1872/// This first runs rename detection (pairing Deleted+Added), then for any
1873/// remaining Added entries, looks for copy sources.
1874///
1875/// - `find_copies_harder` = false: only Modified entries are copy source candidates.
1876/// - `find_copies_harder` = true: also examine unmodified files from `source_tree_entries`.
1877///
1878/// `source_tree_entries` should be a list of (path, mode, oid) from the source tree;
1879/// used when `find_copies_harder` is true to consider unmodified files as copy sources.
1880pub fn detect_copies(
1881    odb: &Odb,
1882    work_root: Option<&Path>,
1883    entries: Vec<DiffEntry>,
1884    threshold: u32,
1885    find_copies_harder: bool,
1886    source_tree_entries: &[(String, String, ObjectId)],
1887) -> Vec<DiffEntry> {
1888    use std::collections::{HashMap, HashSet};
1889
1890    // Separate entries by status.
1891    let mut deleted: Vec<DiffEntry> = Vec::new();
1892    let mut added: Vec<DiffEntry> = Vec::new();
1893    let mut others: Vec<DiffEntry> = Vec::new();
1894
1895    for entry in entries {
1896        match entry.status {
1897            DiffStatus::Deleted => deleted.push(entry),
1898            DiffStatus::Added => added.push(entry),
1899            _ => others.push(entry),
1900        }
1901    }
1902
1903    // Build source candidates: deleted files, modified files, and optionally tree entries.
1904    // Track which sources are from deleted files (can become renames).
1905    let mut sources: Vec<(String, ObjectId, bool)> = Vec::new(); // (path, oid, is_deleted)
1906    let mut deleted_source_idx: HashMap<String, usize> = HashMap::new();
1907
1908    for entry in &deleted {
1909        if let Some(ref path) = entry.old_path {
1910            deleted_source_idx.insert(path.clone(), sources.len());
1911            sources.push((path.clone(), entry.old_oid, true));
1912        }
1913    }
1914
1915    // Modified and type-changed files are candidates for `-C` (e.g. symlink rewrite leaves the
1916    // old blob available as a copy source for another path; see `t4008-diff-break-rewrite`).
1917    for entry in &others {
1918        if matches!(entry.status, DiffStatus::Modified | DiffStatus::TypeChanged) {
1919            if let Some(ref old_path) = entry.old_path {
1920                if !sources.iter().any(|(p, _, _)| p == old_path) {
1921                    sources.push((old_path.clone(), entry.old_oid, false));
1922                }
1923            }
1924        }
1925    }
1926
1927    // With find_copies_harder, add all source tree entries.
1928    if find_copies_harder {
1929        for (path, _mode, oid) in source_tree_entries {
1930            if !sources.iter().any(|(p, _, _)| p == path) {
1931                sources.push((path.clone(), *oid, false));
1932            }
1933        }
1934    }
1935
1936    if sources.is_empty() {
1937        let mut result = others;
1938        result.extend(deleted);
1939        result.extend(added);
1940        result.sort_by(|a, b| a.path().cmp(b.path()));
1941        return result;
1942    }
1943
1944    // Read content for sources.
1945    let source_contents: Vec<Option<Vec<u8>>> = sources
1946        .iter()
1947        .map(|(_, oid, _)| odb.read(oid).ok().map(|obj| obj.data))
1948        .collect();
1949
1950    let mut result_entries: Vec<DiffEntry> = Vec::new();
1951    let mut renamed_deleted: HashSet<usize> = HashSet::new();
1952    let mut used_added2 = vec![false; added.len()];
1953
1954    if !added.is_empty() {
1955        // Read content for added blobs.
1956        let added_contents: Vec<Option<Vec<u8>>> = added
1957            .iter()
1958            .map(|a| read_added_entry_bytes(odb, a, work_root))
1959            .collect();
1960
1961        // Build score matrix: (score, source_idx, added_idx)
1962        let mut scores: Vec<(u32, usize, usize)> = Vec::new();
1963        for (si, (src_path, src_oid, _)) in sources.iter().enumerate() {
1964            for (ai, add) in added.iter().enumerate() {
1965                // Never pair a path with itself as copy source (matches Git; avoids
1966                // arbitrary tie-breaking when several sources share the same blob).
1967                if add.new_path.as_deref() == Some(src_path.as_str()) {
1968                    continue;
1969                }
1970                let add_oid = if add.new_oid != zero_oid() {
1971                    add.new_oid
1972                } else if let Some(ref data) = added_contents[ai] {
1973                    Odb::hash_object_data(ObjectKind::Blob, data)
1974                } else {
1975                    zero_oid()
1976                };
1977                if *src_oid == add_oid {
1978                    scores.push((100, si, ai));
1979                    continue;
1980                }
1981                let score = match (&source_contents[si], &added_contents[ai]) {
1982                    (Some(old_data), Some(new_data)) => compute_similarity(old_data, new_data),
1983                    _ => 0,
1984                };
1985                if score >= threshold {
1986                    scores.push((score, si, ai));
1987                }
1988            }
1989        }
1990
1991        // Sort by score descending.
1992        scores.sort_by(|a, b| b.0.cmp(&a.0));
1993
1994        // Build source->added mappings, each added file assigned to best source.
1995        let mut used_added = vec![false; added.len()];
1996        let mut source_to_added: HashMap<usize, Vec<(usize, u32)>> = HashMap::new();
1997        for &(score, si, ai) in &scores {
1998            if used_added[ai] {
1999                continue;
2000            }
2001            used_added[ai] = true;
2002            source_to_added.entry(si).or_default().push((ai, score));
2003        }
2004
2005        // For each deleted source, pick one assignment as Rename, rest as Copy.
2006        for (&si, assignments_for_src) in &source_to_added {
2007            let (_, _, is_deleted) = &sources[si];
2008            if *is_deleted && !assignments_for_src.is_empty() {
2009                // Pick the last one (by path) as the rename target.
2010                // Git tends to pick the rename as the last alphabetically.
2011                let rename_ai = assignments_for_src
2012                    .iter()
2013                    .max_by_key(|(ai, _score)| added[*ai].path().to_string())
2014                    .map(|(ai, _)| *ai);
2015
2016                for &(ai, score) in assignments_for_src {
2017                    let (ref src_path, _, _) = sources[si];
2018                    let add = &added[ai];
2019                    let src_mode = source_tree_entries
2020                        .iter()
2021                        .find(|(p, _, _)| p == src_path)
2022                        .map(|(_, m, _)| m.clone())
2023                        .unwrap_or_else(|| add.old_mode.clone());
2024
2025                    let is_rename = Some(ai) == rename_ai;
2026                    result_entries.push(DiffEntry {
2027                        status: if is_rename {
2028                            DiffStatus::Renamed
2029                        } else {
2030                            DiffStatus::Copied
2031                        },
2032                        old_path: Some(src_path.clone()),
2033                        new_path: add.new_path.clone(),
2034                        old_mode: src_mode,
2035                        new_mode: add.new_mode.clone(),
2036                        old_oid: sources[si].1,
2037                        new_oid: add.new_oid,
2038                        score: Some(score),
2039                    });
2040                    used_added2[ai] = true;
2041                }
2042                renamed_deleted.insert(si);
2043            } else {
2044                // Non-deleted source: all assignments are copies.
2045                for &(ai, score) in assignments_for_src {
2046                    let (ref src_path, _, _) = sources[si];
2047                    let add = &added[ai];
2048                    let src_mode = source_tree_entries
2049                        .iter()
2050                        .find(|(p, _, _)| p == src_path)
2051                        .map(|(_, m, _)| m.clone())
2052                        .unwrap_or_else(|| add.old_mode.clone());
2053
2054                    result_entries.push(DiffEntry {
2055                        status: DiffStatus::Copied,
2056                        old_path: Some(src_path.clone()),
2057                        new_path: add.new_path.clone(),
2058                        old_mode: src_mode,
2059                        new_mode: add.new_mode.clone(),
2060                        old_oid: sources[si].1,
2061                        new_oid: add.new_oid,
2062                        score: Some(score),
2063                    });
2064                    used_added2[ai] = true;
2065                }
2066            }
2067        }
2068    }
2069
2070    // Keep deleted entries that weren't consumed by a rename.
2071    for entry in deleted.into_iter() {
2072        if let Some(ref path) = entry.old_path {
2073            if let Some(&si) = deleted_source_idx.get(path) {
2074                if renamed_deleted.contains(&si) {
2075                    // This deletion was consumed by a rename; skip it.
2076                    continue;
2077                }
2078            }
2079        }
2080        result_entries.push(entry);
2081    }
2082
2083    let mut result = others;
2084    result.extend(result_entries);
2085    // Keep unmatched added entries.
2086    for (i, entry) in added.into_iter().enumerate() {
2087        if !used_added2[i] {
2088            result.push(entry);
2089        }
2090    }
2091
2092    let mut final_result = Vec::with_capacity(result.len());
2093    for e in result {
2094        if let Some(c) = modified_as_copy_from_sources(
2095            odb,
2096            work_root,
2097            &e,
2098            threshold,
2099            &sources,
2100            &source_contents,
2101            source_tree_entries,
2102        ) {
2103            final_result.push(c);
2104        } else {
2105            final_result.push(e);
2106        }
2107    }
2108
2109    final_result.sort_by(|a, b| a.path().cmp(b.path()));
2110    final_result
2111}
2112
2113/// Apply Git-style rename and optional copy detection for index↔worktree diffs.
2114///
2115/// When `copies` is true (Git `diff.renames` / `status.renames` set to `copy`/`copies`),
2116/// runs [`detect_copies`] after rename detection so added files can match unchanged
2117/// paths from `HEAD` (e.g. intent-to-add copies).
2118///
2119/// # Errors
2120///
2121/// Propagates errors from reading the `head_tree` object from `odb`.
2122pub fn status_apply_rename_copy_detection(
2123    odb: &Odb,
2124    unstaged_raw: Vec<DiffEntry>,
2125    threshold: u32,
2126    copies: bool,
2127    head_tree: Option<&ObjectId>,
2128) -> Result<Vec<DiffEntry>> {
2129    let after_renames = detect_renames(odb, None, unstaged_raw, threshold);
2130    if !copies {
2131        return Ok(after_renames);
2132    }
2133    let source_tree_entries: Vec<(String, String, ObjectId)> = match head_tree {
2134        Some(oid) => flatten_tree(odb, oid, "")?
2135            .into_iter()
2136            .map(|e| (e.path, format_mode(e.mode), e.oid))
2137            .collect(),
2138        None => Vec::new(),
2139    };
2140    Ok(detect_copies(
2141        odb,
2142        None,
2143        after_renames,
2144        threshold,
2145        false,
2146        &source_tree_entries,
2147    ))
2148}
2149
2150/// Format a rename pair using Git's compact path format.
2151///
2152/// Examples:
2153/// - `a/b/c` → `c/b/a` → `a/b/c => c/b/a`
2154/// - `c/b/a` → `c/d/e` → `c/{b/a => d/e}`
2155/// - `c/d/e` → `d/e` → `{c/d => d}/e`
2156/// - `d/e` → `d/f/e` → `d/{ => f}/e`
2157pub fn format_rename_path(old: &str, new: &str) -> String {
2158    let ob = old.as_bytes();
2159    let nb = new.as_bytes();
2160
2161    // Find common prefix length, snapped to '/' boundary.
2162    let pfx = {
2163        let mut last_sep = 0usize;
2164        let min_len = ob.len().min(nb.len());
2165        for i in 0..min_len {
2166            if ob[i] != nb[i] {
2167                break;
2168            }
2169            if ob[i] == b'/' {
2170                last_sep = i + 1;
2171            }
2172        }
2173        last_sep
2174    };
2175
2176    // Find common suffix length, snapped to '/' boundary.
2177    let mut sfx = {
2178        let mut last_sep = 0usize;
2179        let min_len = ob.len().min(nb.len());
2180        for i in 0..min_len {
2181            let oi = ob.len() - 1 - i;
2182            let ni = nb.len() - 1 - i;
2183            if ob[oi] != nb[ni] {
2184                break;
2185            }
2186            if ob[oi] == b'/' {
2187                last_sep = i + 1;
2188            }
2189        }
2190        last_sep
2191    };
2192
2193    // Suffix starts at this position in each string.
2194    let mut sfx_at_old = ob.len() - sfx;
2195    let mut sfx_at_new = nb.len() - sfx;
2196
2197    // If prefix and suffix overlap in both strings (both middles empty),
2198    // reduce the suffix so that at least the longer string has a non-empty middle.
2199    while pfx > sfx_at_old && pfx > sfx_at_new && sfx > 0 {
2200        // Reduce suffix by snapping to the next smaller '/' boundary.
2201        let suffix_bytes = &ob[sfx_at_old..];
2202        let mut new_sfx = 0;
2203        // Find the next '/' after sfx_at_old (i.e., reduce suffix).
2204        for (i, &b) in suffix_bytes.iter().enumerate().skip(1) {
2205            if b == b'/' {
2206                new_sfx = sfx - i;
2207                break;
2208            }
2209        }
2210        if new_sfx == 0 || new_sfx >= sfx {
2211            sfx_at_old = ob.len();
2212            sfx_at_new = nb.len();
2213            break;
2214        }
2215        sfx = new_sfx;
2216        sfx_at_old = ob.len() - sfx;
2217        sfx_at_new = nb.len() - sfx;
2218    }
2219
2220    // When prefix and suffix overlap in the shorter string, they share
2221    // the '/' boundary character. In the output format, the shared '/'
2222    // appears in both positions (e.g. "d/{ => f}/e" for d/e → d/f/e).
2223    // Compute the middle parts. When prefix and suffix overlap in a
2224    // string, the middle for that string is empty. The shared '/' shows
2225    // in both prefix (trailing) and suffix (leading) positions.
2226    let prefix = &old[..pfx];
2227    let suffix = &old[sfx_at_old..];
2228    let old_mid = if pfx <= sfx_at_old {
2229        &old[pfx..sfx_at_old]
2230    } else {
2231        ""
2232    };
2233    let new_mid = if pfx <= sfx_at_new {
2234        &new[pfx..sfx_at_new]
2235    } else {
2236        ""
2237    };
2238
2239    if prefix.is_empty() && suffix.is_empty() {
2240        return format!("{old} => {new}");
2241    }
2242
2243    format!("{prefix}{{{old_mid} => {new_mid}}}{suffix}")
2244}
2245
2246/// Check if two entries share the same filename (basename).
2247fn same_basename(del: &DiffEntry, add: &DiffEntry) -> bool {
2248    let old = del.old_path.as_deref().unwrap_or("");
2249    let new = add.new_path.as_deref().unwrap_or("");
2250    let old_base = old.rsplit('/').next().unwrap_or(old);
2251    let new_base = new.rsplit('/').next().unwrap_or(new);
2252    old_base == new_base && !old_base.is_empty()
2253}
2254
2255/// Compute a similarity percentage (0–100) between two byte slices.
2256///
2257/// Uses Git's approach: count the bytes that are "shared" (appear in
2258/// equal lines), then compute `score = shared_bytes * 2 * 100 / (src_size + dst_size)`.
2259fn compute_similarity(old: &[u8], new: &[u8]) -> u32 {
2260    // Normalize CRLF → LF before comparing so that files differing
2261    // only in line endings are detected as renames.
2262    let old_norm = crate::crlf::crlf_to_lf(old);
2263    let new_norm = crate::crlf::crlf_to_lf(new);
2264
2265    let src_size = old_norm.len();
2266    let dst_size = new_norm.len();
2267
2268    if src_size == 0 && dst_size == 0 {
2269        return 100;
2270    }
2271    let total = src_size + dst_size;
2272    if total == 0 {
2273        return 100;
2274    }
2275
2276    // Use line-level diff to find shared content, then count bytes.
2277    use similar::{ChangeTag, TextDiff};
2278    let old_str = String::from_utf8_lossy(&old_norm);
2279    let new_str = String::from_utf8_lossy(&new_norm);
2280    let diff = TextDiff::from_lines(&old_str as &str, &new_str as &str);
2281
2282    let mut shared_bytes = 0usize;
2283    for change in diff.iter_all_changes() {
2284        if change.tag() == ChangeTag::Equal {
2285            // Count bytes in the matching line (including newline).
2286            shared_bytes += change.value().len();
2287        }
2288    }
2289
2290    // Git: score = copied * MAX_SCORE / max(src_size, dst_size)
2291    // We normalize to 0-100.
2292    let max_size = src_size.max(dst_size);
2293
2294    ((shared_bytes * 100) / max_size).min(100) as u32
2295}
2296
2297/// Compute rename/copy similarity percentage (0–100) between two byte slices.
2298///
2299/// This uses the same scoring logic as internal rename detection.
2300#[must_use]
2301pub fn rename_similarity_score(old: &[u8], new: &[u8]) -> u32 {
2302    compute_similarity(old, new)
2303}
2304
2305// ── Output formatting ───────────────────────────────────────────────
2306
2307/// Format a diff entry in Git's raw diff format.
2308///
2309/// Example: `:100644 100644 abc1234... def5678... M\tfile.txt`
2310pub fn format_raw(entry: &DiffEntry) -> String {
2311    let path = match entry.status {
2312        DiffStatus::Renamed | DiffStatus::Copied => {
2313            format!(
2314                "{}\t{}",
2315                entry.old_path.as_deref().unwrap_or(""),
2316                entry.new_path.as_deref().unwrap_or("")
2317            )
2318        }
2319        _ => entry.path().to_owned(),
2320    };
2321
2322    let status_str = match (entry.status, entry.score) {
2323        (DiffStatus::Renamed, Some(s)) => format!("R{:03}", s),
2324        (DiffStatus::Copied, Some(s)) => format!("C{:03}", s),
2325        _ => entry.status.letter().to_string(),
2326    };
2327
2328    format!(
2329        ":{} {} {} {} {}\t{}",
2330        entry.old_mode, entry.new_mode, entry.old_oid, entry.new_oid, status_str, path
2331    )
2332}
2333
2334/// Format a diff entry with abbreviated OIDs.
2335pub fn format_raw_abbrev(entry: &DiffEntry, abbrev_len: usize) -> String {
2336    let ellipsis = if std::env::var("GIT_PRINT_SHA1_ELLIPSIS").ok().as_deref() == Some("yes") {
2337        "..."
2338    } else {
2339        ""
2340    };
2341    let old_hex = format!("{}", entry.old_oid);
2342    let new_hex = format!("{}", entry.new_oid);
2343    let old_abbrev = &old_hex[..abbrev_len.min(old_hex.len())];
2344    let new_abbrev = &new_hex[..abbrev_len.min(new_hex.len())];
2345
2346    let path = entry.path();
2347
2348    format!(
2349        ":{} {} {}{} {}{} {}\t{}",
2350        entry.old_mode,
2351        entry.new_mode,
2352        old_abbrev,
2353        ellipsis,
2354        new_abbrev,
2355        ellipsis,
2356        entry.status.letter(),
2357        path
2358    )
2359}
2360
2361/// Generate a unified diff patch for two blobs.
2362///
2363/// # Parameters
2364///
2365/// - `old_content` — the old file content (empty for added files).
2366/// - `new_content` — the new file content (empty for deleted files).
2367/// - `old_path` — display path for the old side.
2368/// - `new_path` — display path for the new side.
2369/// - `context_lines` — number of context lines around changes (default: 3).
2370/// - Inter-hunk context defaults to `0` (see [`unified_diff_with_prefix`]).
2371///
2372/// # Returns
2373///
2374/// The unified diff as a string.
2375pub fn unified_diff(
2376    old_content: &str,
2377    new_content: &str,
2378    old_path: &str,
2379    new_path: &str,
2380    context_lines: usize,
2381    indent_heuristic: bool,
2382    quote_path_fully: bool,
2383) -> String {
2384    unified_diff_with_prefix(
2385        old_content,
2386        new_content,
2387        old_path,
2388        new_path,
2389        context_lines,
2390        0,
2391        "a/",
2392        "b/",
2393        indent_heuristic,
2394        quote_path_fully,
2395    )
2396}
2397
2398/// Same as `unified_diff` but with configurable source/destination prefixes.
2399///
2400/// `inter_hunk_context` is Git's `--inter-hunk-context`: adjacent hunks merge when
2401/// the unchanged gap between them is at most `2 * context_lines + inter_hunk_context` lines.
2402#[allow(clippy::too_many_arguments)] // Mirrors Git-style unified diff parameters.
2403pub fn unified_diff_with_prefix(
2404    old_content: &str,
2405    new_content: &str,
2406    old_path: &str,
2407    new_path: &str,
2408    context_lines: usize,
2409    inter_hunk_context: usize,
2410    src_prefix: &str,
2411    dst_prefix: &str,
2412    indent_heuristic: bool,
2413    quote_path_fully: bool,
2414) -> String {
2415    unified_diff_with_prefix_and_funcname(
2416        old_content,
2417        new_content,
2418        old_path,
2419        new_path,
2420        context_lines,
2421        inter_hunk_context,
2422        src_prefix,
2423        dst_prefix,
2424        None,
2425        indent_heuristic,
2426        quote_path_fully,
2427    )
2428}
2429
2430/// Same as [`unified_diff_with_prefix`] with optional custom hunk-header
2431/// function-name matching.
2432#[allow(clippy::too_many_arguments)]
2433pub fn unified_diff_with_prefix_and_funcname(
2434    old_content: &str,
2435    new_content: &str,
2436    old_path: &str,
2437    new_path: &str,
2438    context_lines: usize,
2439    inter_hunk_context: usize,
2440    src_prefix: &str,
2441    dst_prefix: &str,
2442    funcname_matcher: Option<&FuncnameMatcher>,
2443    indent_heuristic: bool,
2444    quote_path_fully: bool,
2445) -> String {
2446    unified_diff_with_prefix_and_funcname_and_algorithm(
2447        old_content,
2448        new_content,
2449        old_path,
2450        new_path,
2451        context_lines,
2452        inter_hunk_context,
2453        src_prefix,
2454        dst_prefix,
2455        funcname_matcher,
2456        similar::Algorithm::Myers,
2457        false,
2458        false,
2459        indent_heuristic,
2460        quote_path_fully,
2461    )
2462}
2463
2464/// Same as [`unified_diff_with_prefix_and_funcname`] but allows callers to
2465/// choose the line diff algorithm used for hunk generation.
2466///
2467/// When `function_context` is true (`git diff -W`), hunks are expanded to
2468/// whole logical functions using the same rules as Git's `XDL_EMIT_FUNCCONTEXT`.
2469#[allow(clippy::too_many_arguments)]
2470pub fn unified_diff_with_prefix_and_funcname_and_algorithm(
2471    old_content: &str,
2472    new_content: &str,
2473    old_path: &str,
2474    new_path: &str,
2475    context_lines: usize,
2476    inter_hunk_context: usize,
2477    src_prefix: &str,
2478    dst_prefix: &str,
2479    funcname_matcher: Option<&FuncnameMatcher>,
2480    algorithm: similar::Algorithm,
2481    function_context: bool,
2482    use_git_histogram: bool,
2483    indent_heuristic: bool,
2484    quote_path_fully: bool,
2485) -> String {
2486    if use_git_histogram {
2487        return unified_diff_histogram_with_prefix_and_funcname(
2488            old_content,
2489            new_content,
2490            old_path,
2491            new_path,
2492            context_lines,
2493            inter_hunk_context,
2494            src_prefix,
2495            dst_prefix,
2496            funcname_matcher,
2497            quote_path_fully,
2498        );
2499    }
2500
2501    if function_context {
2502        return unified_diff_with_function_context(
2503            old_content,
2504            new_content,
2505            old_path,
2506            new_path,
2507            context_lines,
2508            inter_hunk_context,
2509            src_prefix,
2510            dst_prefix,
2511            funcname_matcher,
2512            algorithm,
2513            indent_heuristic,
2514            quote_path_fully,
2515        );
2516    }
2517
2518    use crate::quote_path::format_diff_path_with_prefix;
2519    use similar::{group_diff_ops, udiff::UnifiedDiffHunk, TextDiff};
2520
2521    let diff = TextDiff::configure()
2522        .algorithm(algorithm)
2523        .diff_lines(old_content, new_content);
2524    let compacted_ops = diff_indent_heuristic::diff_lines_ops_compacted(
2525        old_content,
2526        new_content,
2527        algorithm,
2528        indent_heuristic,
2529    );
2530
2531    let mut output = String::new();
2532    if old_path == "/dev/null" {
2533        output.push_str("--- /dev/null\n");
2534    } else if src_prefix.is_empty() {
2535        // Callers (e.g. `diff-tree`, `diff-index`) may pass a fully formatted token
2536        // (already includes `a/` and any C-style quoting).
2537        output.push_str(&format!("--- {old_path}\n"));
2538    } else {
2539        output.push_str("--- ");
2540        output.push_str(&format_diff_path_with_prefix(
2541            src_prefix,
2542            old_path,
2543            quote_path_fully,
2544        ));
2545        output.push('\n');
2546    }
2547    if new_path == "/dev/null" {
2548        output.push_str("+++ /dev/null\n");
2549    } else if dst_prefix.is_empty() {
2550        output.push_str(&format!("+++ {new_path}\n"));
2551    } else {
2552        output.push_str("+++ ");
2553        output.push_str(&format_diff_path_with_prefix(
2554            dst_prefix,
2555            new_path,
2556            quote_path_fully,
2557        ));
2558        output.push('\n');
2559    }
2560
2561    let old_lines: Vec<&str> = old_content.lines().collect();
2562
2563    // `similar::group_diff_ops` ends a hunk when an unchanged run has length > `2 * n`.
2564    // Git's xdiff merges adjacent changes while the gap between them in the old file is at most
2565    // `2 * context_lines + inter_hunk_context` (see `xdl_get_hunk` in xemit.c). Match that by
2566    // choosing `n` so `2 * n` equals that maximum merged gap (rounded up when the sum is odd).
2567    let max_common_gap = context_lines
2568        .saturating_mul(2)
2569        .saturating_add(inter_hunk_context);
2570    let group_radius = max_common_gap.div_ceil(2);
2571    let op_groups = group_diff_ops(compacted_ops, group_radius);
2572
2573    for ops in op_groups {
2574        if ops.is_empty() {
2575            continue;
2576        }
2577        let hunk = UnifiedDiffHunk::new(ops, &diff, true);
2578        let hunk_str = format!("{hunk}");
2579        // The similar crate outputs @@ -a,b +c,d @@\n but Git adds
2580        // function context after the closing @@. Extract the hunk header
2581        // and add function context.
2582        if let Some(first_newline) = hunk_str.find('\n') {
2583            let header_line = &hunk_str[..first_newline];
2584            let rest = &hunk_str[first_newline..];
2585
2586            // Parse the old start line from the @@ header
2587            if let Some(func_ctx) =
2588                extract_function_context(header_line, &old_lines, funcname_matcher)
2589            {
2590                output.push_str(header_line);
2591                output.push(' ');
2592                output.push_str(&func_ctx);
2593                output.push_str(rest);
2594            } else {
2595                output.push_str(&hunk_str);
2596            }
2597        } else {
2598            output.push_str(&hunk_str);
2599        }
2600    }
2601
2602    output
2603}
2604
2605/// `git diff -W`: expand each hunk to include full function bodies (see Git `xemit.c`).
2606fn unified_diff_with_function_context(
2607    old_content: &str,
2608    new_content: &str,
2609    old_path: &str,
2610    new_path: &str,
2611    context_lines: usize,
2612    inter_hunk_context: usize,
2613    src_prefix: &str,
2614    dst_prefix: &str,
2615    funcname_matcher: Option<&FuncnameMatcher>,
2616    algorithm: similar::Algorithm,
2617    indent_heuristic: bool,
2618    quote_path_fully: bool,
2619) -> String {
2620    use crate::quote_path::format_diff_path_with_prefix;
2621    use similar::{group_diff_ops, udiff::UnifiedDiffHunk, TextDiff};
2622
2623    let diff = TextDiff::configure()
2624        .algorithm(algorithm)
2625        .diff_lines(old_content, new_content);
2626
2627    let old_lines: Vec<&str> = old_content.lines().collect();
2628    let new_lines: Vec<&str> = new_content.lines().collect();
2629    let n_old = old_lines.len();
2630    let n_new = new_lines.len();
2631
2632    let group_radius = context_lines
2633        .saturating_mul(2)
2634        .saturating_add(inter_hunk_context);
2635    let all_ops = diff.ops().to_vec();
2636    let op_groups = group_diff_ops(all_ops.clone(), group_radius);
2637
2638    let mut ranges: Vec<(usize, usize, usize, usize)> = Vec::new();
2639
2640    for ops in op_groups {
2641        if ops.is_empty() {
2642            continue;
2643        }
2644        let i1_anchor = func_context_old_anchor(&ops, n_old);
2645        let i1_end = hunk_old_change_end_exclusive(&ops);
2646        let skip_preimage_pull =
2647            append_with_whole_function_added(&ops, n_old, n_new, &new_lines, funcname_matcher);
2648        let hunk = UnifiedDiffHunk::new(ops, &diff, true);
2649        let hunk_str = format!("{hunk}");
2650        let header_line = hunk_str
2651            .lines()
2652            .next()
2653            .unwrap_or("")
2654            .trim_end_matches(['\r', '\n']);
2655        let Some((base_s1, base_e1, _base_s2, _base_e2)) =
2656            parse_unified_hunk_header_ranges(header_line)
2657        else {
2658            continue;
2659        };
2660
2661        let ctx = context_lines;
2662        let (s1, e1, s2, e2) = if skip_preimage_pull {
2663            let s = n_old.saturating_sub(ctx);
2664            let s2 = map_old_line_to_new(&all_ops, s, n_new).min(n_new);
2665            (s, n_old, s2, n_new)
2666        } else {
2667            let mut s1 = base_s1.saturating_sub(ctx);
2668            let mut s2 = map_old_line_to_new(&all_ops, s1, n_new).min(n_new);
2669
2670            let base_pre_s1 = i1_anchor.saturating_sub(ctx);
2671            if base_pre_s1 < s1 {
2672                s1 = base_pre_s1;
2673                s2 = map_old_line_to_new(&all_ops, s1, n_new).min(n_new);
2674            }
2675
2676            let fs1 = expand_func_pre_start(s1, i1_anchor, n_old, &old_lines, funcname_matcher);
2677            if fs1 < s1 {
2678                s1 = fs1;
2679                s2 = map_old_line_to_new(&all_ops, s1, n_new).min(n_new);
2680            }
2681
2682            let mut e1 = (base_e1 + ctx).min(n_old);
2683            let mut e2 = map_old_line_to_new(&all_ops, e1, n_new).min(n_new);
2684            let fe1 = expand_func_post_end(e1, i1_end, n_old, &old_lines, funcname_matcher);
2685            if fe1 > e1 {
2686                e1 = fe1;
2687                e2 = map_old_line_to_new(&all_ops, e1, n_new).min(n_new);
2688            }
2689            (s1, e1, s2, e2)
2690        };
2691
2692        ranges.push((s1, e1, s2, e2));
2693    }
2694
2695    let mut output = String::new();
2696    if old_path == "/dev/null" {
2697        output.push_str("--- /dev/null\n");
2698    } else if src_prefix.is_empty() {
2699        output.push_str(&format!("--- {old_path}\n"));
2700    } else {
2701        output.push_str("--- ");
2702        output.push_str(&format_diff_path_with_prefix(
2703            src_prefix,
2704            old_path,
2705            quote_path_fully,
2706        ));
2707        output.push('\n');
2708    }
2709    if new_path == "/dev/null" {
2710        output.push_str("+++ /dev/null\n");
2711    } else if dst_prefix.is_empty() {
2712        output.push_str(&format!("+++ {new_path}\n"));
2713    } else {
2714        output.push_str("+++ ");
2715        output.push_str(&format_diff_path_with_prefix(
2716            dst_prefix,
2717            new_path,
2718            quote_path_fully,
2719        ));
2720        output.push('\n');
2721    }
2722
2723    for (s1, e1, s2, e2) in ranges {
2724        if s1 >= e1 && s2 >= e2 {
2725            continue;
2726        }
2727        let old_seg =
2728            line_slice_for_diff_with_eof_nl(&old_lines, s1, e1, old_content.ends_with('\n'));
2729        let new_seg =
2730            line_slice_for_diff_with_eof_nl(&new_lines, s2, e2, new_content.ends_with('\n'));
2731        let inner_ctx = old_seg.lines().count().max(new_seg.lines().count()).max(1);
2732        let piece = unified_diff_with_prefix_and_funcname_and_algorithm(
2733            &old_seg,
2734            &new_seg,
2735            old_path,
2736            new_path,
2737            inner_ctx,
2738            0,
2739            src_prefix,
2740            dst_prefix,
2741            funcname_matcher,
2742            algorithm,
2743            false,
2744            false,
2745            indent_heuristic,
2746            quote_path_fully,
2747        );
2748        let shifted = shift_unified_hunk_headers_to_full_file(&piece, s1, s2);
2749        let with_func =
2750            enrich_unified_hunk_headers_funcname(&shifted, &old_lines, funcname_matcher);
2751        for line in with_func.lines() {
2752            if line.starts_with("--- ") || line.starts_with("+++ ") {
2753                continue;
2754            }
2755            output.push_str(line);
2756            output.push('\n');
2757        }
2758    }
2759
2760    output
2761}
2762
2763/// `piece` is a unified diff for a slice of the file; hunk headers use 1-based
2764/// coordinates relative to that slice. Shift them by `delta_old` / `delta_new`
2765/// (0-based offsets of the slice in the full file) so the combined patch applies
2766/// to the whole file.
2767fn shift_unified_hunk_headers_to_full_file(
2768    patch: &str,
2769    delta_old: usize,
2770    delta_new: usize,
2771) -> String {
2772    if delta_old == 0 && delta_new == 0 {
2773        return patch.to_owned();
2774    }
2775    let mut out = String::with_capacity(patch.len());
2776    for line in patch.lines() {
2777        if let Some(shifted) = shift_one_unified_hunk_header(line, delta_old, delta_new) {
2778            out.push_str(&shifted);
2779        } else {
2780            out.push_str(line);
2781        }
2782        out.push('\n');
2783    }
2784    out
2785}
2786
2787fn shift_one_unified_hunk_header(line: &str, delta_old: usize, delta_new: usize) -> Option<String> {
2788    let rest = line.strip_prefix("@@ ")?;
2789    let (old_chunk, after_plus) = rest.split_once(" +")?;
2790    let old_spec = old_chunk.strip_prefix('-')?;
2791    let (new_spec, suffix) = after_plus.split_once(" @@")?;
2792    let shifted_old = shift_unified_range_spec(old_spec, delta_old)?;
2793    let shifted_new = shift_unified_range_spec(new_spec, delta_new)?;
2794    Some(format!("@@ -{shifted_old} +{shifted_new} @@{suffix}"))
2795}
2796
2797fn shift_unified_range_spec(spec: &str, delta: usize) -> Option<String> {
2798    let spec = spec.trim();
2799    if let Some((start_s, count_s)) = spec.split_once(',') {
2800        let start: usize = start_s.parse().ok()?;
2801        let count: usize = count_s.parse().ok()?;
2802        Some(format!("{},{}", start.saturating_add(delta), count))
2803    } else {
2804        let start: usize = spec.parse().ok()?;
2805        Some(format!("{}", start.saturating_add(delta)))
2806    }
2807}
2808
2809/// Re-attach `@@ ... @@ <funcname>` using full-file line indices (inner diffs use slices).
2810fn enrich_unified_hunk_headers_funcname(
2811    patch: &str,
2812    full_old_lines: &[&str],
2813    funcname_matcher: Option<&FuncnameMatcher>,
2814) -> String {
2815    let mut out = String::with_capacity(patch.len());
2816    for line in patch.lines() {
2817        if let Some(fixed) = enrich_one_hunk_header_funcname(line, full_old_lines, funcname_matcher)
2818        {
2819            out.push_str(&fixed);
2820        } else {
2821            out.push_str(line);
2822        }
2823        out.push('\n');
2824    }
2825    out
2826}
2827
2828fn enrich_one_hunk_header_funcname(
2829    line: &str,
2830    full_old_lines: &[&str],
2831    funcname_matcher: Option<&FuncnameMatcher>,
2832) -> Option<String> {
2833    let after_at = line.strip_prefix("@@ ")?;
2834    let idx = after_at.find(" @@")?;
2835    let mid = after_at[..idx].trim();
2836    let tail = after_at[idx + 3..].trim_start();
2837    let header_for_parse = format!("@@ {mid} @@");
2838    let func = extract_function_context(&header_for_parse, full_old_lines, funcname_matcher);
2839    Some(if let Some(f) = func {
2840        format!("@@ {mid} @@ {f}")
2841    } else if !tail.is_empty() {
2842        format!("@@ {mid} @@ {tail}")
2843    } else {
2844        format!("@@ {mid} @@")
2845    })
2846}
2847
2848fn line_slice_for_diff_with_eof_nl(
2849    lines: &[&str],
2850    start: usize,
2851    end: usize,
2852    full_file_ends_with_newline: bool,
2853) -> String {
2854    if start >= end {
2855        return String::new();
2856    }
2857    let mut s = lines[start..end].join("\n");
2858    let slice_is_suffix_of_file = end == lines.len();
2859    let need_trailing_nl = if slice_is_suffix_of_file {
2860        full_file_ends_with_newline
2861    } else {
2862        true
2863    };
2864    if need_trailing_nl && !s.ends_with('\n') {
2865        s.push('\n');
2866    }
2867    s
2868}
2869
2870/// Map a 0-based old line index to the corresponding 0-based new line index using the full-file
2871/// diff ops (Git aligns context across deletions/insertions).
2872fn map_old_line_to_new(ops: &[similar::DiffOp], old_line: usize, n_new: usize) -> usize {
2873    use similar::DiffOp;
2874    let mut n = 0usize;
2875    for op in ops {
2876        match *op {
2877            DiffOp::Equal {
2878                old_index,
2879                new_index,
2880                len,
2881            } => {
2882                if old_index + len <= old_line {
2883                    n = new_index + len;
2884                    continue;
2885                }
2886                if old_index < old_line {
2887                    let take = old_line - old_index;
2888                    return (new_index + take).min(n_new);
2889                }
2890                return new_index.min(n_new);
2891            }
2892            DiffOp::Delete {
2893                old_index,
2894                old_len,
2895                new_index,
2896            } => {
2897                if old_index + old_len <= old_line {
2898                    n = new_index;
2899                    continue;
2900                }
2901                if old_index < old_line {
2902                    return new_index.min(n_new);
2903                }
2904            }
2905            DiffOp::Insert {
2906                old_index,
2907                new_index,
2908                new_len,
2909            } => {
2910                if old_index < old_line {
2911                    n = new_index + new_len;
2912                    continue;
2913                }
2914                if old_index == old_line {
2915                    // `old_line` is an exclusive end or insertion point aligned with this insert
2916                    // (e.g. EOF append maps to after the inserted block).
2917                    return (new_index + new_len).min(n_new);
2918                }
2919                return new_index.min(n_new);
2920            }
2921            DiffOp::Replace {
2922                old_index,
2923                old_len,
2924                new_index,
2925                new_len,
2926            } => {
2927                if old_index + old_len <= old_line {
2928                    n = new_index + new_len;
2929                    continue;
2930                }
2931                if old_index < old_line {
2932                    let into_old = old_line - old_index;
2933                    let mapped = new_index + into_old.min(new_len);
2934                    return mapped.min(n_new);
2935                }
2936                return new_index.min(n_new);
2937            }
2938        }
2939    }
2940    n.min(n_new)
2941}
2942
2943/// Parse `@@ -old +new @@` into 0-based half-open ranges in each file.
2944fn parse_unified_hunk_header_ranges(header: &str) -> Option<(usize, usize, usize, usize)> {
2945    let rest = header.strip_prefix("@@ ")?;
2946    let (old_tok, rest2) = rest.split_once(" +")?;
2947    let old_tok = old_tok.strip_prefix('-')?;
2948    let new_tok = rest2.split_once(" @@").map(|(a, _)| a)?;
2949
2950    fn parse_side(spec: &str) -> Option<(usize, usize)> {
2951        let spec = spec.trim();
2952        let (start_one_based, count) = if let Some((a, b)) = spec.split_once(',') {
2953            (a.parse::<usize>().ok()?, b.parse::<usize>().ok()?)
2954        } else {
2955            let s = spec.parse::<usize>().ok()?;
2956            (s, 1usize)
2957        };
2958        let s0 = start_one_based.saturating_sub(1);
2959        let e0 = s0.saturating_add(count);
2960        Some((s0, e0))
2961    }
2962
2963    let (os, oe) = parse_side(old_tok)?;
2964    let (ns, ne) = parse_side(new_tok)?;
2965    Some((os, oe, ns, ne))
2966}
2967
2968/// Git `xemit.c`: when a hunk only inserts at EOF (first inserted line is `new_index == n_old`)
2969/// and the added text already contains a funcname line, do not pull extra context from the preimage.
2970fn append_with_whole_function_added(
2971    ops: &[similar::DiffOp],
2972    n_old: usize,
2973    n_new: usize,
2974    new_lines: &[&str],
2975    matcher: Option<&FuncnameMatcher>,
2976) -> bool {
2977    use similar::DiffOp;
2978    if n_old == 0 {
2979        return false;
2980    }
2981    let mut only_ins_or_eq = true;
2982    let mut min_new_ins = usize::MAX;
2983    for op in ops {
2984        match *op {
2985            DiffOp::Equal { .. } => {}
2986            DiffOp::Insert {
2987                new_index, new_len, ..
2988            } => {
2989                min_new_ins = min_new_ins.min(new_index);
2990                if new_len == 0 {
2991                    only_ins_or_eq = false;
2992                }
2993            }
2994            DiffOp::Delete { .. } | DiffOp::Replace { .. } => {
2995                only_ins_or_eq = false;
2996            }
2997        }
2998    }
2999    let mut insert_at_eof = false;
3000    for op in ops {
3001        if let DiffOp::Insert { old_index, .. } = *op {
3002            if old_index == n_old {
3003                insert_at_eof = true;
3004                break;
3005            }
3006        }
3007    }
3008    let append_at_eof = min_new_ins == n_old || insert_at_eof;
3009    if !only_ins_or_eq || !append_at_eof || min_new_ins == usize::MAX {
3010        return false;
3011    }
3012    // Git only skips preimage pull when the inserted block is clearly a new logical
3013    // function (see `xemit.c` walking `xdf2` for `is_func_rec`). A loose "any line
3014    // looks like a function" check would match `return` / `printf` and break `-W`
3015    // hunks that still need preimage context (t4051 `extended`).
3016    let mut j = min_new_ins;
3017    while j < n_new {
3018        let line = new_lines[j];
3019        if line.trim().is_empty() {
3020            j += 1;
3021            continue;
3022        }
3023        if let Some(m) = matcher {
3024            if m.match_line(line).is_some() {
3025                return true;
3026            }
3027        } else if inserted_block_starts_with_c_like_function_definition(line) {
3028            return true;
3029        }
3030        j += 1;
3031    }
3032    false
3033}
3034
3035fn inserted_block_starts_with_c_like_function_definition(line: &str) -> bool {
3036    let t = line.trim_start();
3037    let Some(open_paren) = t.find('(') else {
3038        return false;
3039    };
3040    let head = &t[..open_paren];
3041    let tokens: Vec<&str> = head.split_whitespace().collect();
3042    if tokens.len() < 2 {
3043        // `printf(...)`, `return (`, etc. — not `return_type name(`.
3044        return false;
3045    }
3046    let nameish = tokens.last().copied().unwrap_or("");
3047    let name = nameish.trim_end_matches(['*', '&']);
3048    if name.is_empty() || !name.chars().all(|c| c.is_ascii_alphanumeric() || c == '_') {
3049        return false;
3050    }
3051    let type_or_modifier = |tok: &str| {
3052        matches!(
3053            tok,
3054            "static"
3055                | "extern"
3056                | "inline"
3057                | "void"
3058                | "int"
3059                | "char"
3060                | "short"
3061                | "long"
3062                | "float"
3063                | "double"
3064                | "unsigned"
3065                | "signed"
3066                | "struct"
3067                | "enum"
3068                | "union"
3069                | "const"
3070                | "volatile"
3071                | "typedef"
3072        )
3073    };
3074    tokens[..tokens.len() - 1]
3075        .iter()
3076        .any(|tok| type_or_modifier(tok))
3077}
3078
3079fn hunk_old_change_end_exclusive(ops: &[similar::DiffOp]) -> usize {
3080    use similar::DiffOp;
3081    let mut max_o = 0usize;
3082    for op in ops {
3083        match *op {
3084            DiffOp::Delete {
3085                old_index, old_len, ..
3086            } => {
3087                max_o = max_o.max(old_index + old_len);
3088            }
3089            DiffOp::Replace {
3090                old_index, old_len, ..
3091            } => {
3092                max_o = max_o.max(old_index + old_len);
3093            }
3094            DiffOp::Insert { old_index, .. } => {
3095                // Pure insertions do not consume old lines; Git's post-context anchor is the
3096                // insertion point (`old_index`), not 0 (t4051 `extended`).
3097                max_o = max_o.max(old_index);
3098            }
3099            DiffOp::Equal { .. } => {}
3100        }
3101    }
3102    max_o
3103}
3104
3105fn func_context_old_anchor(ops: &[similar::DiffOp], n_old: usize) -> usize {
3106    use similar::DiffOp;
3107    let mut has_delete_or_replace = false;
3108    let mut min_del = usize::MAX;
3109    let mut min_ins_old = usize::MAX;
3110
3111    for op in ops {
3112        match *op {
3113            DiffOp::Delete {
3114                old_index, old_len, ..
3115            } => {
3116                has_delete_or_replace = true;
3117                min_del = min_del.min(old_index);
3118                min_del = min_del.min(old_index + old_len.saturating_sub(1));
3119            }
3120            DiffOp::Replace {
3121                old_index, old_len, ..
3122            } => {
3123                has_delete_or_replace = true;
3124                min_del = min_del.min(old_index);
3125                min_del = min_del.min(old_index + old_len.saturating_sub(1));
3126            }
3127            DiffOp::Insert { old_index, .. } => {
3128                min_ins_old = min_ins_old.min(old_index);
3129            }
3130            DiffOp::Equal { .. } => {}
3131        }
3132    }
3133
3134    let mut i1 = if has_delete_or_replace {
3135        min_del
3136    } else if min_ins_old != usize::MAX {
3137        min_ins_old
3138    } else {
3139        0
3140    };
3141
3142    let pure_insert = ops
3143        .iter()
3144        .all(|op| matches!(op, DiffOp::Insert { .. } | DiffOp::Equal { .. }))
3145        && ops.iter().any(|op| matches!(op, DiffOp::Insert { .. }));
3146
3147    if pure_insert && i1 >= n_old && n_old > 0 {
3148        i1 = n_old - 1;
3149    }
3150
3151    i1.min(n_old.saturating_sub(1))
3152}
3153
3154fn expand_func_pre_start(
3155    s1: usize,
3156    i1: usize,
3157    n_old: usize,
3158    old_lines: &[&str],
3159    matcher: Option<&FuncnameMatcher>,
3160) -> usize {
3161    if n_old == 0 {
3162        return s1;
3163    }
3164    let i1 = i1.min(n_old.saturating_sub(1));
3165    let mut fs1 = get_func_line_backward(old_lines, i1, matcher).unwrap_or(i1);
3166    while fs1 > 0
3167        && !is_line_empty_for_func_context(old_lines[fs1 - 1])
3168        && !is_func_line(old_lines[fs1 - 1], matcher)
3169    {
3170        fs1 -= 1;
3171    }
3172    s1.min(fs1)
3173}
3174
3175fn expand_func_post_end(
3176    e1: usize,
3177    i1_end: usize,
3178    n_old: usize,
3179    old_lines: &[&str],
3180    matcher: Option<&FuncnameMatcher>,
3181) -> usize {
3182    let from = i1_end.min(n_old);
3183    let fe1 = get_func_line_forward(old_lines, from, matcher).unwrap_or(n_old);
3184    let mut fe1_adj = fe1;
3185    while fe1_adj > 0 && is_line_empty_for_func_context(old_lines[fe1_adj - 1]) {
3186        fe1_adj -= 1;
3187    }
3188    e1.max(fe1_adj).min(n_old)
3189}
3190
3191fn is_line_empty_for_func_context(line: &str) -> bool {
3192    line.chars().all(|c| c.is_whitespace())
3193}
3194
3195fn is_func_line(line: &str, matcher: Option<&FuncnameMatcher>) -> bool {
3196    if let Some(m) = matcher {
3197        return m.match_line(line).is_some();
3198    }
3199    let t = line.trim_end_matches(['\n', '\r']);
3200    if t.is_empty() {
3201        return false;
3202    }
3203    let b = t.as_bytes()[0];
3204    b.is_ascii_alphabetic() || b == b'_' || b == b'$'
3205}
3206
3207fn get_func_line_backward(
3208    old_lines: &[&str],
3209    start: usize,
3210    matcher: Option<&FuncnameMatcher>,
3211) -> Option<usize> {
3212    let mut l = start.min(old_lines.len().saturating_sub(1));
3213    if old_lines.is_empty() {
3214        return None;
3215    }
3216    loop {
3217        if is_func_line(old_lines[l], matcher) {
3218            return Some(l);
3219        }
3220        if l == 0 {
3221            break;
3222        }
3223        l -= 1;
3224    }
3225    None
3226}
3227
3228fn get_func_line_forward(
3229    old_lines: &[&str],
3230    start: usize,
3231    matcher: Option<&FuncnameMatcher>,
3232) -> Option<usize> {
3233    let mut l = start;
3234    while l < old_lines.len() {
3235        if is_func_line(old_lines[l], matcher) {
3236            return Some(l);
3237        }
3238        l += 1;
3239    }
3240    None
3241}
3242
3243/// Compute a unified diff with anchored lines.
3244///
3245/// Anchored lines that appear exactly once in both old and new content are
3246/// forced to match, splitting the diff into segments around those anchor points.
3247/// This produces diffs where the anchored text stays as context and surrounding
3248/// lines are shown as additions/removals.
3249///
3250/// Segment diffs use `algorithm`. When `use_git_histogram` is true, histogram uses imara-diff
3251/// (Git-compatible); otherwise `algorithm` is passed to `similar`.
3252pub fn anchored_unified_diff(
3253    old_content: &str,
3254    new_content: &str,
3255    old_path: &str,
3256    new_path: &str,
3257    context_lines: usize,
3258    anchors: &[String],
3259    algorithm: similar::Algorithm,
3260    use_git_histogram: bool,
3261    indent_heuristic: bool,
3262    quote_path_fully: bool,
3263) -> String {
3264    use crate::quote_path::format_diff_path_with_prefix;
3265    use similar::TextDiff;
3266
3267    let old_lines: Vec<&str> = old_content.lines().collect();
3268    let new_lines: Vec<&str> = new_content.lines().collect();
3269
3270    // Find anchored lines that appear exactly once in both old and new
3271    let mut anchor_pairs: Vec<(usize, usize)> = Vec::new(); // (old_idx, new_idx)
3272
3273    for anchor in anchors {
3274        let anchor_str = anchor.as_str();
3275
3276        // Count occurrences in old
3277        let old_positions: Vec<usize> = old_lines
3278            .iter()
3279            .enumerate()
3280            .filter(|(_, l)| l.trim_end() == anchor_str)
3281            .map(|(i, _)| i)
3282            .collect();
3283
3284        // Count occurrences in new
3285        let new_positions: Vec<usize> = new_lines
3286            .iter()
3287            .enumerate()
3288            .filter(|(_, l)| l.trim_end() == anchor_str)
3289            .map(|(i, _)| i)
3290            .collect();
3291
3292        // Only anchor if unique in both
3293        if old_positions.len() == 1 && new_positions.len() == 1 {
3294            anchor_pairs.push((old_positions[0], new_positions[0]));
3295        }
3296    }
3297
3298    // If no valid anchors, fall back to normal diff
3299    if anchor_pairs.is_empty() {
3300        return unified_diff_with_prefix_and_funcname_and_algorithm(
3301            old_content,
3302            new_content,
3303            old_path,
3304            new_path,
3305            context_lines,
3306            0,
3307            "a/",
3308            "b/",
3309            None,
3310            algorithm,
3311            false,
3312            use_git_histogram,
3313            indent_heuristic,
3314            quote_path_fully,
3315        );
3316    }
3317
3318    // Sort anchor pairs by their position in the old file
3319    anchor_pairs.sort_by_key(|&(old_idx, _)| old_idx);
3320
3321    // Filter to only keep pairs where new positions are also increasing
3322    // (longest increasing subsequence of new positions)
3323    let mut filtered: Vec<(usize, usize)> = Vec::new();
3324    for &pair in &anchor_pairs {
3325        if filtered.is_empty() || filtered.last().is_some_and(|last| pair.1 > last.1) {
3326            filtered.push(pair);
3327        }
3328    }
3329    let anchor_pairs = filtered;
3330
3331    // Build a modified version of old/new where we diff segments between anchors.
3332    // We'll construct the diff by processing segments:
3333    // - Before first anchor
3334    // - Between consecutive anchors
3335    // - After last anchor
3336    // Each anchor line itself is a fixed context match.
3337
3338    // Collect all diff operations
3339    struct LineDiffOp {
3340        tag: char, // ' ', '+', '-'
3341        line: String,
3342    }
3343
3344    let append_segment_diff =
3345        |ops: &mut Vec<LineDiffOp>, old_seg_input: &str, new_seg_input: &str| {
3346            use similar::ChangeTag;
3347            let old_ls: Vec<&str> = old_seg_input.lines().collect();
3348            let new_ls: Vec<&str> = new_seg_input.lines().collect();
3349            if old_ls.is_empty() && new_ls.is_empty() {
3350                return;
3351            }
3352            let seg_diff = TextDiff::configure()
3353                .algorithm(algorithm)
3354                .diff_slices(&old_ls, &new_ls);
3355            let raw = seg_diff.ops().to_vec();
3356            let compacted = diff_indent_heuristic::apply_change_compact_to_ops(
3357                &raw,
3358                &old_ls,
3359                &new_ls,
3360                indent_heuristic,
3361            );
3362            for op in &compacted {
3363                for ch in op.iter_changes(&old_ls, &new_ls) {
3364                    let t = match ch.tag() {
3365                        ChangeTag::Equal => ' ',
3366                        ChangeTag::Delete => '-',
3367                        ChangeTag::Insert => '+',
3368                    };
3369                    ops.push(LineDiffOp {
3370                        tag: t,
3371                        line: ch.value().to_string(),
3372                    });
3373                }
3374            }
3375        };
3376
3377    let mut ops: Vec<LineDiffOp> = Vec::new();
3378    let mut old_pos = 0usize;
3379    let mut new_pos = 0usize;
3380
3381    for &(old_anchor, new_anchor) in &anchor_pairs {
3382        // Diff the segment before this anchor
3383        let old_segment: Vec<&str> = old_lines[old_pos..old_anchor].to_vec();
3384        let new_segment: Vec<&str> = new_lines[new_pos..new_anchor].to_vec();
3385
3386        let old_seg_text = old_segment.join("\n");
3387        let new_seg_text = new_segment.join("\n");
3388
3389        if !old_seg_text.is_empty() || !new_seg_text.is_empty() {
3390            let old_seg_input = if old_seg_text.is_empty() {
3391                String::new()
3392            } else {
3393                format!("{}\n", old_seg_text)
3394            };
3395            let new_seg_input = if new_seg_text.is_empty() {
3396                String::new()
3397            } else {
3398                format!("{}\n", new_seg_text)
3399            };
3400            append_segment_diff(&mut ops, &old_seg_input, &new_seg_input);
3401        }
3402
3403        // The anchor line itself is always context
3404        ops.push(LineDiffOp {
3405            tag: ' ',
3406            line: old_lines[old_anchor].to_string(),
3407        });
3408
3409        old_pos = old_anchor + 1;
3410        new_pos = new_anchor + 1;
3411    }
3412
3413    // Diff the remaining segment after the last anchor
3414    let old_segment: Vec<&str> = old_lines[old_pos..].to_vec();
3415    let new_segment: Vec<&str> = new_lines[new_pos..].to_vec();
3416    let old_seg_text = old_segment.join("\n");
3417    let new_seg_text = new_segment.join("\n");
3418
3419    if !old_seg_text.is_empty() || !new_seg_text.is_empty() {
3420        let old_seg_input = if old_seg_text.is_empty() {
3421            String::new()
3422        } else {
3423            format!("{}\n", old_seg_text)
3424        };
3425        let new_seg_input = if new_seg_text.is_empty() {
3426            String::new()
3427        } else {
3428            format!("{}\n", new_seg_text)
3429        };
3430        append_segment_diff(&mut ops, &old_seg_input, &new_seg_input);
3431    }
3432
3433    // Now format as unified diff with hunks
3434    let mut output = String::new();
3435    if old_path == "/dev/null" {
3436        output.push_str("--- /dev/null\n");
3437    } else {
3438        output.push_str("--- ");
3439        output.push_str(&format_diff_path_with_prefix(
3440            "a/",
3441            old_path,
3442            quote_path_fully,
3443        ));
3444        output.push('\n');
3445    }
3446    if new_path == "/dev/null" {
3447        output.push_str("+++ /dev/null\n");
3448    } else {
3449        output.push_str("+++ ");
3450        output.push_str(&format_diff_path_with_prefix(
3451            "b/",
3452            new_path,
3453            quote_path_fully,
3454        ));
3455        output.push('\n');
3456    }
3457
3458    // Group ops into hunks with context
3459    let total_ops = ops.len();
3460    if total_ops == 0 {
3461        return output;
3462    }
3463
3464    // Find ranges of changes
3465    let mut hunks: Vec<(usize, usize)> = Vec::new(); // (start, end) indices into ops
3466    let mut i = 0;
3467    while i < total_ops {
3468        if ops[i].tag != ' ' {
3469            let start = i.saturating_sub(context_lines);
3470            let mut end = i;
3471            // Extend to include consecutive changes and their context
3472            while end < total_ops {
3473                if ops[end].tag != ' ' {
3474                    end += 1;
3475                    continue;
3476                }
3477                // Check if there's another change within context_lines
3478                let mut next_change = end;
3479                while next_change < total_ops && ops[next_change].tag == ' ' {
3480                    next_change += 1;
3481                }
3482                if next_change < total_ops && next_change - end <= context_lines * 2 {
3483                    end = next_change + 1;
3484                } else {
3485                    end = (end + context_lines).min(total_ops);
3486                    break;
3487                }
3488            }
3489            // Merge with previous hunk if overlapping
3490            if let Some(last) = hunks.last_mut() {
3491                if start <= last.1 {
3492                    last.1 = end;
3493                } else {
3494                    hunks.push((start, end));
3495                }
3496            } else {
3497                hunks.push((start, end));
3498            }
3499            i = end;
3500        } else {
3501            i += 1;
3502        }
3503    }
3504
3505    // Output each hunk
3506    for (start, end) in hunks {
3507        // Count old/new lines in this hunk
3508        let mut old_start = 1usize;
3509        let mut new_start = 1usize;
3510        // Calculate line numbers by counting ops before this hunk
3511        for op in &ops[..start] {
3512            match op.tag {
3513                ' ' => {
3514                    old_start += 1;
3515                    new_start += 1;
3516                }
3517                '-' => {
3518                    old_start += 1;
3519                }
3520                '+' => {
3521                    new_start += 1;
3522                }
3523                _ => {}
3524            }
3525        }
3526        let mut old_count = 0usize;
3527        let mut new_count = 0usize;
3528        for op in &ops[start..end] {
3529            match op.tag {
3530                ' ' => {
3531                    old_count += 1;
3532                    new_count += 1;
3533                }
3534                '-' => {
3535                    old_count += 1;
3536                }
3537                '+' => {
3538                    new_count += 1;
3539                }
3540                _ => {}
3541            }
3542        }
3543
3544        output.push_str(&format!(
3545            "@@ -{},{} +{},{} @@\n",
3546            old_start, old_count, new_start, new_count
3547        ));
3548        for op in &ops[start..end] {
3549            output.push(op.tag);
3550            output.push_str(&op.line);
3551            output.push('\n');
3552        }
3553    }
3554
3555    output
3556}
3557
3558/// Extract function context for a hunk header.
3559///
3560/// Given a hunk header like `@@ -8,7 +8,7 @@`, find the last line
3561/// before line 8 in the old content that looks like a function header
3562/// (starts with a non-whitespace character, like Git's default).
3563fn extract_function_context(
3564    header: &str,
3565    old_lines: &[&str],
3566    funcname_matcher: Option<&FuncnameMatcher>,
3567) -> Option<String> {
3568    // Parse the old start line number from "@@ -<start>,<count> ..."
3569    let at_pos = header.find("-")?;
3570    let rest = &header[at_pos + 1..];
3571    let comma_or_space = rest.find([',', ' '])?;
3572    let start_str = &rest[..comma_or_space];
3573    let start_line: usize = start_str.parse().ok()?;
3574
3575    if start_line <= 1 {
3576        return None;
3577    }
3578
3579    // Look backwards from the line before the hunk start for a line that
3580    // starts with a non-whitespace character (Git's default funcname pattern).
3581    // start_line is 1-indexed, so the hunk starts at old_lines[start_line-1].
3582    // We want to look at lines before that: old_lines[0..start_line-1].
3583    let search_end = (start_line - 1).min(old_lines.len());
3584    let truncate = |text: &str| {
3585        if text.len() > 80 {
3586            let mut end = 80;
3587            while end > 0 && !text.is_char_boundary(end) {
3588                end -= 1;
3589            }
3590            text[..end].to_owned()
3591        } else {
3592            text.to_owned()
3593        }
3594    };
3595
3596    for i in (0..search_end).rev() {
3597        let line = old_lines[i];
3598        if line.is_empty() {
3599            continue;
3600        }
3601        if let Some(matcher) = funcname_matcher {
3602            if let Some(matched) = matcher.match_line(line) {
3603                return Some(truncate(&matched));
3604            }
3605            continue;
3606        }
3607
3608        let first = line.as_bytes()[0];
3609        if first.is_ascii_alphabetic() || first == b'_' || first == b'$' {
3610            return Some(truncate(line.trim_end_matches(char::is_whitespace)));
3611        }
3612    }
3613    None
3614}
3615
3616/// Generate diff stat output (file name + insertions/deletions).
3617///
3618/// Returns a single line like: ` file.txt | 5 ++---`
3619pub fn format_stat_line(
3620    path: &str,
3621    insertions: usize,
3622    deletions: usize,
3623    max_path_len: usize,
3624) -> String {
3625    format_stat_line_width(path, insertions, deletions, max_path_len, 0)
3626}
3627
3628pub fn format_stat_line_width(
3629    path: &str,
3630    insertions: usize,
3631    deletions: usize,
3632    max_path_len: usize,
3633    count_width: usize,
3634) -> String {
3635    let total = insertions + deletions;
3636    let plus = "+".repeat(insertions.min(50));
3637    let minus = "-".repeat(deletions.min(50));
3638    let cw = if count_width > 0 {
3639        count_width
3640    } else {
3641        format!("{}", total).len()
3642    };
3643    let bar = format!("{}{}", plus, minus);
3644    if bar.is_empty() {
3645        format!(
3646            " {:<width$} | {:>cw$}",
3647            path,
3648            total,
3649            width = max_path_len,
3650            cw = cw
3651        )
3652    } else {
3653        format!(
3654            " {:<width$} | {:>cw$} {}",
3655            path,
3656            total,
3657            bar,
3658            width = max_path_len,
3659            cw = cw
3660        )
3661    }
3662}
3663
3664/// Normalise one line like Git's `-b` / `--ignore-space-change`.
3665#[must_use]
3666pub fn normalize_ignore_space_change_line(line: &str) -> String {
3667    let mut result = String::with_capacity(line.len());
3668    let mut in_space = false;
3669    for c in line.chars() {
3670        if c.is_whitespace() {
3671            if !in_space {
3672                result.push(' ');
3673                in_space = true;
3674            }
3675        } else {
3676            result.push(c);
3677            in_space = false;
3678        }
3679    }
3680    while result.ends_with(' ') {
3681        result.pop();
3682    }
3683    result
3684}
3685
3686/// Normalise text like Git's `-b` / `--ignore-space-change`: on each line, collapse runs of
3687/// whitespace to a single ASCII space and trim trailing spaces.
3688///
3689/// Line breaks are preserved by splitting on [`str::lines`] and rejoining with `\n` (same approach
3690/// as the porcelain `diff` whitespace handling in `grit`).
3691#[must_use]
3692pub fn normalize_ignore_space_change(content: &str) -> String {
3693    content
3694        .lines()
3695        .map(normalize_ignore_space_change_line)
3696        .collect::<Vec<_>>()
3697        .join("\n")
3698}
3699
3700/// Count insertions and deletions between two strings.
3701///
3702/// Returns `(insertions, deletions)`.
3703pub fn count_changes(old_content: &str, new_content: &str) -> (usize, usize) {
3704    count_changes_with_algorithm(old_content, new_content, similar::Algorithm::Myers, false)
3705}
3706
3707/// Count insertions and deletions using the given line-diff algorithm.
3708///
3709/// Git's `--stat` / `--numstat` follow the configured diff algorithm; this mirrors that by
3710/// running [`similar::TextDiff`] with an explicit [`similar::Algorithm`].
3711#[must_use]
3712pub fn count_changes_with_algorithm(
3713    old_content: &str,
3714    new_content: &str,
3715    algorithm: similar::Algorithm,
3716    use_git_histogram: bool,
3717) -> (usize, usize) {
3718    if use_git_histogram {
3719        use imara_diff::{Algorithm, Diff, InternedInput};
3720        let input = InternedInput::new(old_content, new_content);
3721        let mut d = Diff::compute(Algorithm::Histogram, &input);
3722        d.postprocess_lines(&input);
3723        return (d.count_additions() as usize, d.count_removals() as usize);
3724    }
3725
3726    use similar::{ChangeTag, TextDiff};
3727
3728    let diff = TextDiff::configure()
3729        .algorithm(algorithm)
3730        .diff_lines(old_content, new_content);
3731    let mut ins = 0;
3732    let mut del = 0;
3733
3734    for change in diff.iter_all_changes() {
3735        match change.tag() {
3736            ChangeTag::Insert => ins += 1,
3737            ChangeTag::Delete => del += 1,
3738            ChangeTag::Equal => {}
3739        }
3740    }
3741
3742    (ins, del)
3743}
3744
3745/// Line count for diffstat/`--numstat`, matching Git's `count_lines()` in `diff.c`.
3746///
3747/// Counts newline-terminated lines; a final line without trailing newline still counts as one line.
3748/// An empty buffer yields `0`.
3749#[must_use]
3750pub fn count_git_lines(data: &[u8]) -> usize {
3751    if data.is_empty() {
3752        return 0;
3753    }
3754    let mut count = 0usize;
3755    let mut nl_just_seen = false;
3756    for &ch in data {
3757        if ch == b'\n' {
3758            count += 1;
3759            nl_just_seen = true;
3760        } else {
3761            nl_just_seen = false;
3762        }
3763    }
3764    if !nl_just_seen {
3765        count += 1;
3766    }
3767    count
3768}
3769
3770/// Internal maximum diff score used by Git rename/break heuristics (`MAX_SCORE` in `diffcore.h`).
3771pub const GIT_DIFF_MAX_SCORE: u64 = 60_000;
3772const DIFF_MAX_SCORE: u64 = GIT_DIFF_MAX_SCORE;
3773const DIFF_MINIMUM_BREAK_SIZE: usize = 400;
3774const DIFF_DEFAULT_BREAK_SCORE: u64 = 30_000;
3775/// Default break threshold (`DEFAULT_BREAK_SCORE` in `diffcore.h`), internal 0–[`GIT_DIFF_MAX_SCORE`] scale.
3776pub const GIT_DIFF_DEFAULT_BREAK_SCORE: u64 = DIFF_DEFAULT_BREAK_SCORE;
3777/// Default merge threshold after a break (`DEFAULT_MERGE_SCORE` in `diffcore.h`): pairs broken for
3778/// rename/copy but not consumed are merged back when deletion-weight is below this (60% by default).
3779pub const GIT_DIFF_DEFAULT_MERGE_SCORE_AFTER_BREAK: u64 = 36_000;
3780const DIFF_HASHBASE: u32 = 107_927;
3781
3782#[derive(Clone, Copy, Default)]
3783struct SpanSlot {
3784    hashval: u32,
3785    cnt: u32,
3786}
3787
3788struct SpanHashTop {
3789    alloc_log2: u8,
3790    free_slots: i32,
3791    data: Vec<SpanSlot>,
3792}
3793
3794impl SpanHashTop {
3795    fn new(initial_log2: u8) -> Self {
3796        let cap = 1usize << initial_log2;
3797        Self {
3798            alloc_log2: initial_log2,
3799            free_slots: initial_free(initial_log2),
3800            data: vec![SpanSlot::default(); cap],
3801        }
3802    }
3803
3804    fn len(&self) -> usize {
3805        1usize << self.alloc_log2
3806    }
3807
3808    fn add_span(&mut self, hashval: u32, cnt: u32) {
3809        loop {
3810            let lim = self.len();
3811            let mut bucket = (hashval as usize) & (lim - 1);
3812            loop {
3813                let h = &mut self.data[bucket];
3814                if h.cnt == 0 {
3815                    h.hashval = hashval;
3816                    h.cnt = cnt;
3817                    self.free_slots -= 1;
3818                    if self.free_slots < 0 {
3819                        self.rehash();
3820                        break;
3821                    }
3822                    return;
3823                }
3824                if h.hashval == hashval {
3825                    h.cnt = h.cnt.saturating_add(cnt);
3826                    return;
3827                }
3828                bucket += 1;
3829                if bucket >= lim {
3830                    bucket = 0;
3831                }
3832            }
3833        }
3834    }
3835
3836    fn rehash(&mut self) {
3837        let old = std::mem::take(&mut self.data);
3838        let old_log = self.alloc_log2;
3839        self.alloc_log2 = old_log.saturating_add(1);
3840        let new_len = 1usize << self.alloc_log2;
3841        self.free_slots = initial_free(self.alloc_log2);
3842        self.data = vec![SpanSlot::default(); new_len];
3843        let old_sz = 1usize << old_log;
3844        for o in old.iter().take(old_sz) {
3845            let o = *o;
3846            if o.cnt == 0 {
3847                continue;
3848            }
3849            self.add_span_after_rehash(o.hashval, o.cnt);
3850        }
3851    }
3852
3853    fn add_span_after_rehash(&mut self, hashval: u32, cnt: u32) {
3854        loop {
3855            let lim = self.len();
3856            let mut bucket = (hashval as usize) & (lim - 1);
3857            loop {
3858                let h = &mut self.data[bucket];
3859                if h.cnt == 0 {
3860                    h.hashval = hashval;
3861                    h.cnt = cnt;
3862                    self.free_slots -= 1;
3863                    if self.free_slots < 0 {
3864                        self.rehash();
3865                        break;
3866                    }
3867                    return;
3868                }
3869                if h.hashval == hashval {
3870                    h.cnt = h.cnt.saturating_add(cnt);
3871                    return;
3872                }
3873                bucket += 1;
3874                if bucket >= lim {
3875                    bucket = 0;
3876                }
3877            }
3878        }
3879    }
3880
3881    fn sort_by_hashval(&mut self) {
3882        let sz = self.len();
3883        self.data[..sz].sort_by(|a, b| {
3884            if a.cnt == 0 {
3885                return std::cmp::Ordering::Greater;
3886            }
3887            if b.cnt == 0 {
3888                return std::cmp::Ordering::Less;
3889            }
3890            a.hashval.cmp(&b.hashval)
3891        });
3892    }
3893}
3894
3895fn initial_free(sz_log2: u8) -> i32 {
3896    let sz = sz_log2 as i32;
3897    ((1i32 << sz_log2) * (sz - 3) / sz).max(0)
3898}
3899
3900fn hash_blob_spans(buf: &[u8], is_text: bool) -> SpanHashTop {
3901    let mut hash = SpanHashTop::new(9);
3902    let mut n = 0u32;
3903    let mut accum1: u32 = 0;
3904    let mut accum2: u32 = 0;
3905    let mut i = 0usize;
3906    while i < buf.len() {
3907        let c = buf[i] as u32;
3908        let old_1 = accum1;
3909        i += 1;
3910
3911        if is_text && c == b'\r' as u32 && i < buf.len() && buf[i] == b'\n' {
3912            continue;
3913        }
3914
3915        accum1 = accum1.wrapping_shl(7) ^ accum2.wrapping_shr(25);
3916        accum2 = accum2.wrapping_shl(7) ^ old_1.wrapping_shr(25);
3917        accum1 = accum1.wrapping_add(c);
3918        n += 1;
3919        if n < 64 && c != b'\n' as u32 {
3920            continue;
3921        }
3922        let hashval = (accum1.wrapping_add(accum2.wrapping_mul(0x61))) % DIFF_HASHBASE;
3923        hash.add_span(hashval, n);
3924        n = 0;
3925        accum1 = 0;
3926        accum2 = 0;
3927    }
3928    if n > 0 {
3929        let hashval = (accum1.wrapping_add(accum2.wrapping_mul(0x61))) % DIFF_HASHBASE;
3930        hash.add_span(hashval, n);
3931    }
3932    hash.sort_by_hashval();
3933    hash
3934}
3935
3936/// Approximate copied vs added material between two blobs (Git `diffcore_count_changes`).
3937///
3938/// Returns `(copied_bytes_from_src, literal_added_bytes_in_dst)` matching Git's
3939/// `diffcore_count_changes` semantics (used for `--dirstat=changes` damage).
3940#[must_use]
3941pub fn diffcore_count_changes(old: &[u8], new: &[u8]) -> (u64, u64) {
3942    let src_is_text = !crate::merge_file::is_binary(old);
3943    let dst_is_text = !crate::merge_file::is_binary(new);
3944    let src_count = hash_blob_spans(old, src_is_text);
3945    let dst_count = hash_blob_spans(new, dst_is_text);
3946    let mut sc: u64 = 0;
3947    let mut la: u64 = 0;
3948    let mut si = 0usize;
3949    let mut di = 0usize;
3950    let src_len = src_count.len();
3951    let dst_len = dst_count.len();
3952    loop {
3953        if si >= src_len || src_count.data[si].cnt == 0 {
3954            break;
3955        }
3956        let s_hash = src_count.data[si].hashval;
3957        let s_cnt = u64::from(src_count.data[si].cnt);
3958        while di < dst_len && dst_count.data[di].cnt != 0 && dst_count.data[di].hashval < s_hash {
3959            la += u64::from(dst_count.data[di].cnt);
3960            di += 1;
3961        }
3962        let mut dst_cnt = 0u64;
3963        if di < dst_len && dst_count.data[di].cnt != 0 && dst_count.data[di].hashval == s_hash {
3964            dst_cnt = u64::from(dst_count.data[di].cnt);
3965            di += 1;
3966        }
3967        if s_cnt < dst_cnt {
3968            la += dst_cnt - s_cnt;
3969            sc += s_cnt;
3970        } else {
3971            sc += dst_cnt;
3972        }
3973        si += 1;
3974    }
3975    while di < dst_len && dst_count.data[di].cnt != 0 {
3976        la += u64::from(dst_count.data[di].cnt);
3977        di += 1;
3978    }
3979    (sc, la)
3980}
3981
3982/// Whether this modified blob pair should use Git's "complete rewrite" diffstat path when
3983/// `--break-rewrites` is in effect (`should_break` in `diffcore-break.c`).
3984#[must_use]
3985pub fn should_break_rewrite_for_stat(old: &[u8], new: &[u8]) -> bool {
3986    should_break_rewrite_inner(old, new, DIFF_DEFAULT_BREAK_SCORE)
3987}
3988
3989/// Whether an in-place blob edit should be split into delete+create for rename/copy (`should_break`
3990/// in `diffcore-break.c`). `break_score` is on the internal 0–[`GIT_DIFF_MAX_SCORE`] scale (default
3991/// [`DIFF_DEFAULT_BREAK_SCORE`]).
3992#[must_use]
3993pub fn should_break_rewrite_pair(old: &[u8], new: &[u8], break_score: u64) -> bool {
3994    should_break_rewrite_inner(old, new, break_score)
3995}
3996
3997/// Parse a single Git `parse_rename_score` token (`50`, `50%`, decimal forms) into internal
3998/// 0–[`GIT_DIFF_MAX_SCORE`] units.
3999pub fn parse_diff_rename_score_token(arg: &str) -> Option<u64> {
4000    let mut num: u64 = 0;
4001    let mut scale: u64 = 1;
4002    let mut dot = false;
4003    let mut saw_digit = false;
4004    for ch in arg.chars() {
4005        if !dot && ch == '.' {
4006            scale = 1;
4007            dot = true;
4008            continue;
4009        }
4010        if ch == '%' {
4011            scale = if dot { scale.saturating_mul(100) } else { 100 };
4012            break;
4013        }
4014        if ch.is_ascii_digit() {
4015            saw_digit = true;
4016            if scale < 100_000 {
4017                scale = scale.saturating_mul(10);
4018                num = num.saturating_mul(10) + u64::from(ch as u8 - b'0');
4019            }
4020        } else {
4021            break;
4022        }
4023    }
4024    if !saw_digit {
4025        return None;
4026    }
4027    Some(if num >= scale {
4028        GIT_DIFF_MAX_SCORE
4029    } else {
4030        GIT_DIFF_MAX_SCORE * num / scale
4031    })
4032}
4033
4034/// Git `merge_score` from `diffcore-break.c` when a pair is considered broken: how much of the
4035/// source blob was removed (0–[`DIFF_MAX_SCORE`] scale). Used for `dissimilarity index` metadata.
4036#[must_use]
4037pub fn rewrite_merge_score(old: &[u8], new: &[u8]) -> Option<u64> {
4038    if old.is_empty() {
4039        return None;
4040    }
4041    let max_size = old.len().max(new.len());
4042    if max_size < DIFF_MINIMUM_BREAK_SIZE {
4043        return None;
4044    }
4045    let (src_copied, _) = diffcore_count_changes(old, new);
4046    let src_copied = src_copied.min(old.len() as u64);
4047    let src_removed = (old.len() as u64).saturating_sub(src_copied);
4048    Some(src_removed * DIFF_MAX_SCORE / old.len() as u64)
4049}
4050
4051/// Percentage shown in `dissimilarity index N%` for a rewrite (`similarity_index` in Git's diff.c).
4052#[must_use]
4053pub fn rewrite_dissimilarity_index_percent(old: &[u8], new: &[u8]) -> Option<u32> {
4054    let score = rewrite_merge_score(old, new)?;
4055    Some((score * 100 / DIFF_MAX_SCORE).min(100) as u32)
4056}
4057
4058fn should_break_rewrite_inner(src: &[u8], dst: &[u8], break_score: u64) -> bool {
4059    if src.is_empty() {
4060        return false;
4061    }
4062    let max_size = src.len().max(dst.len());
4063    if max_size < DIFF_MINIMUM_BREAK_SIZE {
4064        return false;
4065    }
4066    let (src_copied, literal_added) = diffcore_count_changes(src, dst);
4067    let src_copied = src_copied.min(src.len() as u64);
4068    let mut literal_added = literal_added;
4069    let dst_len = dst.len() as u64;
4070    if src_copied < dst_len && literal_added + src_copied > dst_len {
4071        literal_added = dst_len.saturating_sub(src_copied);
4072    }
4073    let src_removed = (src.len() as u64).saturating_sub(src_copied);
4074    let merge_score = src_removed * DIFF_MAX_SCORE / src.len() as u64;
4075    if merge_score > break_score {
4076        return true;
4077    }
4078    let delta_size = src_removed.saturating_add(literal_added);
4079    if delta_size * DIFF_MAX_SCORE / (max_size as u64) < break_score {
4080        return false;
4081    }
4082    let s = src.len() as u64;
4083    if (s * break_score < src_removed * DIFF_MAX_SCORE)
4084        && (literal_added * 20 < src_removed)
4085        && (literal_added * 20 < src_copied)
4086    {
4087        return false;
4088    }
4089    true
4090}
4091
4092// ── Helpers ─────────────────────────────────────────────────────────
4093
4094/// Flatten a tree object recursively into a sorted list of (path, mode, oid).
4095struct FlatEntry {
4096    path: String,
4097    mode: u32,
4098    oid: ObjectId,
4099}
4100
4101fn flatten_tree(odb: &Odb, tree_oid: &ObjectId, prefix: &str) -> Result<Vec<FlatEntry>> {
4102    let entries = read_tree(odb, tree_oid)?;
4103    let mut result = Vec::new();
4104
4105    for entry in entries {
4106        let name_str = String::from_utf8_lossy(&entry.name);
4107        let path = format_path(prefix, &name_str);
4108        if is_tree_mode(entry.mode) {
4109            let nested = flatten_tree(odb, &entry.oid, &path)?;
4110            result.extend(nested);
4111        } else {
4112            result.push(FlatEntry {
4113                path,
4114                mode: entry.mode,
4115                oid: entry.oid,
4116            });
4117        }
4118    }
4119
4120    Ok(result)
4121}
4122
4123/// Paths present in `HEAD`'s tree with mode and blob/commit OID (for status porcelain v2).
4124pub fn head_path_states(
4125    odb: &Odb,
4126    head_tree: Option<&ObjectId>,
4127) -> Result<std::collections::BTreeMap<String, (u32, ObjectId)>> {
4128    let mut m = std::collections::BTreeMap::new();
4129    let Some(t) = head_tree else {
4130        return Ok(m);
4131    };
4132    for fe in flatten_tree(odb, t, "")? {
4133        m.insert(fe.path, (fe.mode, fe.oid));
4134    }
4135    Ok(m)
4136}
4137
4138/// Whether a mode represents a tree (directory).
4139fn is_tree_mode(mode: u32) -> bool {
4140    mode == 0o040000
4141}
4142
4143/// Build a path with an optional prefix.
4144fn format_path(prefix: &str, name: &str) -> String {
4145    if prefix.is_empty() {
4146        name.to_owned()
4147    } else {
4148        format!("{prefix}/{name}")
4149    }
4150}
4151
4152/// Format a numeric mode as a zero-padded octal string.
4153pub fn format_mode(mode: u32) -> String {
4154    format!("{mode:06o}")
4155}
4156
4157/// Read the HEAD commit OID from a submodule checkout directory.
4158///
4159/// Returns `None` if the path is missing, not a submodule checkout, or has no resolvable HEAD.
4160#[must_use]
4161pub fn read_submodule_head_for_checkout(sub_dir: &Path) -> Option<ObjectId> {
4162    read_submodule_head(sub_dir)
4163}
4164
4165/// First line of a commit's message for `git diff --submodule=log` output.
4166///
4167/// Honors `encoding` in the commit object (Latin-1 vs UTF-8) using the same
4168/// rules as Git's submodule summary.
4169#[must_use]
4170pub fn submodule_commit_subject_line(c: &CommitData) -> String {
4171    let enc = c.encoding.as_deref().unwrap_or("UTF-8");
4172    let is_latin1 = enc.eq_ignore_ascii_case("ISO8859-1")
4173        || enc.eq_ignore_ascii_case("ISO-8859-1")
4174        || enc.eq_ignore_ascii_case("LATIN1")
4175        || enc.eq_ignore_ascii_case("ISO-8859-15");
4176    if let Some(raw) = c.raw_message.as_deref() {
4177        let line = raw.split(|b| *b == b'\n').next().unwrap_or(raw);
4178        if is_latin1 {
4179            return line
4180                .iter()
4181                .map(|&b| b as char)
4182                .collect::<String>()
4183                .trim()
4184                .to_owned();
4185        }
4186        return String::from_utf8_lossy(line).trim().to_string();
4187    }
4188    c.message.lines().next().unwrap_or("").trim().to_owned()
4189}
4190
4191/// True when `sub_dir` is an empty directory (or missing), i.e. the placeholder left by
4192/// `git apply --index` before `git submodule update`.
4193fn submodule_worktree_is_unpopulated_placeholder(sub_dir: &Path) -> bool {
4194    match fs::read_dir(sub_dir) {
4195        Ok(mut it) => it.next().is_none(),
4196        Err(e) if e.kind() == std::io::ErrorKind::NotFound => true,
4197        Err(_) => false,
4198    }
4199}
4200
4201fn read_submodule_head(sub_dir: &Path) -> Option<ObjectId> {
4202    read_submodule_head_oid(sub_dir)
4203}
4204
4205/// Resolve the embedded git directory for a submodule work tree (`sub_dir/.git`).
4206#[must_use]
4207pub fn submodule_embedded_git_dir(sub_dir: &Path) -> Option<PathBuf> {
4208    let gitfile = sub_dir.join(".git");
4209    if gitfile.is_file() {
4210        let content = fs::read_to_string(&gitfile).ok()?;
4211        let gitdir = content
4212            .lines()
4213            .find_map(|l| l.strip_prefix("gitdir: "))?
4214            .trim();
4215        Some(if Path::new(gitdir).is_absolute() {
4216            PathBuf::from(gitdir)
4217        } else {
4218            sub_dir.join(gitdir)
4219        })
4220    } else if gitfile.is_dir() {
4221        Some(gitfile)
4222    } else {
4223        None
4224    }
4225}
4226
4227/// Walk upward from `sub_dir` to find the nearest containing Git work tree.
4228fn find_superproject_git(sub_dir: &Path) -> Option<(PathBuf, PathBuf)> {
4229    let mut cur = sub_dir.parent()?;
4230    loop {
4231        let git_path = cur.join(".git");
4232        if git_path.exists() {
4233            let gd = if git_path.is_file() {
4234                let content = fs::read_to_string(&git_path).ok()?;
4235                let line = content
4236                    .lines()
4237                    .find_map(|l| l.strip_prefix("gitdir: "))?
4238                    .trim();
4239                if Path::new(line).is_absolute() {
4240                    PathBuf::from(line)
4241                } else {
4242                    cur.join(line)
4243                }
4244            } else {
4245                git_path
4246            };
4247            return Some((cur.to_path_buf(), gd));
4248        }
4249        cur = cur.parent()?;
4250    }
4251}
4252
4253/// Read the HEAD commit OID from a submodule working tree directory.
4254///
4255/// Handles both embedded `.git` directories and `gitdir:` gitfiles pointing at
4256/// `.git/modules/...` (or other locations). Returns `None` if the path is not
4257/// a checkout or has no resolvable HEAD.
4258pub fn read_submodule_head_oid(sub_dir: &Path) -> Option<ObjectId> {
4259    // Submodule `.git` may be a gitfile pointing at `.git/modules/<name>` in another superproject
4260    // after `cp -R`. Prefer the current superproject's module dir when present.
4261    let mut git_dir = submodule_embedded_git_dir(sub_dir)?;
4262    if let Some((super_wt, super_git_dir)) = find_superproject_git(sub_dir) {
4263        let rel = sub_dir.strip_prefix(&super_wt).ok()?;
4264        let rel_str = rel.to_string_lossy().replace('\\', "/");
4265        let local_mod = super_git_dir
4266            .join("modules")
4267            .join(rel_str.trim_start_matches('/'));
4268        if local_mod.join("HEAD").exists() {
4269            let sg = super_git_dir.canonicalize().unwrap_or(super_git_dir);
4270            let cur = git_dir.canonicalize().unwrap_or_else(|_| git_dir.clone());
4271            if !cur.starts_with(&sg) {
4272                git_dir = local_mod;
4273            }
4274        }
4275    }
4276    let head_content = fs::read_to_string(git_dir.join("HEAD")).ok()?;
4277    let head_trimmed = head_content.trim();
4278    if head_trimmed.starts_with("ref: ") {
4279        // Use the full ref resolver so packed-refs and worktrees match Git. If `HEAD` is a stale
4280        // symref (e.g. still `refs/heads/master` while only `main` exists), fall back like
4281        // `resolve_gitlink_ref` / `git add` on embedded repos (`t6437-submodule-merge`).
4282        match crate::refs::resolve_ref(&git_dir, "HEAD") {
4283            Ok(oid) => Some(oid),
4284            Err(_) => {
4285                let mut found = None;
4286                for branch in ["main", "master"] {
4287                    let p = git_dir.join("refs/heads").join(branch);
4288                    if let Ok(s) = fs::read_to_string(&p) {
4289                        if let Ok(o) = ObjectId::from_hex(s.trim()) {
4290                            found = Some(o);
4291                            break;
4292                        }
4293                    }
4294                }
4295                found
4296            }
4297        }
4298    } else {
4299        ObjectId::from_hex(head_trimmed).ok()
4300    }
4301}
4302
4303/// True when a checked-out submodule at `rel_path` has modified or untracked content relative to
4304/// the gitlink `recorded_oid` stored in the superproject (used for `git diff <tree>` parity).
4305fn submodule_has_dirty_worktree_for_super_diff(
4306    super_worktree: &Path,
4307    rel_path: &str,
4308    recorded_oid: &ObjectId,
4309) -> bool {
4310    let flags = submodule_porcelain_flags(super_worktree, rel_path, *recorded_oid);
4311    flags.modified || flags.untracked
4312}
4313
4314/// Submodule dirty bits aligned with Git's `DIRTY_SUBMODULE_*` / porcelain v2 `S???` token.
4315#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
4316pub struct SubmodulePorcelainFlags {
4317    /// Submodule checkout HEAD differs from the gitlink OID recorded in the parent index.
4318    pub new_commits: bool,
4319    /// The submodule has its own staged or unstaged changes (`DIRTY_SUBMODULE_MODIFIED`).
4320    pub modified: bool,
4321    /// The submodule work tree contains paths not in its index (`DIRTY_SUBMODULE_UNTRACKED`).
4322    pub untracked: bool,
4323}
4324
4325/// Inspect a checked-out submodule at `rel_path` (relative to `super_worktree`) and return
4326/// flags used for `git status --porcelain=v2` submodule tokens.
4327///
4328/// `recorded_oid` is the gitlink OID stored in the **parent** index (stage 0). When the
4329/// submodule is not checked out or cannot be opened, returns [`Default::default()`].
4330pub fn submodule_porcelain_flags(
4331    super_worktree: &Path,
4332    rel_path: &str,
4333    recorded_oid: ObjectId,
4334) -> SubmodulePorcelainFlags {
4335    let sub_dir = super_worktree.join(rel_path);
4336    let Some(sub_git_dir) = submodule_embedded_git_dir(&sub_dir) else {
4337        return SubmodulePorcelainFlags::default();
4338    };
4339    let Some(sub_head) = read_submodule_head_oid(&sub_dir) else {
4340        return SubmodulePorcelainFlags::default();
4341    };
4342
4343    let new_commits = sub_head != recorded_oid;
4344
4345    let index_path = sub_git_dir.join("index");
4346    let sub_index = match crate::index::Index::load(&index_path) {
4347        Ok(ix) => ix,
4348        Err(_) => {
4349            return SubmodulePorcelainFlags {
4350                new_commits,
4351                ..Default::default()
4352            }
4353        }
4354    };
4355
4356    let tracked: std::collections::BTreeSet<String> = sub_index
4357        .entries
4358        .iter()
4359        .filter(|e| e.stage() == 0)
4360        .map(|e| String::from_utf8_lossy(&e.path).into_owned())
4361        .collect();
4362    let untracked = submodule_dir_has_untracked_inner(&sub_dir, &sub_dir, &tracked, &sub_index);
4363
4364    let objects_dir = sub_git_dir.join("objects");
4365    let odb = Odb::new(&objects_dir);
4366
4367    let sub_head_tree = (|| -> Option<ObjectId> {
4368        let h = fs::read_to_string(sub_git_dir.join("HEAD")).ok()?;
4369        let h_str = h.trim();
4370        let commit_oid = if let Some(r) = h_str.strip_prefix("ref: ") {
4371            let oid_hex = fs::read_to_string(sub_git_dir.join(r)).ok()?;
4372            ObjectId::from_hex(oid_hex.trim()).ok()?
4373        } else {
4374            ObjectId::from_hex(h_str).ok()?
4375        };
4376        let obj = odb.read(&commit_oid).ok()?;
4377        let commit = parse_commit(&obj.data).ok()?;
4378        Some(commit.tree)
4379    })();
4380
4381    let staged_dirty = sub_head_tree
4382        .as_ref()
4383        .map(|t| diff_index_to_tree(&odb, &sub_index, Some(t), false).map(|v| !v.is_empty()))
4384        .unwrap_or(Ok(false));
4385    let staged_dirty = staged_dirty.unwrap_or(false);
4386
4387    let unstaged_dirty = diff_index_to_worktree(&odb, &sub_index, &sub_dir, false, true)
4388        .map(|v| !v.is_empty())
4389        .unwrap_or(false);
4390
4391    let mut modified = staged_dirty || unstaged_dirty;
4392
4393    // Nested submodule has its own index: OR `modified` from immediate gitlink children so a
4394    // dirty nested checkout (e.g. staged `file` under `sub1/sub2`) marks the parent gitlink as
4395    // modified in the superproject (t7506). Do **not** OR `untracked` — untracked-only inside a
4396    // nested submodule must stay `S..U` on the parent, not `S.U` / `S.M.`.
4397    for e in &sub_index.entries {
4398        if e.stage() != 0 || e.mode != 0o160000 {
4399            continue;
4400        }
4401        let child = String::from_utf8_lossy(&e.path).into_owned();
4402        let full_rel = if rel_path.is_empty() {
4403            child
4404        } else {
4405            format!("{rel_path}/{child}")
4406        };
4407        let nested = submodule_porcelain_flags(super_worktree, &full_rel, e.oid);
4408        modified |= nested.modified;
4409    }
4410
4411    SubmodulePorcelainFlags {
4412        new_commits,
4413        modified,
4414        untracked,
4415    }
4416}
4417
4418fn submodule_dir_has_untracked_inner(
4419    dir: &Path,
4420    root: &Path,
4421    tracked: &std::collections::BTreeSet<String>,
4422    owning_index: &Index,
4423) -> bool {
4424    let entries = match fs::read_dir(dir) {
4425        Ok(e) => e,
4426        Err(_) => return false,
4427    };
4428    let mut sorted: Vec<_> = entries.filter_map(|e| e.ok()).collect();
4429    sorted.sort_by_key(|e| e.file_name());
4430
4431    for entry in sorted {
4432        let name = entry.file_name().to_string_lossy().to_string();
4433        if name == ".git" {
4434            continue;
4435        }
4436        let path = entry.path();
4437        let rel = path
4438            .strip_prefix(root)
4439            .map(|p| p.to_string_lossy().to_string())
4440            .unwrap_or_else(|_| name.clone());
4441
4442        let is_dir = entry.file_type().map(|ft| ft.is_dir()).unwrap_or(false);
4443        if is_dir {
4444            let is_gitlink = owning_index
4445                .get(rel.as_bytes(), 0)
4446                .is_some_and(|e| e.mode == 0o160000);
4447            if is_gitlink {
4448                let Some(nested_git) = submodule_embedded_git_dir(&path) else {
4449                    continue;
4450                };
4451                let nested_index_path = nested_git.join("index");
4452                let Ok(nested_ix) = crate::index::Index::load(&nested_index_path) else {
4453                    continue;
4454                };
4455                let nested_tracked: std::collections::BTreeSet<String> = nested_ix
4456                    .entries
4457                    .iter()
4458                    .filter(|e| e.stage() == 0)
4459                    .map(|e| String::from_utf8_lossy(&e.path).into_owned())
4460                    .collect();
4461                if submodule_dir_has_untracked_inner(&path, &path, &nested_tracked, &nested_ix) {
4462                    return true;
4463                }
4464            } else if submodule_dir_has_untracked_inner(&path, root, tracked, owning_index) {
4465                return true;
4466            }
4467        } else if !tracked.contains(&rel) {
4468            return true;
4469        }
4470    }
4471    false
4472}
grit_lib/diff.rs

grit_lib/
diff.rs