grit_lib/
diff.rs

1//! Diff machinery — compare trees, index entries, and working tree files.
2//!
3//! # Overview
4//!
5//! This module provides the core diffing infrastructure shared by `diff`,
6//! `diff-index`, `status`, `log`, `show`, `commit`, and `merge`.
7//!
8//! ## Levels of comparison
9//!
10//! 1. **Tree-to-tree** — compare two tree objects (e.g. for `log`/`show`).
11//! 2. **Tree-to-index** — compare a tree (usually HEAD) against the index
12//!    (staged changes, used by `diff --cached` and `status`).
13//! 3. **Index-to-worktree** — compare index against the working directory
14//!    (unstaged changes, used by `diff` and `status`).
15//!
16//! ## Content diff
17//!
18//! Line-level diffing uses the `similar` crate (Myers, patience, minimal) and,
19//! for Git's `histogram` algorithm, `imara-diff` for output compatible with upstream Git.
20//! Output formats: unified patch, raw (`:old-mode new-mode ...`), stat,
21//! numstat.
22
23use std::fs;
24#[cfg(unix)]
25use std::os::unix::fs::MetadataExt;
26use std::path::{Path, PathBuf};
27
28use crate::config::ConfigSet;
29use crate::diff_indent_heuristic;
30use crate::error::{Error, Result};
31use crate::index::{Index, IndexEntry};
32use crate::objects::{parse_commit, parse_tree, CommitData, ObjectId, ObjectKind, TreeEntry};
33use crate::odb::Odb;
34use crate::userdiff::FuncnameMatcher;
35
36/// Splits imara-diff unified body (concatenated hunks) into per-hunk slices for post-processing.
37fn imara_unified_hunk_slices(body: &str) -> Vec<&str> {
38    let mut starts: Vec<usize> = Vec::new();
39    if body.starts_with("@@") {
40        starts.push(0);
41    }
42    for (idx, _) in body.match_indices("\n@@ ") {
43        starts.push(idx + 1);
44    }
45    starts.push(body.len());
46    starts.windows(2).map(|w| &body[w[0]..w[1]]).collect()
47}
48
49/// Effective context length for imara's hunk merger so it approximates Git's
50/// `2 * U + inter_hunk_context` unchanged-line merge threshold.
51fn imara_context_len_for_git(context_lines: usize, inter_hunk_context: usize) -> u32 {
52    (2usize
53        .saturating_mul(context_lines)
54        .saturating_add(inter_hunk_context))
55    .div_ceil(2)
56    .min(u32::MAX as usize) as u32
57}
58
59fn histogram_unified_body_raw(
60    old_content: &str,
61    new_content: &str,
62    context_lines: usize,
63    inter_hunk_context: usize,
64) -> String {
65    use imara_diff::{Algorithm, BasicLineDiffPrinter, Diff, InternedInput, UnifiedDiffConfig};
66
67    let input = InternedInput::new(old_content, new_content);
68    let mut diff = Diff::compute(Algorithm::Histogram, &input);
69    diff.postprocess_lines(&input);
70    let mut config = UnifiedDiffConfig::default();
71    config.context_len(imara_context_len_for_git(context_lines, inter_hunk_context));
72    let printer = BasicLineDiffPrinter(&input.interner);
73    diff.unified_diff(&printer, config, &input).to_string()
74}
75
76/// Unified diff hunks for Git's histogram algorithm (no `---` / `+++` lines).
77///
78/// Used by `--no-index` when whitespace normalization is off so the patch matches upstream Git.
79#[must_use]
80pub fn unified_diff_histogram_hunks_only(
81    old_content: &str,
82    new_content: &str,
83    context_lines: usize,
84    inter_hunk_context: usize,
85) -> String {
86    histogram_unified_body_raw(old_content, new_content, context_lines, inter_hunk_context)
87}
88
89/// Full unified diff (`---` / `+++` / hunks) using Git's histogram algorithm.
90#[must_use]
91pub fn unified_diff_histogram_with_prefix_and_funcname(
92    old_content: &str,
93    new_content: &str,
94    old_path: &str,
95    new_path: &str,
96    context_lines: usize,
97    inter_hunk_context: usize,
98    src_prefix: &str,
99    dst_prefix: &str,
100    funcname_matcher: Option<&FuncnameMatcher>,
101    quote_path_fully: bool,
102) -> String {
103    use crate::quote_path::format_diff_path_with_prefix;
104
105    let body =
106        histogram_unified_body_raw(old_content, new_content, context_lines, inter_hunk_context);
107
108    let mut output = String::new();
109    if old_path == "/dev/null" {
110        output.push_str("--- /dev/null\n");
111    } else if src_prefix.is_empty() {
112        output.push_str(&format!("--- {old_path}\n"));
113    } else {
114        output.push_str("--- ");
115        output.push_str(&format_diff_path_with_prefix(
116            src_prefix,
117            old_path,
118            quote_path_fully,
119        ));
120        output.push('\n');
121    }
122    if new_path == "/dev/null" {
123        output.push_str("+++ /dev/null\n");
124    } else if dst_prefix.is_empty() {
125        output.push_str(&format!("+++ {new_path}\n"));
126    } else {
127        output.push_str("+++ ");
128        output.push_str(&format_diff_path_with_prefix(
129            dst_prefix,
130            new_path,
131            quote_path_fully,
132        ));
133        output.push('\n');
134    }
135
136    let old_lines: Vec<&str> = old_content.lines().collect();
137    for hunk_str in imara_unified_hunk_slices(&body) {
138        if hunk_str.is_empty() {
139            continue;
140        }
141        if let Some(first_newline) = hunk_str.find('\n') {
142            let header_line = &hunk_str[..first_newline];
143            let rest = &hunk_str[first_newline..];
144            if let Some(func_ctx) =
145                extract_function_context(header_line, &old_lines, funcname_matcher)
146            {
147                output.push_str(header_line);
148                output.push(' ');
149                output.push_str(&func_ctx);
150                output.push_str(rest);
151            } else {
152                output.push_str(hunk_str);
153            }
154        } else {
155            output.push_str(hunk_str);
156        }
157    }
158
159    output
160}
161
162/// `diff.indentHeuristic` from config (Git defaults to true when unset).
163#[must_use]
164pub fn indent_heuristic_from_config(config: &ConfigSet) -> bool {
165    match config.get_bool("diff.indentHeuristic") {
166        Some(Ok(b)) => b,
167        Some(Err(_)) | None => true,
168    }
169}
170
171/// Resolve indent heuristic: `--no-indent-heuristic` and `--indent-heuristic` override config.
172#[must_use]
173pub fn resolve_indent_heuristic(
174    config: &ConfigSet,
175    cli_indent_heuristic: bool,
176    cli_no_indent_heuristic: bool,
177) -> bool {
178    if cli_no_indent_heuristic {
179        false
180    } else if cli_indent_heuristic {
181        true
182    } else {
183        indent_heuristic_from_config(config)
184    }
185}
186
187/// Parse `--indent-heuristic` / `--no-indent-heuristic` from a plumbing argv slice (last occurrence wins).
188#[must_use]
189pub fn parse_indent_heuristic_cli_flags(argv: &[String]) -> (bool, bool) {
190    let mut indent_heuristic = false;
191    let mut no_indent_heuristic = false;
192    for a in argv {
193        match a.as_str() {
194            "--indent-heuristic" => {
195                indent_heuristic = true;
196                no_indent_heuristic = false;
197            }
198            "--no-indent-heuristic" => {
199                no_indent_heuristic = true;
200                indent_heuristic = false;
201            }
202            _ => {}
203        }
204    }
205    (indent_heuristic, no_indent_heuristic)
206}
207
208/// Line-diff ops for string slices after Git `xdl_change_compact` (and optional indent heuristic).
209#[must_use]
210pub fn diff_slice_ops_compacted(
211    old_lines: &[&str],
212    new_lines: &[&str],
213    algorithm: similar::Algorithm,
214    indent_heuristic: bool,
215) -> Vec<similar::DiffOp> {
216    diff_indent_heuristic::diff_slice_ops_compacted(
217        old_lines,
218        new_lines,
219        algorithm,
220        indent_heuristic,
221    )
222}
223
224/// Map each line in `new_joined` to its origin in `old_joined` after Git-style compaction (for blame).
225#[must_use]
226pub fn map_new_to_old_lines_compacted(
227    old_joined: &str,
228    new_joined: &str,
229    algorithm: similar::Algorithm,
230    indent_heuristic: bool,
231    new_line_count: usize,
232) -> Vec<Option<usize>> {
233    let ops = diff_indent_heuristic::diff_lines_ops_compacted(
234        old_joined,
235        new_joined,
236        algorithm,
237        indent_heuristic,
238    );
239    diff_indent_heuristic::map_new_to_old_from_ops(&ops, new_line_count)
240}
241
242/// The kind of change between two sides of a diff.
243#[derive(Debug, Clone, Copy, PartialEq, Eq)]
244pub enum DiffStatus {
245    /// File was added.
246    Added,
247    /// File was deleted.
248    Deleted,
249    /// File was modified (content or mode change).
250    Modified,
251    /// File was renamed (with optional content change).
252    Renamed,
253    /// File was copied.
254    Copied,
255    /// File type changed (e.g. regular → symlink).
256    TypeChanged,
257    /// Unmerged (conflict).
258    Unmerged,
259}
260
261impl DiffStatus {
262    /// Single-character status letter used in raw diff output.
263    #[must_use]
264    pub fn letter(&self) -> char {
265        match self {
266            Self::Added => 'A',
267            Self::Deleted => 'D',
268            Self::Modified => 'M',
269            Self::Renamed => 'R',
270            Self::Copied => 'C',
271            Self::TypeChanged => 'T',
272            Self::Unmerged => 'U',
273        }
274    }
275}
276
277/// A single diff entry representing one changed path.
278#[derive(Debug, Clone, PartialEq, Eq)]
279pub struct DiffEntry {
280    /// The status of this change.
281    pub status: DiffStatus,
282    /// Path in the "old" side (None for Added).
283    pub old_path: Option<String>,
284    /// Path in the "new" side (None for Deleted).
285    pub new_path: Option<String>,
286    /// Old file mode (as octal string, e.g. "100644").
287    pub old_mode: String,
288    /// New file mode.
289    pub new_mode: String,
290    /// Old object ID (zero OID for Added).
291    pub old_oid: ObjectId,
292    /// New object ID (zero OID for Deleted).
293    pub new_oid: ObjectId,
294    /// Similarity score (0–100) for renames/copies.
295    pub score: Option<u32>,
296}
297
298impl DiffEntry {
299    /// The primary path for display (new_path for adds, old_path for deletes).
300    #[must_use]
301    pub fn path(&self) -> &str {
302        self.new_path
303            .as_deref()
304            .or(self.old_path.as_deref())
305            .unwrap_or("")
306    }
307
308    /// Return a human-oriented path display for this entry.
309    ///
310    /// For renames and copies this returns `old -> new`; for all other entry
311    /// kinds this returns the primary path.
312    #[must_use]
313    pub fn display_path(&self) -> String {
314        match self.status {
315            DiffStatus::Renamed | DiffStatus::Copied => {
316                let old = self.old_path.as_deref().unwrap_or("");
317                let new = self.new_path.as_deref().unwrap_or("");
318                if old.is_empty() || new.is_empty() {
319                    self.path().to_owned()
320                } else {
321                    format!("{old} -> {new}")
322                }
323            }
324            _ => self.path().to_owned(),
325        }
326    }
327}
328
329/// The zero (null) object ID used for "no object" in diff output.
330pub const ZERO_OID: &str = "0000000000000000000000000000000000000000";
331
332/// Return the zero ObjectId.
333#[must_use]
334pub fn zero_oid() -> ObjectId {
335    ObjectId::from_bytes(&[0u8; 20]).unwrap_or_else(|_| {
336        // This should never fail since we pass exactly 20 bytes
337        panic!("internal error: failed to create zero OID");
338    })
339}
340
341/// Return the ObjectId for the empty blob object.
342#[must_use]
343pub fn empty_blob_oid() -> ObjectId {
344    ObjectId::from_hex("e69de29bb2d1d6434b8b29ae775ad8c2e48c5391").unwrap_or_else(|_| {
345        // This should never fail since the object ID literal is valid.
346        panic!("internal error: failed to create empty blob OID");
347    })
348}
349
350// ── Tree-to-tree diff ───────────────────────────────────────────────
351
352/// Compare two trees and return the list of changed entries.
353///
354/// # Parameters
355///
356/// - `odb` — object database to read tree objects from.
357/// - `old_tree_oid` — OID of the old tree (or `None` for comparison against empty).
358/// - `new_tree_oid` — OID of the new tree (or `None` for comparison against empty).
359/// - `prefix` — path prefix for nested tree recursion (empty string for root).
360///
361/// # Errors
362///
363/// Returns errors from object database reads.
364pub fn diff_trees(
365    odb: &Odb,
366    old_tree_oid: Option<&ObjectId>,
367    new_tree_oid: Option<&ObjectId>,
368    prefix: &str,
369) -> Result<Vec<DiffEntry>> {
370    diff_trees_opts(odb, old_tree_oid, new_tree_oid, prefix, false)
371}
372
373/// Like `diff_trees` but with `show_trees` flag: when true, emit entries for
374/// tree objects themselves in addition to their recursive contents (the `-t`
375/// flag of `diff-tree`).
376pub fn diff_trees_show_tree_entries(
377    odb: &Odb,
378    old_tree_oid: Option<&ObjectId>,
379    new_tree_oid: Option<&ObjectId>,
380    prefix: &str,
381) -> Result<Vec<DiffEntry>> {
382    diff_trees_opts(odb, old_tree_oid, new_tree_oid, prefix, true)
383}
384
385fn diff_trees_opts(
386    odb: &Odb,
387    old_tree_oid: Option<&ObjectId>,
388    new_tree_oid: Option<&ObjectId>,
389    prefix: &str,
390    show_trees: bool,
391) -> Result<Vec<DiffEntry>> {
392    let old_entries = match old_tree_oid {
393        Some(oid) => read_tree(odb, oid)?,
394        None => Vec::new(),
395    };
396    let new_entries = match new_tree_oid {
397        Some(oid) => read_tree(odb, oid)?,
398        None => Vec::new(),
399    };
400
401    let mut result = Vec::new();
402    diff_tree_entries_opts(
403        odb,
404        &old_entries,
405        &new_entries,
406        prefix,
407        show_trees,
408        &mut result,
409    )?;
410    Ok(result)
411}
412
413/// Read and parse a tree object from the ODB.
414fn read_tree(odb: &Odb, oid: &ObjectId) -> Result<Vec<TreeEntry>> {
415    let obj = odb.read(oid)?;
416    if obj.kind != ObjectKind::Tree {
417        return Err(Error::CorruptObject(format!(
418            "expected tree, got {}",
419            obj.kind.as_str()
420        )));
421    }
422    parse_tree(&obj.data)
423}
424
425/// Compare two sorted lists of tree entries, recursing into subtrees.
426fn diff_tree_entries_opts(
427    odb: &Odb,
428    old: &[TreeEntry],
429    new: &[TreeEntry],
430    prefix: &str,
431    show_trees: bool,
432    result: &mut Vec<DiffEntry>,
433) -> Result<()> {
434    let mut oi = 0;
435    let mut ni = 0;
436
437    while oi < old.len() || ni < new.len() {
438        match (old.get(oi), new.get(ni)) {
439            (Some(o), Some(n)) => {
440                let cmp = crate::objects::tree_entry_cmp(
441                    &o.name,
442                    is_tree_mode(o.mode),
443                    &n.name,
444                    is_tree_mode(n.mode),
445                );
446                match cmp {
447                    std::cmp::Ordering::Less => {
448                        // Old entry not in new → deleted
449                        emit_deleted_opts(odb, o, prefix, show_trees, result)?;
450                        oi += 1;
451                    }
452                    std::cmp::Ordering::Greater => {
453                        // New entry not in old → added
454                        emit_added_opts(odb, n, prefix, show_trees, result)?;
455                        ni += 1;
456                    }
457                    std::cmp::Ordering::Equal => {
458                        // Both present — check for changes
459                        if o.oid != n.oid || o.mode != n.mode {
460                            let name_str = String::from_utf8_lossy(&o.name);
461                            let path = format_path(prefix, &name_str);
462                            if is_tree_mode(o.mode) && is_tree_mode(n.mode) {
463                                // Both are trees
464                                if show_trees {
465                                    result.push(DiffEntry {
466                                        status: DiffStatus::Modified,
467                                        old_path: Some(path.clone()),
468                                        new_path: Some(path.clone()),
469                                        old_mode: format_mode(o.mode),
470                                        new_mode: format_mode(n.mode),
471                                        old_oid: o.oid,
472                                        new_oid: n.oid,
473                                        score: None,
474                                    });
475                                }
476                                // Recurse
477                                let nested = diff_trees_opts(
478                                    odb,
479                                    Some(&o.oid),
480                                    Some(&n.oid),
481                                    &path,
482                                    show_trees,
483                                )?;
484                                result.extend(nested);
485                            } else if is_tree_mode(o.mode) && !is_tree_mode(n.mode) {
486                                // Tree → blob: delete tree contents, add blob
487                                emit_deleted_opts(odb, o, prefix, show_trees, result)?;
488                                emit_added_opts(odb, n, prefix, show_trees, result)?;
489                            } else if !is_tree_mode(o.mode) && is_tree_mode(n.mode) {
490                                // Blob → tree: delete blob, add tree contents
491                                emit_deleted_opts(odb, o, prefix, show_trees, result)?;
492                                emit_added_opts(odb, n, prefix, show_trees, result)?;
493                            } else {
494                                // Both blobs — modified.
495                                // A mode-only change (e.g. chmod) is Modified.
496                                // TypeChanged is only for actual type changes (blob ↔ symlink).
497                                let old_type = o.mode & 0o170000;
498                                let new_type = n.mode & 0o170000;
499                                result.push(DiffEntry {
500                                    status: if old_type != new_type {
501                                        DiffStatus::TypeChanged
502                                    } else {
503                                        DiffStatus::Modified
504                                    },
505                                    old_path: Some(path.clone()),
506                                    new_path: Some(path),
507                                    old_mode: format_mode(o.mode),
508                                    new_mode: format_mode(n.mode),
509                                    old_oid: o.oid,
510                                    new_oid: n.oid,
511                                    score: None,
512                                });
513                            }
514                        }
515                        oi += 1;
516                        ni += 1;
517                    }
518                }
519            }
520            (Some(o), None) => {
521                emit_deleted_opts(odb, o, prefix, show_trees, result)?;
522                oi += 1;
523            }
524            (None, Some(n)) => {
525                emit_added_opts(odb, n, prefix, show_trees, result)?;
526                ni += 1;
527            }
528            (None, None) => break,
529        }
530    }
531
532    Ok(())
533}
534
535fn emit_deleted_opts(
536    odb: &Odb,
537    entry: &TreeEntry,
538    prefix: &str,
539    show_trees: bool,
540    result: &mut Vec<DiffEntry>,
541) -> Result<()> {
542    let name_str = String::from_utf8_lossy(&entry.name);
543    let path = format_path(prefix, &name_str);
544    if is_tree_mode(entry.mode) {
545        if show_trees {
546            result.push(DiffEntry {
547                status: DiffStatus::Deleted,
548                old_path: Some(path.clone()),
549                new_path: None,
550                old_mode: format_mode(entry.mode),
551                new_mode: "000000".to_owned(),
552                old_oid: entry.oid,
553                new_oid: zero_oid(),
554                score: None,
555            });
556        }
557        // Recurse into deleted tree
558        let nested = diff_trees_opts(odb, Some(&entry.oid), None, &path, show_trees)?;
559        result.extend(nested);
560    } else {
561        result.push(DiffEntry {
562            status: DiffStatus::Deleted,
563            old_path: Some(path.clone()),
564            new_path: None,
565            old_mode: format_mode(entry.mode),
566            new_mode: "000000".to_owned(),
567            old_oid: entry.oid,
568            new_oid: zero_oid(),
569            score: None,
570        });
571    }
572    Ok(())
573}
574
575fn emit_added_opts(
576    odb: &Odb,
577    entry: &TreeEntry,
578    prefix: &str,
579    show_trees: bool,
580    result: &mut Vec<DiffEntry>,
581) -> Result<()> {
582    let name_str = String::from_utf8_lossy(&entry.name);
583    let path = format_path(prefix, &name_str);
584    if is_tree_mode(entry.mode) {
585        if show_trees {
586            result.push(DiffEntry {
587                status: DiffStatus::Added,
588                old_path: None,
589                new_path: Some(path.clone()),
590                old_mode: "000000".to_owned(),
591                new_mode: format_mode(entry.mode),
592                old_oid: zero_oid(),
593                new_oid: entry.oid,
594                score: None,
595            });
596        }
597        // Recurse into added tree
598        let nested = diff_trees_opts(odb, None, Some(&entry.oid), &path, show_trees)?;
599        result.extend(nested);
600    } else {
601        result.push(DiffEntry {
602            status: DiffStatus::Added,
603            old_path: None,
604            new_path: Some(path),
605            old_mode: "000000".to_owned(),
606            new_mode: format_mode(entry.mode),
607            old_oid: zero_oid(),
608            new_oid: entry.oid,
609            score: None,
610        });
611    }
612    Ok(())
613}
614
615// ── Index-to-tree diff (staged changes) ─────────────────────────────
616
617/// Compare the index against a tree (usually HEAD's tree).
618///
619/// This shows "staged" changes — what would be committed.
620///
621/// # Parameters
622///
623/// - `odb` — object database.
624/// - `index` — the current index.
625/// - `tree_oid` — the tree to compare against (e.g. HEAD's tree), or `None`
626///   for comparison against an empty tree (initial commit).
627///
628/// # Errors
629///
630/// Returns errors from ODB reads.
631///
632/// When `ignore_submodules` is true, gitlink (`160000`) paths are omitted from the diff, matching
633/// Git's `require_clean_work_tree(..., ignore_submodules=1)` used by `git rebase` / `git pull`.
634pub fn diff_index_to_tree(
635    odb: &Odb,
636    index: &Index,
637    tree_oid: Option<&ObjectId>,
638    ignore_submodules: bool,
639) -> Result<Vec<DiffEntry>> {
640    // Flatten the tree into a sorted list of (path, mode, oid)
641    let tree_entries = match tree_oid {
642        Some(oid) => flatten_tree(odb, oid, "")?,
643        None => Vec::new(),
644    };
645
646    // Build maps keyed by path
647    let mut tree_map: std::collections::BTreeMap<&str, &FlatEntry> =
648        std::collections::BTreeMap::new();
649    for entry in &tree_entries {
650        tree_map.insert(&entry.path, entry);
651    }
652
653    let mut result = Vec::new();
654    let mut stage0_paths = std::collections::BTreeSet::new();
655    let mut unmerged_modes: std::collections::BTreeMap<String, (u8, u32)> =
656        std::collections::BTreeMap::new();
657
658    // Check index entries against tree
659    for ie in &index.entries {
660        let path = String::from_utf8_lossy(&ie.path).to_string();
661        if ie.stage() == 0 && ie.intent_to_add() {
662            // Intent-to-add entries are not "staged" for diff-index / status
663            // (matches Git: `git diff --cached` is empty for `-N` paths).
664            continue;
665        }
666        if ie.stage() != 0 {
667            let rank = match ie.stage() {
668                2 => 0u8,
669                3 => 1u8,
670                1 => 2u8,
671                _ => 3u8,
672            };
673            match unmerged_modes.get(&path) {
674                Some((existing_rank, _)) if *existing_rank <= rank => {}
675                _ => {
676                    unmerged_modes.insert(path, (rank, ie.mode));
677                }
678            }
679            continue;
680        }
681        if ignore_submodules && ie.mode == 0o160000 {
682            let _ = tree_map.remove(path.as_str());
683            stage0_paths.insert(path.clone());
684            continue;
685        }
686        stage0_paths.insert(path.clone());
687        match tree_map.remove(path.as_str()) {
688            Some(te) => {
689                // Present in both — check for differences
690                if te.oid != ie.oid || te.mode != ie.mode {
691                    result.push(DiffEntry {
692                        status: DiffStatus::Modified,
693                        old_path: Some(path.clone()),
694                        new_path: Some(path),
695                        old_mode: format_mode(te.mode),
696                        new_mode: format_mode(ie.mode),
697                        old_oid: te.oid,
698                        new_oid: ie.oid,
699                        score: None,
700                    });
701                }
702            }
703            None => {
704                // In index but not tree → added
705                result.push(DiffEntry {
706                    status: DiffStatus::Added,
707                    old_path: None,
708                    new_path: Some(path),
709                    old_mode: "000000".to_owned(),
710                    new_mode: format_mode(ie.mode),
711                    old_oid: zero_oid(),
712                    new_oid: ie.oid,
713                    score: None,
714                });
715            }
716        }
717    }
718
719    for (path, (_, mode)) in &unmerged_modes {
720        if stage0_paths.contains(path) {
721            continue;
722        }
723        tree_map.remove(path.as_str());
724        result.push(DiffEntry {
725            status: DiffStatus::Unmerged,
726            old_path: Some(path.clone()),
727            new_path: Some(path.clone()),
728            old_mode: "000000".to_owned(),
729            new_mode: format_mode(*mode),
730            old_oid: zero_oid(),
731            new_oid: zero_oid(),
732            score: None,
733        });
734    }
735
736    // Remaining tree entries not in index → deleted
737    for (path, te) in tree_map {
738        if ignore_submodules && te.mode == 0o160000 {
739            continue;
740        }
741        result.push(DiffEntry {
742            status: DiffStatus::Deleted,
743            old_path: Some(path.to_owned()),
744            new_path: None,
745            old_mode: format_mode(te.mode),
746            new_mode: "000000".to_owned(),
747            old_oid: te.oid,
748            new_oid: zero_oid(),
749            score: None,
750        });
751    }
752
753    result.sort_by(|a, b| a.path().cmp(b.path()));
754    Ok(result)
755}
756
757// ── Index-to-worktree diff (unstaged changes) ───────────────────────
758
759/// Compare the index against the working tree.
760///
761/// This shows "unstaged" changes — modifications not yet staged.
762///
763/// Entries with [`IndexEntry::assume_unchanged`] or [`IndexEntry::skip_worktree`] are treated as
764/// matching the work tree without examining the filesystem (Git `CE_VALID` / skip-worktree).
765///
766/// # Parameters
767///
768/// - `odb` — object database (for hashing worktree files).
769/// - `index` — the current index.
770/// - `work_tree` — path to the working tree root.
771/// - `ignore_submodule_untracked` — when true, gitlink entries are not dirty solely from untracked
772///   files inside the submodule (matches `git status -uno`).
773/// - `simplify_gitlinks` — when true, nested gitlink entries only compare the submodule checkout
774///   HEAD to the recorded OID (ignore dirty work trees inside nested submodules). Used when
775///   computing `submodule_porcelain_flags` so untracked files under a nested submodule do not set
776///   the parent submodule's `modified` bit (Git `DIRTY_SUBMODULE_MODIFIED`; t7506).
777///
778/// # Errors
779///
780/// Returns errors from I/O or hashing.
781pub fn diff_index_to_worktree(
782    odb: &Odb,
783    index: &Index,
784    work_tree: &Path,
785    ignore_submodule_untracked: bool,
786    simplify_gitlinks: bool,
787) -> Result<Vec<DiffEntry>> {
788    diff_index_to_worktree_with_options(
789        odb,
790        index,
791        work_tree,
792        DiffIndexToWorktreeOptions {
793            ignore_submodule_untracked,
794            simplify_gitlinks,
795            ..DiffIndexToWorktreeOptions::default()
796        },
797    )
798}
799
800/// Additional inputs for [`diff_index_to_worktree_with_options`].
801#[derive(Debug, Clone, Copy, Default)]
802pub struct DiffIndexToWorktreeOptions {
803    /// Optional index mtime pair `(sec, nsec)` sampled when the index was read.
804    ///
805    /// When provided, entries with matching stat data are still considered dirty candidates if
806    /// their recorded mtime is "racy" (at or after this timestamp), matching Git's
807    /// `is_racy_timestamp` behavior.
808    pub index_mtime: Option<(u32, u32)>,
809    /// When true, gitlink entries are not dirty solely from untracked files inside the submodule.
810    pub ignore_submodule_untracked: bool,
811    /// When true, nested gitlink entries only compare the submodule checkout HEAD to the recorded OID.
812    pub simplify_gitlinks: bool,
813}
814
815/// Compare the index against the working tree with optional racy-timestamp context.
816///
817/// This variant enables a stat-trust fast path: if an entry's stat tuple matches and the mode is
818/// unchanged, the worktree blob hash is skipped unless the entry is racy relative to the supplied
819/// index mtime.
820///
821/// # Parameters
822///
823/// - `odb` — object database (for hashing worktree files).
824/// - `index` — the current index.
825/// - `work_tree` — path to the working tree root.
826/// - `options` — optional context for racy timestamp checks.
827///
828/// # Errors
829///
830/// Returns errors from I/O or hashing.
831pub fn diff_index_to_worktree_with_options(
832    odb: &Odb,
833    index: &Index,
834    work_tree: &Path,
835    options: DiffIndexToWorktreeOptions,
836) -> Result<Vec<DiffEntry>> {
837    use crate::config::ConfigSet;
838    use crate::crlf;
839
840    let ignore_submodule_untracked = options.ignore_submodule_untracked;
841    let simplify_gitlinks = options.simplify_gitlinks;
842
843    let git_dir = work_tree.join(".git");
844    let config = ConfigSet::load(Some(&git_dir), true).unwrap_or_else(|_| ConfigSet::new());
845    let conv = crlf::ConversionConfig::from_config(&config);
846    let attrs = crlf::load_gitattributes(work_tree);
847
848    let mut result = Vec::new();
849    let mut unmerged_base: std::collections::BTreeMap<String, (u8, &IndexEntry)> =
850        std::collections::BTreeMap::new();
851
852    for ie in &index.entries {
853        if ie.stage() != 0 {
854            let path = String::from_utf8_lossy(&ie.path).to_string();
855            let rank = match ie.stage() {
856                2 => 0u8,
857                3 => 1u8,
858                1 => 2u8,
859                _ => 3u8,
860            };
861            match unmerged_base.get(&path) {
862                Some((existing_rank, _)) if *existing_rank <= rank => {}
863                _ => {
864                    unmerged_base.insert(path, (rank, ie));
865                }
866            }
867            continue;
868        }
869        // Sparse checkout: paths outside the cone are not expected on disk; `assume_unchanged`
870        // is treated as clean without reading the filesystem (wt-status.c).
871        if ie.skip_worktree() || ie.assume_unchanged() {
872            continue;
873        }
874        // Use str slice directly to avoid allocation for path joining;
875        // only allocate String if we need it for DiffEntry output.
876        let path_str_ref = std::str::from_utf8(&ie.path).unwrap_or("");
877        let is_intent_to_add = ie.intent_to_add();
878
879        // Gitlink entries (submodules): Git's `diff-index` reports `M` when the recorded
880        // commit differs from the submodule checkout **or** when the submodule work tree is
881        // dirty (staged/unstaged/untracked) even if HEAD still matches the gitlink. For the
882        // latter case the "new" OID column is the null OID (see `git diff-index` / t7506).
883        if ie.mode == 0o160000 {
884            let sub_dir = work_tree.join(path_str_ref);
885            let sub_head_oid = read_submodule_head_oid(&sub_dir);
886            let ref_matches = match sub_head_oid {
887                Some(oid) => oid == ie.oid,
888                None => submodule_worktree_is_unpopulated_placeholder(&sub_dir),
889            };
890            if simplify_gitlinks {
891                if !ref_matches {
892                    let path_owned = path_str_ref.to_owned();
893                    let new_oid = sub_head_oid.unwrap_or_else(zero_oid);
894                    result.push(DiffEntry {
895                        status: DiffStatus::Modified,
896                        old_path: Some(path_owned.clone()),
897                        new_path: Some(path_owned),
898                        old_mode: format_mode(ie.mode),
899                        new_mode: format_mode(ie.mode),
900                        old_oid: ie.oid,
901                        new_oid,
902                        score: None,
903                    });
904                }
905                continue;
906            }
907            let mut flags = submodule_porcelain_flags(work_tree, path_str_ref, ie.oid);
908            if ignore_submodule_untracked {
909                flags.untracked = false;
910            }
911            let inner_dirty = flags.modified || flags.untracked;
912            if !ref_matches || inner_dirty {
913                let path_owned = path_str_ref.to_owned();
914                let new_oid = if !ref_matches {
915                    sub_head_oid.unwrap_or_else(zero_oid)
916                } else {
917                    zero_oid()
918                };
919                result.push(DiffEntry {
920                    status: DiffStatus::Modified,
921                    old_path: Some(path_owned.clone()),
922                    new_path: Some(path_owned),
923                    old_mode: format_mode(ie.mode),
924                    new_mode: format_mode(ie.mode),
925                    old_oid: ie.oid,
926                    new_oid,
927                    score: None,
928                });
929            }
930            continue;
931        }
932
933        let file_path = work_tree.join(path_str_ref);
934
935        if is_intent_to_add {
936            match fs::symlink_metadata(&file_path) {
937                Ok(meta) => {
938                    let file_attrs = crlf::get_file_attrs(&attrs, path_str_ref, false, &config);
939                    let worktree_oid = hash_worktree_file(
940                        odb,
941                        &file_path,
942                        &meta,
943                        &conv,
944                        &file_attrs,
945                        path_str_ref,
946                        None,
947                    )?;
948                    let worktree_mode = mode_from_metadata(&meta);
949                    result.push(DiffEntry {
950                        status: DiffStatus::Added,
951                        old_path: None,
952                        new_path: Some(path_str_ref.to_owned()),
953                        old_mode: "000000".to_owned(),
954                        new_mode: format_mode(worktree_mode),
955                        // `ita_invisible_in_index`: null OID on the index side for patch output
956                        // (`index 0000000..`, t2203); index entry still stores the empty blob.
957                        old_oid: zero_oid(),
958                        new_oid: worktree_oid,
959                        score: None,
960                    });
961                }
962                Err(e)
963                    if e.kind() == std::io::ErrorKind::NotFound
964                        || e.raw_os_error() == Some(20) /* ENOTDIR */ =>
965                {
966                    result.push(DiffEntry {
967                        status: DiffStatus::Deleted,
968                        old_path: Some(path_str_ref.to_owned()),
969                        new_path: None,
970                        old_mode: format_mode(ie.mode),
971                        new_mode: "000000".to_owned(),
972                        old_oid: ie.oid,
973                        new_oid: zero_oid(),
974                        score: None,
975                    });
976                }
977                Err(e) => return Err(Error::Io(e)),
978            }
979            continue;
980        }
981
982        // If any parent component of the path is a symlink, the file is effectively
983        // deleted from the working tree (a symlink replaced a directory).
984        if has_symlink_in_path(work_tree, path_str_ref) {
985            result.push(DiffEntry {
986                status: DiffStatus::Deleted,
987                old_path: Some(path_str_ref.to_owned()),
988                new_path: None,
989                old_mode: format_mode(ie.mode),
990                new_mode: "000000".to_owned(),
991                old_oid: ie.oid,
992                new_oid: zero_oid(),
993                score: None,
994            });
995            continue;
996        }
997
998        match fs::symlink_metadata(&file_path) {
999            Ok(meta) if meta.is_dir() => {
1000                // A directory exists where the index expects a file.
1001                // Treat as a type change: the indexed file is effectively deleted.
1002                result.push(DiffEntry {
1003                    status: DiffStatus::Deleted,
1004                    old_path: Some(path_str_ref.to_owned()),
1005                    new_path: None,
1006                    old_mode: format_mode(ie.mode),
1007                    new_mode: String::new(),
1008                    old_oid: ie.oid,
1009                    new_oid: zero_oid(),
1010                    score: None,
1011                });
1012            }
1013            Ok(meta) => {
1014                let worktree_mode = mode_from_metadata(&meta);
1015                let stat_same = stat_matches(ie, &meta);
1016                // Mode-only change: stat still matches the index entry but executable bit differs.
1017                if stat_same && worktree_mode != ie.mode {
1018                    let path_owned = path_str_ref.to_owned();
1019                    result.push(DiffEntry {
1020                        status: DiffStatus::Modified,
1021                        old_path: Some(path_owned.clone()),
1022                        new_path: Some(path_owned),
1023                        old_mode: format_mode(ie.mode),
1024                        new_mode: format_mode(worktree_mode),
1025                        old_oid: ie.oid,
1026                        new_oid: ie.oid,
1027                        score: None,
1028                    });
1029                    continue;
1030                }
1031
1032                // Fast path: unchanged stat + unchanged mode + non-racy timestamp means this entry
1033                // is clean without re-hashing blob data.
1034                if stat_same && worktree_mode == ie.mode && !entry_is_racy(ie, options.index_mtime) {
1035                    continue;
1036                }
1037
1038                // Hash the worktree blob for uncertain/racy entries.
1039                let file_attrs = crlf::get_file_attrs(&attrs, path_str_ref, false, &config);
1040                let worktree_oid = hash_worktree_file(
1041                    odb,
1042                    &file_path,
1043                    &meta,
1044                    &conv,
1045                    &file_attrs,
1046                    path_str_ref,
1047                    Some(ie),
1048                )?;
1049
1050                // If clean conversion disagrees with the index but raw bytes match the
1051                // blob (e.g. mixed line endings committed with autocrlf off), Git reports
1052                // no diff (t0020: touch + git diff --exit-code).
1053                let mut eff_oid = worktree_oid;
1054                if eff_oid != ie.oid {
1055                    if let Ok(raw) = fs::read(&file_path) {
1056                        let raw_oid = Odb::hash_object_data(ObjectKind::Blob, &raw);
1057                        if raw_oid == ie.oid {
1058                            eff_oid = ie.oid;
1059                        }
1060                    }
1061                }
1062
1063                if eff_oid != ie.oid || worktree_mode != ie.mode {
1064                    let path_owned = path_str_ref.to_owned();
1065                    result.push(DiffEntry {
1066                        status: DiffStatus::Modified,
1067                        old_path: Some(path_owned.clone()),
1068                        new_path: Some(path_owned),
1069                        old_mode: format_mode(ie.mode),
1070                        new_mode: format_mode(worktree_mode),
1071                        old_oid: ie.oid,
1072                        new_oid: eff_oid,
1073                    score: None,
1074                    });
1075                }
1076            }
1077            Err(e) if e.kind() == std::io::ErrorKind::NotFound
1078                || e.raw_os_error() == Some(20) /* ENOTDIR */ => {
1079                // File deleted from working tree (or parent replaced by a file)
1080                result.push(DiffEntry {
1081                    status: DiffStatus::Deleted,
1082                    old_path: Some(path_str_ref.to_owned()),
1083                    new_path: None,
1084                    old_mode: format_mode(ie.mode),
1085                    new_mode: "000000".to_owned(),
1086                    old_oid: ie.oid,
1087                    new_oid: zero_oid(),
1088                    score: None,
1089                });
1090            }
1091            Err(e) => return Err(Error::Io(e)),
1092        }
1093    }
1094
1095    for (path, (_, base_entry)) in unmerged_base {
1096        let file_path = work_tree.join(&path);
1097        let wt_meta = match fs::symlink_metadata(&file_path) {
1098            Ok(meta) => Some(meta),
1099            Err(e)
1100                if e.kind() == std::io::ErrorKind::NotFound
1101                    || e.raw_os_error() == Some(20) /* ENOTDIR */ =>
1102            {
1103                None
1104            }
1105            Err(e) => return Err(Error::Io(e)),
1106        };
1107
1108        let new_mode = wt_meta.as_ref().map_or_else(
1109            || "000000".to_owned(),
1110            |meta| format_mode(mode_from_metadata(meta)),
1111        );
1112        result.push(DiffEntry {
1113            status: DiffStatus::Unmerged,
1114            old_path: Some(path.clone()),
1115            new_path: Some(path.clone()),
1116            old_mode: "000000".to_owned(),
1117            new_mode,
1118            old_oid: zero_oid(),
1119            new_oid: zero_oid(),
1120            score: None,
1121        });
1122
1123        if let Some(meta) = wt_meta {
1124            let file_attrs = crlf::get_file_attrs(&attrs, &path, false, &config);
1125            let wt_oid = hash_worktree_file(
1126                odb,
1127                &file_path,
1128                &meta,
1129                &conv,
1130                &file_attrs,
1131                &path,
1132                Some(base_entry),
1133            )?;
1134            let wt_mode = mode_from_metadata(&meta);
1135            if wt_oid != base_entry.oid || wt_mode != base_entry.mode {
1136                result.push(DiffEntry {
1137                    status: DiffStatus::Modified,
1138                    old_path: Some(path.clone()),
1139                    new_path: Some(path),
1140                    old_mode: format_mode(base_entry.mode),
1141                    new_mode: format_mode(wt_mode),
1142                    old_oid: base_entry.oid,
1143                    new_oid: wt_oid,
1144                    score: None,
1145                });
1146            }
1147        }
1148    }
1149
1150    Ok(result)
1151}
1152
1153fn entry_is_racy(ie: &IndexEntry, index_mtime: Option<(u32, u32)>) -> bool {
1154    let Some((index_mtime_sec, index_mtime_nsec)) = index_mtime else {
1155        return false;
1156    };
1157    if index_mtime_sec == 0 {
1158        return false;
1159    }
1160    index_mtime_sec < ie.mtime_sec
1161        || (index_mtime_sec == ie.mtime_sec && index_mtime_nsec <= ie.mtime_nsec)
1162}
1163
1164/// Quick stat check: does the index entry's cached stat data match the file?
1165/// Returns true when the file at `ie`'s path differs from the index entry (mode or blob).
1166///
1167/// Used by commands such as `git mv` to detect "dirty" paths under sparse checkout.
1168/// Symlinks and submodules are compared in a Git-compatible way.
1169///
1170/// `ignore_submodule_untracked` mirrors [`diff_index_to_worktree`]'s same flag for gitlinks.
1171pub fn worktree_differs_from_index_entry(
1172    odb: &Odb,
1173    work_tree: &Path,
1174    ie: &IndexEntry,
1175    ignore_submodule_untracked: bool,
1176) -> Result<bool> {
1177    use crate::config::ConfigSet;
1178    use crate::crlf;
1179
1180    let path_str_ref = std::str::from_utf8(&ie.path).unwrap_or("");
1181    let file_path = work_tree.join(path_str_ref);
1182
1183    if ie.mode == 0o160000 {
1184        let sub_head_oid = read_submodule_head(&file_path);
1185        let ref_matches = match sub_head_oid {
1186            Some(oid) => oid == ie.oid,
1187            None => submodule_worktree_is_unpopulated_placeholder(&file_path),
1188        };
1189        let mut flags = submodule_porcelain_flags(work_tree, path_str_ref, ie.oid);
1190        if ignore_submodule_untracked {
1191            flags.untracked = false;
1192        }
1193        return Ok(!ref_matches || flags.modified || flags.untracked);
1194    }
1195
1196    let meta = match fs::symlink_metadata(&file_path) {
1197        Ok(m) => m,
1198        Err(e)
1199            if e.kind() == std::io::ErrorKind::NotFound
1200                || e.raw_os_error() == Some(20) /* ENOTDIR */ =>
1201        {
1202            return Ok(true);
1203        }
1204        Err(e) => return Err(Error::Io(e)),
1205    };
1206
1207    if meta.is_dir() {
1208        return Ok(true);
1209    }
1210
1211    let worktree_mode = mode_from_metadata(&meta);
1212    if worktree_mode != ie.mode {
1213        return Ok(true);
1214    }
1215
1216    let git_dir = work_tree.join(".git");
1217    let config = ConfigSet::load(Some(&git_dir), true).unwrap_or_else(|_| ConfigSet::new());
1218    let conv = crlf::ConversionConfig::from_config(&config);
1219    let attrs = crlf::load_gitattributes(work_tree);
1220    let file_attrs = crlf::get_file_attrs(&attrs, path_str_ref, false, &config);
1221    let worktree_oid = hash_worktree_file(
1222        odb,
1223        &file_path,
1224        &meta,
1225        &conv,
1226        &file_attrs,
1227        path_str_ref,
1228        Some(ie),
1229    )?;
1230
1231    let mut eff_oid = worktree_oid;
1232    if eff_oid != ie.oid {
1233        if let Ok(raw) = fs::read(&file_path) {
1234            let raw_oid = Odb::hash_object_data(ObjectKind::Blob, &raw);
1235            if raw_oid == ie.oid {
1236                eff_oid = ie.oid;
1237            }
1238        }
1239    }
1240
1241    Ok(eff_oid != ie.oid)
1242}
1243
1244pub fn stat_matches(ie: &IndexEntry, meta: &fs::Metadata) -> bool {
1245    // Compare size
1246    if meta.len() as u32 != ie.size {
1247        return false;
1248    }
1249    #[cfg(unix)]
1250    {
1251        use std::os::unix::fs::MetadataExt;
1252        // Compare mtime (seconds + nanoseconds)
1253        if meta.mtime() as u32 != ie.mtime_sec {
1254            return false;
1255        }
1256        if meta.mtime_nsec() as u32 != ie.mtime_nsec {
1257            return false;
1258        }
1259        // Compare ctime (seconds + nanoseconds)
1260        if meta.ctime() as u32 != ie.ctime_sec {
1261            return false;
1262        }
1263        if meta.ctime_nsec() as u32 != ie.ctime_nsec {
1264            return false;
1265        }
1266        // Compare inode and device
1267        if meta.ino() as u32 != ie.ino {
1268            return false;
1269        }
1270        if meta.dev() as u32 != ie.dev {
1271            return false;
1272        }
1273    }
1274    #[cfg(not(unix))]
1275    {
1276        use std::time::UNIX_EPOCH;
1277        if let Ok(mtime) = meta.modified() {
1278            if let Ok(dur) = mtime.duration_since(UNIX_EPOCH) {
1279                if dur.as_secs() as u32 != ie.mtime_sec {
1280                    return false;
1281                }
1282                if dur.subsec_nanos() != ie.mtime_nsec {
1283                    return false;
1284                }
1285            }
1286        }
1287    }
1288    true
1289}
1290
1291/// Hash a working tree file as a blob to get its OID.
1292/// Check if any parent component of `rel_path` (relative to `work_tree`) is a symlink.
1293fn has_symlink_in_path(work_tree: &Path, rel_path: &str) -> bool {
1294    let mut check = work_tree.to_path_buf();
1295    let components: Vec<&str> = rel_path.split('/').collect();
1296    // Check all components except the last one (which is the file itself)
1297    for component in &components[..components.len().saturating_sub(1)] {
1298        check.push(component);
1299        match fs::symlink_metadata(&check) {
1300            Ok(meta) if meta.file_type().is_symlink() => return true,
1301            _ => {}
1302        }
1303    }
1304    false
1305}
1306
1307pub fn hash_worktree_file(
1308    odb: &Odb,
1309    path: &Path,
1310    meta: &fs::Metadata,
1311    conv: &crate::crlf::ConversionConfig,
1312    file_attrs: &crate::crlf::FileAttrs,
1313    rel_path: &str,
1314    index_entry: Option<&IndexEntry>,
1315) -> Result<ObjectId> {
1316    let prior_blob: Option<Vec<u8>> = index_entry
1317        .filter(|e| e.oid != zero_oid())
1318        .and_then(|e| odb.read(&e.oid).ok().map(|o| o.data));
1319    let data = if meta.file_type().is_symlink() {
1320        // For symlinks, hash the target path
1321        let target = fs::read_link(path)?;
1322        target.to_string_lossy().into_owned().into_bytes()
1323    } else if meta.is_dir() {
1324        // `read()` on a directory fails with EISDIR; unmerged paths may leave an empty
1325        // placeholder directory (e.g. t4027 combined submodule conflict).
1326        Vec::new()
1327    } else {
1328        let raw = fs::read(path)?;
1329        // Apply clean conversion (CRLF→LF) so hash matches index blob.
1330        // Do not run safecrlf here: diff/commit use this for hashing and must not print warnings.
1331        let opts = crate::crlf::ConvertToGitOpts {
1332            index_blob: prior_blob.as_deref(),
1333            renormalize: false,
1334            check_safecrlf: false,
1335        };
1336        crate::crlf::convert_to_git_with_opts(&raw, rel_path, conv, file_attrs, opts).unwrap_or(raw)
1337    };
1338
1339    Ok(Odb::hash_object_data(ObjectKind::Blob, &data))
1340}
1341
1342/// Derive a Git file mode from filesystem metadata.
1343pub fn mode_from_metadata(meta: &fs::Metadata) -> u32 {
1344    if meta.file_type().is_symlink() {
1345        0o120000
1346    } else {
1347        #[cfg(unix)]
1348        {
1349            if meta.mode() & 0o111 != 0 {
1350                return 0o100755;
1351            }
1352        }
1353        0o100644
1354    }
1355}
1356
1357/// Compare a tree against the working tree.
1358///
1359/// Shows changes from `tree_oid` to the current working directory state.
1360/// Files tracked in the index but not in the tree are shown as Added.
1361/// Files in the tree but missing from the working tree are shown as Deleted.
1362///
1363/// # Parameters
1364///
1365/// - `odb` — object database.
1366/// - `tree_oid` — the tree to compare against (`None` for empty tree).
1367/// - `work_tree` — path to the working tree root.
1368/// - `index` — current index (used to discover new tracked files not in tree).
1369///
1370/// # Errors
1371///
1372/// Returns errors from ODB reads or I/O.
1373pub fn diff_tree_to_worktree(
1374    odb: &Odb,
1375    tree_oid: Option<&ObjectId>,
1376    work_tree: &Path,
1377    index: &Index,
1378) -> Result<Vec<DiffEntry>> {
1379    use crate::config::ConfigSet;
1380    use crate::crlf;
1381
1382    let git_dir = work_tree.join(".git");
1383    let config = ConfigSet::load(Some(&git_dir), true).unwrap_or_else(|_| ConfigSet::new());
1384    let conv = crlf::ConversionConfig::from_config(&config);
1385    let attrs = crlf::load_gitattributes(work_tree);
1386
1387    // Flatten the tree into a BTreeMap keyed by path
1388    let tree_flat = match tree_oid {
1389        Some(oid) => flatten_tree(odb, oid, "")?,
1390        None => Vec::new(),
1391    };
1392    let tree_map: std::collections::BTreeMap<String, &FlatEntry> =
1393        tree_flat.iter().map(|e| (e.path.clone(), e)).collect();
1394
1395    // Build index lookup: path → &IndexEntry (stage 0 only)
1396    let mut index_entries: std::collections::BTreeMap<&[u8], &IndexEntry> =
1397        std::collections::BTreeMap::new();
1398    let mut index_paths: std::collections::BTreeSet<String> = std::collections::BTreeSet::new();
1399    let mut stage0_paths: std::collections::BTreeSet<Vec<u8>> = std::collections::BTreeSet::new();
1400    for ie in &index.entries {
1401        if ie.stage() != 0 {
1402            continue;
1403        }
1404        let path = String::from_utf8_lossy(&ie.path).to_string();
1405        index_entries.insert(&ie.path, ie);
1406        index_paths.insert(path);
1407        stage0_paths.insert(ie.path.clone());
1408    }
1409
1410    // Paths with only unmerged stages (1–3) and no stage 0 — `git diff <rev>` must still list them
1411    // so combined `diff --cc` conflict hunks can be emitted (`t4108-apply-threeway`).
1412    let mut unmerged_only_paths: std::collections::BTreeSet<String> =
1413        std::collections::BTreeSet::new();
1414    for ie in &index.entries {
1415        if !(1..=3).contains(&ie.stage()) {
1416            continue;
1417        }
1418        if stage0_paths.contains(&ie.path) {
1419            continue;
1420        }
1421        unmerged_only_paths.insert(String::from_utf8_lossy(&ie.path).into_owned());
1422    }
1423
1424    // Union of tree paths + index paths
1425    let mut all_paths: std::collections::BTreeSet<String> = std::collections::BTreeSet::new();
1426    all_paths.extend(tree_map.keys().cloned());
1427    all_paths.extend(index_paths.iter().cloned());
1428    all_paths.extend(unmerged_only_paths.iter().cloned());
1429
1430    let mut result = Vec::new();
1431
1432    for path in &all_paths {
1433        if index_entries
1434            .get(path.as_bytes())
1435            .is_some_and(|ie| ie.skip_worktree())
1436        {
1437            // Sparse checkout: `git diff <rev>` does not report tree↔worktree drift for
1438            // skip-worktree paths (they are outside the sparse cone). Matches t7012 stash flow.
1439            continue;
1440        }
1441
1442        let tree_entry = tree_map.get(path.as_str());
1443
1444        // Gitlink entries (submodules) — compare HEAD commit, not file content.
1445        let is_gitlink = tree_entry.is_some_and(|te| te.mode == 0o160000)
1446            || index_entries
1447                .get(path.as_bytes())
1448                .is_some_and(|ie| ie.mode == 0o160000);
1449        if is_gitlink {
1450            if let Some(te) = tree_entry {
1451                let sub_dir = work_tree.join(path);
1452                let sub_head = read_submodule_head_oid(&sub_dir);
1453                let index_oid = index_entries
1454                    .get(path.as_bytes())
1455                    .filter(|ie| ie.mode == 0o160000)
1456                    .map(|ie| ie.oid);
1457                let index_matches_tree = index_oid.is_some_and(|oid| oid == te.oid);
1458                let head_differs = sub_head.as_ref() != Some(&te.oid);
1459                let dirty_while_aligned = index_matches_tree
1460                    && !head_differs
1461                    && submodule_has_dirty_worktree_for_super_diff(work_tree, path, &te.oid);
1462                if head_differs || dirty_while_aligned {
1463                    // Raw `git diff <tree>` lines use a null OID on the worktree side when the
1464                    // checked-out submodule HEAD differs from the tree's gitlink; patch output still
1465                    // resolves the real commit from the submodule directory.
1466                    let new_oid = if head_differs { zero_oid() } else { te.oid };
1467                    result.push(DiffEntry {
1468                        status: DiffStatus::Modified,
1469                        old_path: Some(path.clone()),
1470                        new_path: Some(path.clone()),
1471                        old_mode: format_mode(te.mode),
1472                        new_mode: format_mode(te.mode),
1473                        old_oid: te.oid,
1474                        new_oid,
1475                        score: None,
1476                    });
1477                }
1478            }
1479            continue;
1480        }
1481
1482        let file_path = work_tree.join(path);
1483
1484        let wt_meta = match fs::symlink_metadata(&file_path) {
1485            Ok(m) => Some(m),
1486            Err(e) if e.kind() == std::io::ErrorKind::NotFound => None,
1487            Err(e) => return Err(Error::Io(e)),
1488        };
1489
1490        if unmerged_only_paths.contains(path) {
1491            if let (Some(te), Some(meta)) = (tree_entry, wt_meta.as_ref()) {
1492                let file_attrs = crlf::get_file_attrs(&attrs, path, false, &config);
1493                let wt_oid =
1494                    hash_worktree_file(odb, &file_path, meta, &conv, &file_attrs, path, None)?;
1495                let wt_mode = mode_from_metadata(meta);
1496                if wt_oid != te.oid || wt_mode != te.mode {
1497                    result.push(DiffEntry {
1498                        status: DiffStatus::Modified,
1499                        old_path: Some(path.clone()),
1500                        new_path: Some(path.clone()),
1501                        old_mode: format_mode(te.mode),
1502                        new_mode: format_mode(wt_mode),
1503                        old_oid: te.oid,
1504                        new_oid: wt_oid,
1505                        score: None,
1506                    });
1507                }
1508            }
1509            continue;
1510        }
1511
1512        match (tree_entry, wt_meta) {
1513            (Some(te), Some(ref meta)) => {
1514                let wt_mode = mode_from_metadata(meta);
1515                let Some(ie) = index_entries.get(path.as_bytes()) else {
1516                    continue;
1517                };
1518
1519                let index_matches_tree = ie.oid == te.oid && ie.mode == te.mode;
1520
1521                // Fully clean: index matches `HEAD`, worktree matches index, stat cache fresh.
1522                if index_matches_tree && wt_mode == te.mode && stat_matches(ie, meta) {
1523                    continue;
1524                }
1525
1526                let file_attrs = crlf::get_file_attrs(&attrs, path, false, &config);
1527                let idx_ent = index_entries.get(path.as_bytes()).copied();
1528
1529                // Staged mode (same blob as `HEAD`, different mode recorded in the index).
1530                if ie.oid == te.oid && ie.mode != te.mode {
1531                    result.push(DiffEntry {
1532                        status: DiffStatus::Modified,
1533                        old_path: Some(path.clone()),
1534                        new_path: Some(path.clone()),
1535                        old_mode: format_mode(te.mode),
1536                        new_mode: format_mode(ie.mode),
1537                        old_oid: te.oid,
1538                        new_oid: te.oid,
1539                        score: None,
1540                    });
1541                    continue;
1542                }
1543
1544                // Index still matches `HEAD`: only unstaged worktree drift (content and/or
1545                // worktree-only exec bit when `update-index` was not run — t4049 harness).
1546                if index_matches_tree {
1547                    let wt_oid = hash_worktree_file(
1548                        odb,
1549                        &file_path,
1550                        meta,
1551                        &conv,
1552                        &file_attrs,
1553                        path,
1554                        idx_ent,
1555                    )?;
1556                    let mut eff_oid = wt_oid;
1557                    if eff_oid != te.oid {
1558                        if let Ok(raw) = fs::read(&file_path) {
1559                            let raw_oid = Odb::hash_object_data(ObjectKind::Blob, &raw);
1560                            if raw_oid == te.oid {
1561                                eff_oid = te.oid;
1562                            }
1563                        }
1564                    }
1565                    if eff_oid != te.oid {
1566                        result.push(DiffEntry {
1567                            status: DiffStatus::Modified,
1568                            old_path: Some(path.clone()),
1569                            new_path: Some(path.clone()),
1570                            old_mode: format_mode(te.mode),
1571                            new_mode: format_mode(wt_mode),
1572                            old_oid: te.oid,
1573                            new_oid: eff_oid,
1574                            score: None,
1575                        });
1576                    } else if wt_mode != te.mode {
1577                        result.push(DiffEntry {
1578                            status: DiffStatus::Modified,
1579                            old_path: Some(path.clone()),
1580                            new_path: Some(path.clone()),
1581                            old_mode: format_mode(te.mode),
1582                            new_mode: format_mode(wt_mode),
1583                            old_oid: te.oid,
1584                            new_oid: te.oid,
1585                            score: None,
1586                        });
1587                    }
1588                    continue;
1589                }
1590
1591                // Staged content (and possibly mode): `git diff <rev>` is tree vs working tree.
1592                let wt_oid =
1593                    hash_worktree_file(odb, &file_path, meta, &conv, &file_attrs, path, idx_ent)?;
1594                let mut eff_oid = wt_oid;
1595                if eff_oid != te.oid {
1596                    if let Ok(raw) = fs::read(&file_path) {
1597                        let raw_oid = Odb::hash_object_data(ObjectKind::Blob, &raw);
1598                        if raw_oid == te.oid {
1599                            eff_oid = te.oid;
1600                        }
1601                    }
1602                }
1603                if eff_oid != te.oid || wt_mode != te.mode {
1604                    result.push(DiffEntry {
1605                        status: DiffStatus::Modified,
1606                        old_path: Some(path.clone()),
1607                        new_path: Some(path.clone()),
1608                        old_mode: format_mode(te.mode),
1609                        new_mode: format_mode(wt_mode),
1610                        old_oid: te.oid,
1611                        new_oid: eff_oid,
1612                        score: None,
1613                    });
1614                }
1615            }
1616            (Some(te), None) => {
1617                // In tree but missing from worktree
1618                result.push(DiffEntry {
1619                    status: DiffStatus::Deleted,
1620                    old_path: Some(path.clone()),
1621                    new_path: None,
1622                    old_mode: format_mode(te.mode),
1623                    new_mode: "000000".to_owned(),
1624                    old_oid: te.oid,
1625                    new_oid: zero_oid(),
1626                    score: None,
1627                });
1628            }
1629            (None, Some(ref meta)) => {
1630                // In index but not in tree, and exists in worktree
1631                let file_attrs = crlf::get_file_attrs(&attrs, path, false, &config);
1632                let wt_oid = hash_worktree_file(
1633                    odb,
1634                    &file_path,
1635                    meta,
1636                    &conv,
1637                    &file_attrs,
1638                    path,
1639                    index_entries.get(path.as_bytes()).copied(),
1640                )?;
1641                let wt_mode = mode_from_metadata(meta);
1642                result.push(DiffEntry {
1643                    status: DiffStatus::Added,
1644                    old_path: None,
1645                    new_path: Some(path.clone()),
1646                    old_mode: "000000".to_owned(),
1647                    new_mode: format_mode(wt_mode),
1648                    old_oid: zero_oid(),
1649                    new_oid: wt_oid,
1650                    score: None,
1651                });
1652            }
1653            (None, None) => {
1654                // Tracked in index but neither in tree nor worktree — skip
1655            }
1656        }
1657    }
1658
1659    result.sort_by(|a, b| a.path().cmp(b.path()));
1660    Ok(result)
1661}
1662
1663// ── Rename detection ────────────────────────────────────────────────
1664
1665fn read_added_entry_bytes(
1666    odb: &Odb,
1667    entry: &DiffEntry,
1668    work_root: Option<&Path>,
1669) -> Option<Vec<u8>> {
1670    if entry.new_oid != zero_oid() {
1671        return odb.read(&entry.new_oid).ok().map(|obj| obj.data);
1672    }
1673    let path = entry.new_path.as_deref()?;
1674    let root = work_root?;
1675    fs::read(root.join(path)).ok()
1676}
1677
1678fn modified_as_copy_from_sources(
1679    odb: &Odb,
1680    work_root: Option<&Path>,
1681    e: &DiffEntry,
1682    threshold: u32,
1683    sources: &[(String, ObjectId, bool)],
1684    source_contents: &[Option<Vec<u8>>],
1685    source_tree_entries: &[(String, String, ObjectId)],
1686) -> Option<DiffEntry> {
1687    fn regular_file_mode(mode: &str) -> bool {
1688        mode == "100644" || mode == "100755"
1689    }
1690
1691    if e.status != DiffStatus::Modified || !regular_file_mode(&e.new_mode) {
1692        return None;
1693    }
1694    let new_data = read_added_entry_bytes(odb, e, work_root)?;
1695    let new_oid_eff = if e.new_oid != zero_oid() {
1696        e.new_oid
1697    } else {
1698        Odb::hash_object_data(ObjectKind::Blob, &new_data)
1699    };
1700
1701    let mut best: Option<(usize, u32)> = None;
1702    for (si, (src_path, src_oid, is_deleted)) in sources.iter().enumerate() {
1703        if *is_deleted {
1704            continue;
1705        }
1706        if e.new_path.as_deref() == Some(src_path.as_str()) {
1707            continue;
1708        }
1709        let src_mode_str = source_tree_entries
1710            .iter()
1711            .find(|(p, _, _)| p == src_path)
1712            .map(|(_, m, _)| m.as_str())
1713            .unwrap_or("100644");
1714        if !regular_file_mode(src_mode_str) {
1715            continue;
1716        }
1717
1718        let score = if *src_oid == new_oid_eff {
1719            100
1720        } else {
1721            match (&source_contents[si], Some(new_data.as_slice())) {
1722                (Some(old_data), Some(nd)) => compute_similarity(old_data, nd),
1723                _ => 0,
1724            }
1725        };
1726        if score >= threshold {
1727            let replace = match best {
1728                None => true,
1729                Some((_, s)) => score > s,
1730            };
1731            if replace {
1732                best = Some((si, score));
1733            }
1734        }
1735    }
1736
1737    let (si, score) = best?;
1738    let (src_path, src_oid, _) = &sources[si];
1739    let src_mode = source_tree_entries
1740        .iter()
1741        .find(|(p, _, _)| p == src_path)
1742        .map(|(_, m, _)| m.clone())
1743        .unwrap_or_else(|| e.old_mode.clone());
1744
1745    Some(DiffEntry {
1746        status: DiffStatus::Copied,
1747        old_path: Some(src_path.clone()),
1748        new_path: e.new_path.clone(),
1749        old_mode: src_mode,
1750        new_mode: e.new_mode.clone(),
1751        old_oid: *src_oid,
1752        new_oid: e.new_oid,
1753        score: Some(score),
1754    })
1755}
1756
1757/// Detect renames by pairing Deleted and Added entries with similar content.
1758///
1759/// `threshold` is the minimum similarity percentage (0–100) for a pair to
1760/// be considered a rename (Git's default is 50%).  The function reads blob
1761/// content from the ODB to compute a line-level similarity score.
1762///
1763/// Exact-OID matches are always 100% similar regardless of content.
1764///
1765/// When `work_root` is set, added entries whose `new_oid` is the zero placeholder (as in
1766/// uncached `diff-index` when the work tree diverged from the index) load content from disk
1767/// under that root instead of the object database.
1768pub fn detect_renames(
1769    odb: &Odb,
1770    work_root: Option<&Path>,
1771    entries: Vec<DiffEntry>,
1772    threshold: u32,
1773) -> Vec<DiffEntry> {
1774    // Split entries into deleted, added, and others.
1775    let mut deleted: Vec<DiffEntry> = Vec::new();
1776    let mut added: Vec<DiffEntry> = Vec::new();
1777    let mut others: Vec<DiffEntry> = Vec::new();
1778
1779    for entry in entries {
1780        match entry.status {
1781            DiffStatus::Deleted => deleted.push(entry),
1782            DiffStatus::Added => added.push(entry),
1783            _ => others.push(entry),
1784        }
1785    }
1786
1787    if deleted.is_empty() || added.is_empty() {
1788        // Nothing to pair — return original order.
1789        let mut result = others;
1790        result.extend(deleted);
1791        result.extend(added);
1792        result.sort_by(|a, b| a.path().cmp(b.path()));
1793        return result;
1794    }
1795
1796    // Read content for all deleted blobs.
1797    let deleted_contents: Vec<Option<Vec<u8>>> = deleted
1798        .iter()
1799        .map(|d| odb.read(&d.old_oid).ok().map(|obj| obj.data))
1800        .collect();
1801
1802    // Read content for all added blobs.
1803    let added_contents: Vec<Option<Vec<u8>>> = added
1804        .iter()
1805        .map(|a| read_added_entry_bytes(odb, a, work_root))
1806        .collect();
1807
1808    // Build a matrix of similarity scores and find the best pairings.
1809    // We use a greedy approach: pick the highest-scoring pair first.
1810    let mut scores: Vec<(u32, usize, usize)> = Vec::new();
1811
1812    fn is_regularish_mode(mode: &str) -> bool {
1813        mode == "100644" || mode == "100755"
1814    }
1815
1816    for (di, del) in deleted.iter().enumerate() {
1817        for (ai, add) in added.iter().enumerate() {
1818            // Exact OID match → 100%
1819            if del.old_oid == add.new_oid {
1820                scores.push((100, di, ai));
1821                continue;
1822            }
1823
1824            // Do not use line similarity across file types (e.g. regular ↔ symlink); Git keeps these
1825            // as separate changes (`t4008-diff-break-rewrite` #7).
1826            if !is_regularish_mode(&del.old_mode) || !is_regularish_mode(&add.new_mode) {
1827                continue;
1828            }
1829
1830            let score = match (&deleted_contents[di], &added_contents[ai]) {
1831                (Some(old_data), Some(new_data)) => compute_similarity(old_data, new_data),
1832                _ => 0,
1833            };
1834
1835            if score >= threshold {
1836                scores.push((score, di, ai));
1837            }
1838        }
1839    }
1840
1841    // Sort: prefer same-basename pairs first, then by score descending.
1842    // This matches Git's behavior where basename matches are checked first.
1843    scores.sort_by(|a, b| {
1844        let a_same = same_basename(&deleted[a.1], &added[a.2]);
1845        let b_same = same_basename(&deleted[b.1], &added[b.2]);
1846        b_same.cmp(&a_same).then_with(|| b.0.cmp(&a.0))
1847    });
1848
1849    let mut used_deleted = vec![false; deleted.len()];
1850    let mut used_added = vec![false; added.len()];
1851    let mut renames: Vec<DiffEntry> = Vec::new();
1852
1853    for (score, di, ai) in &scores {
1854        if used_deleted[*di] || used_added[*ai] {
1855            continue;
1856        }
1857        used_deleted[*di] = true;
1858        used_added[*ai] = true;
1859
1860        let del = &deleted[*di];
1861        let add = &added[*ai];
1862
1863        renames.push(DiffEntry {
1864            status: DiffStatus::Renamed,
1865            old_path: del.old_path.clone(),
1866            new_path: add.new_path.clone(),
1867            old_mode: del.old_mode.clone(),
1868            new_mode: add.new_mode.clone(),
1869            old_oid: del.old_oid,
1870            new_oid: add.new_oid,
1871            score: Some(*score),
1872        });
1873    }
1874
1875    // Collect unmatched entries.
1876    let mut result = others;
1877    result.extend(renames);
1878    for (i, entry) in deleted.into_iter().enumerate() {
1879        if !used_deleted[i] {
1880            result.push(entry);
1881        }
1882    }
1883    for (i, entry) in added.into_iter().enumerate() {
1884        if !used_added[i] {
1885            result.push(entry);
1886        }
1887    }
1888
1889    result.sort_by(|a, b| a.path().cmp(b.path()));
1890    result
1891}
1892
1893/// Detect copies among diff entries.
1894///
1895/// This first runs rename detection (pairing Deleted+Added), then for any
1896/// remaining Added entries, looks for copy sources.
1897///
1898/// - `find_copies_harder` = false: only Modified entries are copy source candidates.
1899/// - `find_copies_harder` = true: also examine unmodified files from `source_tree_entries`.
1900///
1901/// `source_tree_entries` should be a list of (path, mode, oid) from the source tree;
1902/// used when `find_copies_harder` is true to consider unmodified files as copy sources.
1903pub fn detect_copies(
1904    odb: &Odb,
1905    work_root: Option<&Path>,
1906    entries: Vec<DiffEntry>,
1907    threshold: u32,
1908    find_copies_harder: bool,
1909    source_tree_entries: &[(String, String, ObjectId)],
1910) -> Vec<DiffEntry> {
1911    use std::collections::{HashMap, HashSet};
1912
1913    // Separate entries by status.
1914    let mut deleted: Vec<DiffEntry> = Vec::new();
1915    let mut added: Vec<DiffEntry> = Vec::new();
1916    let mut others: Vec<DiffEntry> = Vec::new();
1917
1918    for entry in entries {
1919        match entry.status {
1920            DiffStatus::Deleted => deleted.push(entry),
1921            DiffStatus::Added => added.push(entry),
1922            _ => others.push(entry),
1923        }
1924    }
1925
1926    // Build source candidates: deleted files, modified files, and optionally tree entries.
1927    // Track which sources are from deleted files (can become renames).
1928    let mut sources: Vec<(String, ObjectId, bool)> = Vec::new(); // (path, oid, is_deleted)
1929    let mut deleted_source_idx: HashMap<String, usize> = HashMap::new();
1930
1931    for entry in &deleted {
1932        if let Some(ref path) = entry.old_path {
1933            deleted_source_idx.insert(path.clone(), sources.len());
1934            sources.push((path.clone(), entry.old_oid, true));
1935        }
1936    }
1937
1938    // Modified and type-changed files are candidates for `-C` (e.g. symlink rewrite leaves the
1939    // old blob available as a copy source for another path; see `t4008-diff-break-rewrite`).
1940    for entry in &others {
1941        if matches!(entry.status, DiffStatus::Modified | DiffStatus::TypeChanged) {
1942            if let Some(ref old_path) = entry.old_path {
1943                if !sources.iter().any(|(p, _, _)| p == old_path) {
1944                    sources.push((old_path.clone(), entry.old_oid, false));
1945                }
1946            }
1947        }
1948    }
1949
1950    // With find_copies_harder, add all source tree entries.
1951    if find_copies_harder {
1952        for (path, _mode, oid) in source_tree_entries {
1953            if !sources.iter().any(|(p, _, _)| p == path) {
1954                sources.push((path.clone(), *oid, false));
1955            }
1956        }
1957    }
1958
1959    if sources.is_empty() {
1960        let mut result = others;
1961        result.extend(deleted);
1962        result.extend(added);
1963        result.sort_by(|a, b| a.path().cmp(b.path()));
1964        return result;
1965    }
1966
1967    // Read content for sources.
1968    let source_contents: Vec<Option<Vec<u8>>> = sources
1969        .iter()
1970        .map(|(_, oid, _)| odb.read(oid).ok().map(|obj| obj.data))
1971        .collect();
1972
1973    let mut result_entries: Vec<DiffEntry> = Vec::new();
1974    let mut renamed_deleted: HashSet<usize> = HashSet::new();
1975    let mut used_added2 = vec![false; added.len()];
1976
1977    if !added.is_empty() {
1978        // Read content for added blobs.
1979        let added_contents: Vec<Option<Vec<u8>>> = added
1980            .iter()
1981            .map(|a| read_added_entry_bytes(odb, a, work_root))
1982            .collect();
1983
1984        // Build score matrix: (score, source_idx, added_idx)
1985        let mut scores: Vec<(u32, usize, usize)> = Vec::new();
1986        for (si, (src_path, src_oid, _)) in sources.iter().enumerate() {
1987            for (ai, add) in added.iter().enumerate() {
1988                // Never pair a path with itself as copy source (matches Git; avoids
1989                // arbitrary tie-breaking when several sources share the same blob).
1990                if add.new_path.as_deref() == Some(src_path.as_str()) {
1991                    continue;
1992                }
1993                let add_oid = if add.new_oid != zero_oid() {
1994                    add.new_oid
1995                } else if let Some(ref data) = added_contents[ai] {
1996                    Odb::hash_object_data(ObjectKind::Blob, data)
1997                } else {
1998                    zero_oid()
1999                };
2000                if *src_oid == add_oid {
2001                    scores.push((100, si, ai));
2002                    continue;
2003                }
2004                let score = match (&source_contents[si], &added_contents[ai]) {
2005                    (Some(old_data), Some(new_data)) => compute_similarity(old_data, new_data),
2006                    _ => 0,
2007                };
2008                if score >= threshold {
2009                    scores.push((score, si, ai));
2010                }
2011            }
2012        }
2013
2014        // Sort by score descending.
2015        scores.sort_by(|a, b| b.0.cmp(&a.0));
2016
2017        // Build source->added mappings, each added file assigned to best source.
2018        let mut used_added = vec![false; added.len()];
2019        let mut source_to_added: HashMap<usize, Vec<(usize, u32)>> = HashMap::new();
2020        for &(score, si, ai) in &scores {
2021            if used_added[ai] {
2022                continue;
2023            }
2024            used_added[ai] = true;
2025            source_to_added.entry(si).or_default().push((ai, score));
2026        }
2027
2028        // For each deleted source, pick one assignment as Rename, rest as Copy.
2029        for (&si, assignments_for_src) in &source_to_added {
2030            let (_, _, is_deleted) = &sources[si];
2031            if *is_deleted && !assignments_for_src.is_empty() {
2032                // Pick the last one (by path) as the rename target.
2033                // Git tends to pick the rename as the last alphabetically.
2034                let rename_ai = assignments_for_src
2035                    .iter()
2036                    .max_by_key(|(ai, _score)| added[*ai].path().to_string())
2037                    .map(|(ai, _)| *ai);
2038
2039                for &(ai, score) in assignments_for_src {
2040                    let (ref src_path, _, _) = sources[si];
2041                    let add = &added[ai];
2042                    let src_mode = source_tree_entries
2043                        .iter()
2044                        .find(|(p, _, _)| p == src_path)
2045                        .map(|(_, m, _)| m.clone())
2046                        .unwrap_or_else(|| add.old_mode.clone());
2047
2048                    let is_rename = Some(ai) == rename_ai;
2049                    result_entries.push(DiffEntry {
2050                        status: if is_rename {
2051                            DiffStatus::Renamed
2052                        } else {
2053                            DiffStatus::Copied
2054                        },
2055                        old_path: Some(src_path.clone()),
2056                        new_path: add.new_path.clone(),
2057                        old_mode: src_mode,
2058                        new_mode: add.new_mode.clone(),
2059                        old_oid: sources[si].1,
2060                        new_oid: add.new_oid,
2061                        score: Some(score),
2062                    });
2063                    used_added2[ai] = true;
2064                }
2065                renamed_deleted.insert(si);
2066            } else {
2067                // Non-deleted source: all assignments are copies.
2068                for &(ai, score) in assignments_for_src {
2069                    let (ref src_path, _, _) = sources[si];
2070                    let add = &added[ai];
2071                    let src_mode = source_tree_entries
2072                        .iter()
2073                        .find(|(p, _, _)| p == src_path)
2074                        .map(|(_, m, _)| m.clone())
2075                        .unwrap_or_else(|| add.old_mode.clone());
2076
2077                    result_entries.push(DiffEntry {
2078                        status: DiffStatus::Copied,
2079                        old_path: Some(src_path.clone()),
2080                        new_path: add.new_path.clone(),
2081                        old_mode: src_mode,
2082                        new_mode: add.new_mode.clone(),
2083                        old_oid: sources[si].1,
2084                        new_oid: add.new_oid,
2085                        score: Some(score),
2086                    });
2087                    used_added2[ai] = true;
2088                }
2089            }
2090        }
2091    }
2092
2093    // Keep deleted entries that weren't consumed by a rename.
2094    for entry in deleted.into_iter() {
2095        if let Some(ref path) = entry.old_path {
2096            if let Some(&si) = deleted_source_idx.get(path) {
2097                if renamed_deleted.contains(&si) {
2098                    // This deletion was consumed by a rename; skip it.
2099                    continue;
2100                }
2101            }
2102        }
2103        result_entries.push(entry);
2104    }
2105
2106    let mut result = others;
2107    result.extend(result_entries);
2108    // Keep unmatched added entries.
2109    for (i, entry) in added.into_iter().enumerate() {
2110        if !used_added2[i] {
2111            result.push(entry);
2112        }
2113    }
2114
2115    let mut final_result = Vec::with_capacity(result.len());
2116    for e in result {
2117        if let Some(c) = modified_as_copy_from_sources(
2118            odb,
2119            work_root,
2120            &e,
2121            threshold,
2122            &sources,
2123            &source_contents,
2124            source_tree_entries,
2125        ) {
2126            final_result.push(c);
2127        } else {
2128            final_result.push(e);
2129        }
2130    }
2131
2132    final_result.sort_by(|a, b| a.path().cmp(b.path()));
2133    final_result
2134}
2135
2136/// Apply Git-style rename and optional copy detection for index↔worktree diffs.
2137///
2138/// When `copies` is true (Git `diff.renames` / `status.renames` set to `copy`/`copies`),
2139/// runs [`detect_copies`] after rename detection so added files can match unchanged
2140/// paths from `HEAD` (e.g. intent-to-add copies).
2141///
2142/// # Errors
2143///
2144/// Propagates errors from reading the `head_tree` object from `odb`.
2145pub fn status_apply_rename_copy_detection(
2146    odb: &Odb,
2147    unstaged_raw: Vec<DiffEntry>,
2148    threshold: u32,
2149    copies: bool,
2150    head_tree: Option<&ObjectId>,
2151) -> Result<Vec<DiffEntry>> {
2152    let after_renames = detect_renames(odb, None, unstaged_raw, threshold);
2153    if !copies {
2154        return Ok(after_renames);
2155    }
2156    let source_tree_entries: Vec<(String, String, ObjectId)> = match head_tree {
2157        Some(oid) => flatten_tree(odb, oid, "")?
2158            .into_iter()
2159            .map(|e| (e.path, format_mode(e.mode), e.oid))
2160            .collect(),
2161        None => Vec::new(),
2162    };
2163    Ok(detect_copies(
2164        odb,
2165        None,
2166        after_renames,
2167        threshold,
2168        false,
2169        &source_tree_entries,
2170    ))
2171}
2172
2173/// Format a rename pair using Git's compact path format.
2174///
2175/// Examples:
2176/// - `a/b/c` → `c/b/a` → `a/b/c => c/b/a`
2177/// - `c/b/a` → `c/d/e` → `c/{b/a => d/e}`
2178/// - `c/d/e` → `d/e` → `{c/d => d}/e`
2179/// - `d/e` → `d/f/e` → `d/{ => f}/e`
2180pub fn format_rename_path(old: &str, new: &str) -> String {
2181    let ob = old.as_bytes();
2182    let nb = new.as_bytes();
2183
2184    // Find common prefix length, snapped to '/' boundary.
2185    let pfx = {
2186        let mut last_sep = 0usize;
2187        let min_len = ob.len().min(nb.len());
2188        for i in 0..min_len {
2189            if ob[i] != nb[i] {
2190                break;
2191            }
2192            if ob[i] == b'/' {
2193                last_sep = i + 1;
2194            }
2195        }
2196        last_sep
2197    };
2198
2199    // Find common suffix length, snapped to '/' boundary.
2200    let mut sfx = {
2201        let mut last_sep = 0usize;
2202        let min_len = ob.len().min(nb.len());
2203        for i in 0..min_len {
2204            let oi = ob.len() - 1 - i;
2205            let ni = nb.len() - 1 - i;
2206            if ob[oi] != nb[ni] {
2207                break;
2208            }
2209            if ob[oi] == b'/' {
2210                last_sep = i + 1;
2211            }
2212        }
2213        last_sep
2214    };
2215
2216    // Suffix starts at this position in each string.
2217    let mut sfx_at_old = ob.len() - sfx;
2218    let mut sfx_at_new = nb.len() - sfx;
2219
2220    // If prefix and suffix overlap in both strings (both middles empty),
2221    // reduce the suffix so that at least the longer string has a non-empty middle.
2222    while pfx > sfx_at_old && pfx > sfx_at_new && sfx > 0 {
2223        // Reduce suffix by snapping to the next smaller '/' boundary.
2224        let suffix_bytes = &ob[sfx_at_old..];
2225        let mut new_sfx = 0;
2226        // Find the next '/' after sfx_at_old (i.e., reduce suffix).
2227        for (i, &b) in suffix_bytes.iter().enumerate().skip(1) {
2228            if b == b'/' {
2229                new_sfx = sfx - i;
2230                break;
2231            }
2232        }
2233        if new_sfx == 0 || new_sfx >= sfx {
2234            sfx_at_old = ob.len();
2235            sfx_at_new = nb.len();
2236            break;
2237        }
2238        sfx = new_sfx;
2239        sfx_at_old = ob.len() - sfx;
2240        sfx_at_new = nb.len() - sfx;
2241    }
2242
2243    // When prefix and suffix overlap in the shorter string, they share
2244    // the '/' boundary character. In the output format, the shared '/'
2245    // appears in both positions (e.g. "d/{ => f}/e" for d/e → d/f/e).
2246    // Compute the middle parts. When prefix and suffix overlap in a
2247    // string, the middle for that string is empty. The shared '/' shows
2248    // in both prefix (trailing) and suffix (leading) positions.
2249    let prefix = &old[..pfx];
2250    let suffix = &old[sfx_at_old..];
2251    let old_mid = if pfx <= sfx_at_old {
2252        &old[pfx..sfx_at_old]
2253    } else {
2254        ""
2255    };
2256    let new_mid = if pfx <= sfx_at_new {
2257        &new[pfx..sfx_at_new]
2258    } else {
2259        ""
2260    };
2261
2262    if prefix.is_empty() && suffix.is_empty() {
2263        return format!("{old} => {new}");
2264    }
2265
2266    format!("{prefix}{{{old_mid} => {new_mid}}}{suffix}")
2267}
2268
2269/// Check if two entries share the same filename (basename).
2270fn same_basename(del: &DiffEntry, add: &DiffEntry) -> bool {
2271    let old = del.old_path.as_deref().unwrap_or("");
2272    let new = add.new_path.as_deref().unwrap_or("");
2273    let old_base = old.rsplit('/').next().unwrap_or(old);
2274    let new_base = new.rsplit('/').next().unwrap_or(new);
2275    old_base == new_base && !old_base.is_empty()
2276}
2277
2278/// Compute a similarity percentage (0–100) between two byte slices.
2279///
2280/// Uses Git's approach: count the bytes that are "shared" (appear in
2281/// equal lines), then compute `score = shared_bytes * 2 * 100 / (src_size + dst_size)`.
2282fn compute_similarity(old: &[u8], new: &[u8]) -> u32 {
2283    // Normalize CRLF → LF before comparing so that files differing
2284    // only in line endings are detected as renames.
2285    let old_norm = crate::crlf::crlf_to_lf(old);
2286    let new_norm = crate::crlf::crlf_to_lf(new);
2287
2288    let src_size = old_norm.len();
2289    let dst_size = new_norm.len();
2290
2291    if src_size == 0 && dst_size == 0 {
2292        return 100;
2293    }
2294    let total = src_size + dst_size;
2295    if total == 0 {
2296        return 100;
2297    }
2298
2299    // Use line-level diff to find shared content, then count bytes.
2300    use similar::{ChangeTag, TextDiff};
2301    let old_str = String::from_utf8_lossy(&old_norm);
2302    let new_str = String::from_utf8_lossy(&new_norm);
2303    let diff = TextDiff::from_lines(&old_str as &str, &new_str as &str);
2304
2305    let mut shared_bytes = 0usize;
2306    for change in diff.iter_all_changes() {
2307        if change.tag() == ChangeTag::Equal {
2308            // Count bytes in the matching line (including newline).
2309            shared_bytes += change.value().len();
2310        }
2311    }
2312
2313    // Git: score = copied * MAX_SCORE / max(src_size, dst_size)
2314    // We normalize to 0-100.
2315    let max_size = src_size.max(dst_size);
2316
2317    ((shared_bytes * 100) / max_size).min(100) as u32
2318}
2319
2320/// Compute rename/copy similarity percentage (0–100) between two byte slices.
2321///
2322/// This uses the same scoring logic as internal rename detection.
2323#[must_use]
2324pub fn rename_similarity_score(old: &[u8], new: &[u8]) -> u32 {
2325    compute_similarity(old, new)
2326}
2327
2328// ── Output formatting ───────────────────────────────────────────────
2329
2330/// Format a diff entry in Git's raw diff format.
2331///
2332/// Example: `:100644 100644 abc1234... def5678... M\tfile.txt`
2333pub fn format_raw(entry: &DiffEntry) -> String {
2334    let path = match entry.status {
2335        DiffStatus::Renamed | DiffStatus::Copied => {
2336            format!(
2337                "{}\t{}",
2338                entry.old_path.as_deref().unwrap_or(""),
2339                entry.new_path.as_deref().unwrap_or("")
2340            )
2341        }
2342        _ => entry.path().to_owned(),
2343    };
2344
2345    let status_str = match (entry.status, entry.score) {
2346        (DiffStatus::Renamed, Some(s)) => format!("R{:03}", s),
2347        (DiffStatus::Copied, Some(s)) => format!("C{:03}", s),
2348        _ => entry.status.letter().to_string(),
2349    };
2350
2351    format!(
2352        ":{} {} {} {} {}\t{}",
2353        entry.old_mode, entry.new_mode, entry.old_oid, entry.new_oid, status_str, path
2354    )
2355}
2356
2357/// Format a diff entry with abbreviated OIDs.
2358pub fn format_raw_abbrev(entry: &DiffEntry, abbrev_len: usize) -> String {
2359    let ellipsis = if std::env::var("GIT_PRINT_SHA1_ELLIPSIS").ok().as_deref() == Some("yes") {
2360        "..."
2361    } else {
2362        ""
2363    };
2364    let old_hex = format!("{}", entry.old_oid);
2365    let new_hex = format!("{}", entry.new_oid);
2366    let old_abbrev = &old_hex[..abbrev_len.min(old_hex.len())];
2367    let new_abbrev = &new_hex[..abbrev_len.min(new_hex.len())];
2368
2369    let path = entry.path();
2370
2371    format!(
2372        ":{} {} {}{} {}{} {}\t{}",
2373        entry.old_mode,
2374        entry.new_mode,
2375        old_abbrev,
2376        ellipsis,
2377        new_abbrev,
2378        ellipsis,
2379        entry.status.letter(),
2380        path
2381    )
2382}
2383
2384/// Generate a unified diff patch for two blobs.
2385///
2386/// # Parameters
2387///
2388/// - `old_content` — the old file content (empty for added files).
2389/// - `new_content` — the new file content (empty for deleted files).
2390/// - `old_path` — display path for the old side.
2391/// - `new_path` — display path for the new side.
2392/// - `context_lines` — number of context lines around changes (default: 3).
2393/// - Inter-hunk context defaults to `0` (see [`unified_diff_with_prefix`]).
2394///
2395/// # Returns
2396///
2397/// The unified diff as a string.
2398pub fn unified_diff(
2399    old_content: &str,
2400    new_content: &str,
2401    old_path: &str,
2402    new_path: &str,
2403    context_lines: usize,
2404    indent_heuristic: bool,
2405    quote_path_fully: bool,
2406) -> String {
2407    unified_diff_with_prefix(
2408        old_content,
2409        new_content,
2410        old_path,
2411        new_path,
2412        context_lines,
2413        0,
2414        "a/",
2415        "b/",
2416        indent_heuristic,
2417        quote_path_fully,
2418    )
2419}
2420
2421/// Same as `unified_diff` but with configurable source/destination prefixes.
2422///
2423/// `inter_hunk_context` is Git's `--inter-hunk-context`: adjacent hunks merge when
2424/// the unchanged gap between them is at most `2 * context_lines + inter_hunk_context` lines.
2425#[allow(clippy::too_many_arguments)] // Mirrors Git-style unified diff parameters.
2426pub fn unified_diff_with_prefix(
2427    old_content: &str,
2428    new_content: &str,
2429    old_path: &str,
2430    new_path: &str,
2431    context_lines: usize,
2432    inter_hunk_context: usize,
2433    src_prefix: &str,
2434    dst_prefix: &str,
2435    indent_heuristic: bool,
2436    quote_path_fully: bool,
2437) -> String {
2438    unified_diff_with_prefix_and_funcname(
2439        old_content,
2440        new_content,
2441        old_path,
2442        new_path,
2443        context_lines,
2444        inter_hunk_context,
2445        src_prefix,
2446        dst_prefix,
2447        None,
2448        indent_heuristic,
2449        quote_path_fully,
2450    )
2451}
2452
2453/// Same as [`unified_diff_with_prefix`] with optional custom hunk-header
2454/// function-name matching.
2455#[allow(clippy::too_many_arguments)]
2456pub fn unified_diff_with_prefix_and_funcname(
2457    old_content: &str,
2458    new_content: &str,
2459    old_path: &str,
2460    new_path: &str,
2461    context_lines: usize,
2462    inter_hunk_context: usize,
2463    src_prefix: &str,
2464    dst_prefix: &str,
2465    funcname_matcher: Option<&FuncnameMatcher>,
2466    indent_heuristic: bool,
2467    quote_path_fully: bool,
2468) -> String {
2469    unified_diff_with_prefix_and_funcname_and_algorithm(
2470        old_content,
2471        new_content,
2472        old_path,
2473        new_path,
2474        context_lines,
2475        inter_hunk_context,
2476        src_prefix,
2477        dst_prefix,
2478        funcname_matcher,
2479        similar::Algorithm::Myers,
2480        false,
2481        false,
2482        indent_heuristic,
2483        quote_path_fully,
2484    )
2485}
2486
2487/// Same as [`unified_diff_with_prefix_and_funcname`] but allows callers to
2488/// choose the line diff algorithm used for hunk generation.
2489///
2490/// When `function_context` is true (`git diff -W`), hunks are expanded to
2491/// whole logical functions using the same rules as Git's `XDL_EMIT_FUNCCONTEXT`.
2492#[allow(clippy::too_many_arguments)]
2493pub fn unified_diff_with_prefix_and_funcname_and_algorithm(
2494    old_content: &str,
2495    new_content: &str,
2496    old_path: &str,
2497    new_path: &str,
2498    context_lines: usize,
2499    inter_hunk_context: usize,
2500    src_prefix: &str,
2501    dst_prefix: &str,
2502    funcname_matcher: Option<&FuncnameMatcher>,
2503    algorithm: similar::Algorithm,
2504    function_context: bool,
2505    use_git_histogram: bool,
2506    indent_heuristic: bool,
2507    quote_path_fully: bool,
2508) -> String {
2509    if use_git_histogram {
2510        return unified_diff_histogram_with_prefix_and_funcname(
2511            old_content,
2512            new_content,
2513            old_path,
2514            new_path,
2515            context_lines,
2516            inter_hunk_context,
2517            src_prefix,
2518            dst_prefix,
2519            funcname_matcher,
2520            quote_path_fully,
2521        );
2522    }
2523
2524    if function_context {
2525        return unified_diff_with_function_context(
2526            old_content,
2527            new_content,
2528            old_path,
2529            new_path,
2530            context_lines,
2531            inter_hunk_context,
2532            src_prefix,
2533            dst_prefix,
2534            funcname_matcher,
2535            algorithm,
2536            indent_heuristic,
2537            quote_path_fully,
2538        );
2539    }
2540
2541    use crate::quote_path::format_diff_path_with_prefix;
2542    use similar::{group_diff_ops, udiff::UnifiedDiffHunk, TextDiff};
2543
2544    let diff = TextDiff::configure()
2545        .algorithm(algorithm)
2546        .diff_lines(old_content, new_content);
2547    let compacted_ops = diff_indent_heuristic::diff_lines_ops_compacted(
2548        old_content,
2549        new_content,
2550        algorithm,
2551        indent_heuristic,
2552    );
2553
2554    let mut output = String::new();
2555    if old_path == "/dev/null" {
2556        output.push_str("--- /dev/null\n");
2557    } else if src_prefix.is_empty() {
2558        // Callers (e.g. `diff-tree`, `diff-index`) may pass a fully formatted token
2559        // (already includes `a/` and any C-style quoting).
2560        output.push_str(&format!("--- {old_path}\n"));
2561    } else {
2562        output.push_str("--- ");
2563        output.push_str(&format_diff_path_with_prefix(
2564            src_prefix,
2565            old_path,
2566            quote_path_fully,
2567        ));
2568        output.push('\n');
2569    }
2570    if new_path == "/dev/null" {
2571        output.push_str("+++ /dev/null\n");
2572    } else if dst_prefix.is_empty() {
2573        output.push_str(&format!("+++ {new_path}\n"));
2574    } else {
2575        output.push_str("+++ ");
2576        output.push_str(&format_diff_path_with_prefix(
2577            dst_prefix,
2578            new_path,
2579            quote_path_fully,
2580        ));
2581        output.push('\n');
2582    }
2583
2584    let old_lines: Vec<&str> = old_content.lines().collect();
2585
2586    // `similar::group_diff_ops` ends a hunk when an unchanged run has length > `2 * n`.
2587    // Git's xdiff merges adjacent changes while the gap between them in the old file is at most
2588    // `2 * context_lines + inter_hunk_context` (see `xdl_get_hunk` in xemit.c). Match that by
2589    // choosing `n` so `2 * n` equals that maximum merged gap (rounded up when the sum is odd).
2590    let max_common_gap = context_lines
2591        .saturating_mul(2)
2592        .saturating_add(inter_hunk_context);
2593    let group_radius = max_common_gap.div_ceil(2);
2594    let op_groups = group_diff_ops(compacted_ops, group_radius);
2595
2596    for ops in op_groups {
2597        if ops.is_empty() {
2598            continue;
2599        }
2600        let hunk = UnifiedDiffHunk::new(ops, &diff, true);
2601        let hunk_str = format!("{hunk}");
2602        // The similar crate outputs @@ -a,b +c,d @@\n but Git adds
2603        // function context after the closing @@. Extract the hunk header
2604        // and add function context.
2605        if let Some(first_newline) = hunk_str.find('\n') {
2606            let header_line = &hunk_str[..first_newline];
2607            let rest = &hunk_str[first_newline..];
2608
2609            // Parse the old start line from the @@ header
2610            if let Some(func_ctx) =
2611                extract_function_context(header_line, &old_lines, funcname_matcher)
2612            {
2613                output.push_str(header_line);
2614                output.push(' ');
2615                output.push_str(&func_ctx);
2616                output.push_str(rest);
2617            } else {
2618                output.push_str(&hunk_str);
2619            }
2620        } else {
2621            output.push_str(&hunk_str);
2622        }
2623    }
2624
2625    output
2626}
2627
2628/// `git diff -W`: expand each hunk to include full function bodies (see Git `xemit.c`).
2629fn unified_diff_with_function_context(
2630    old_content: &str,
2631    new_content: &str,
2632    old_path: &str,
2633    new_path: &str,
2634    context_lines: usize,
2635    inter_hunk_context: usize,
2636    src_prefix: &str,
2637    dst_prefix: &str,
2638    funcname_matcher: Option<&FuncnameMatcher>,
2639    algorithm: similar::Algorithm,
2640    indent_heuristic: bool,
2641    quote_path_fully: bool,
2642) -> String {
2643    use crate::quote_path::format_diff_path_with_prefix;
2644    use similar::{group_diff_ops, udiff::UnifiedDiffHunk, TextDiff};
2645
2646    let diff = TextDiff::configure()
2647        .algorithm(algorithm)
2648        .diff_lines(old_content, new_content);
2649
2650    let old_lines: Vec<&str> = old_content.lines().collect();
2651    let new_lines: Vec<&str> = new_content.lines().collect();
2652    let n_old = old_lines.len();
2653    let n_new = new_lines.len();
2654
2655    let group_radius = context_lines
2656        .saturating_mul(2)
2657        .saturating_add(inter_hunk_context);
2658    let all_ops = diff.ops().to_vec();
2659    let op_groups = group_diff_ops(all_ops.clone(), group_radius);
2660
2661    let mut ranges: Vec<(usize, usize, usize, usize)> = Vec::new();
2662
2663    for ops in op_groups {
2664        if ops.is_empty() {
2665            continue;
2666        }
2667        let i1_anchor = func_context_old_anchor(&ops, n_old);
2668        let i1_end = hunk_old_change_end_exclusive(&ops);
2669        let skip_preimage_pull =
2670            append_with_whole_function_added(&ops, n_old, n_new, &new_lines, funcname_matcher);
2671        let hunk = UnifiedDiffHunk::new(ops, &diff, true);
2672        let hunk_str = format!("{hunk}");
2673        let header_line = hunk_str
2674            .lines()
2675            .next()
2676            .unwrap_or("")
2677            .trim_end_matches(['\r', '\n']);
2678        let Some((base_s1, base_e1, _base_s2, _base_e2)) =
2679            parse_unified_hunk_header_ranges(header_line)
2680        else {
2681            continue;
2682        };
2683
2684        let ctx = context_lines;
2685        let (s1, e1, s2, e2) = if skip_preimage_pull {
2686            let s = n_old.saturating_sub(ctx);
2687            let s2 = map_old_line_to_new(&all_ops, s, n_new).min(n_new);
2688            (s, n_old, s2, n_new)
2689        } else {
2690            let mut s1 = base_s1.saturating_sub(ctx);
2691            let mut s2 = map_old_line_to_new(&all_ops, s1, n_new).min(n_new);
2692
2693            let base_pre_s1 = i1_anchor.saturating_sub(ctx);
2694            if base_pre_s1 < s1 {
2695                s1 = base_pre_s1;
2696                s2 = map_old_line_to_new(&all_ops, s1, n_new).min(n_new);
2697            }
2698
2699            let fs1 = expand_func_pre_start(s1, i1_anchor, n_old, &old_lines, funcname_matcher);
2700            if fs1 < s1 {
2701                s1 = fs1;
2702                s2 = map_old_line_to_new(&all_ops, s1, n_new).min(n_new);
2703            }
2704
2705            let mut e1 = (base_e1 + ctx).min(n_old);
2706            let mut e2 = map_old_line_to_new(&all_ops, e1, n_new).min(n_new);
2707            let fe1 = expand_func_post_end(e1, i1_end, n_old, &old_lines, funcname_matcher);
2708            if fe1 > e1 {
2709                e1 = fe1;
2710                e2 = map_old_line_to_new(&all_ops, e1, n_new).min(n_new);
2711            }
2712            (s1, e1, s2, e2)
2713        };
2714
2715        ranges.push((s1, e1, s2, e2));
2716    }
2717
2718    let mut output = String::new();
2719    if old_path == "/dev/null" {
2720        output.push_str("--- /dev/null\n");
2721    } else if src_prefix.is_empty() {
2722        output.push_str(&format!("--- {old_path}\n"));
2723    } else {
2724        output.push_str("--- ");
2725        output.push_str(&format_diff_path_with_prefix(
2726            src_prefix,
2727            old_path,
2728            quote_path_fully,
2729        ));
2730        output.push('\n');
2731    }
2732    if new_path == "/dev/null" {
2733        output.push_str("+++ /dev/null\n");
2734    } else if dst_prefix.is_empty() {
2735        output.push_str(&format!("+++ {new_path}\n"));
2736    } else {
2737        output.push_str("+++ ");
2738        output.push_str(&format_diff_path_with_prefix(
2739            dst_prefix,
2740            new_path,
2741            quote_path_fully,
2742        ));
2743        output.push('\n');
2744    }
2745
2746    for (s1, e1, s2, e2) in ranges {
2747        if s1 >= e1 && s2 >= e2 {
2748            continue;
2749        }
2750        let old_seg =
2751            line_slice_for_diff_with_eof_nl(&old_lines, s1, e1, old_content.ends_with('\n'));
2752        let new_seg =
2753            line_slice_for_diff_with_eof_nl(&new_lines, s2, e2, new_content.ends_with('\n'));
2754        let inner_ctx = old_seg.lines().count().max(new_seg.lines().count()).max(1);
2755        let piece = unified_diff_with_prefix_and_funcname_and_algorithm(
2756            &old_seg,
2757            &new_seg,
2758            old_path,
2759            new_path,
2760            inner_ctx,
2761            0,
2762            src_prefix,
2763            dst_prefix,
2764            funcname_matcher,
2765            algorithm,
2766            false,
2767            false,
2768            indent_heuristic,
2769            quote_path_fully,
2770        );
2771        let shifted = shift_unified_hunk_headers_to_full_file(&piece, s1, s2);
2772        let with_func =
2773            enrich_unified_hunk_headers_funcname(&shifted, &old_lines, funcname_matcher);
2774        for line in with_func.lines() {
2775            if line.starts_with("--- ") || line.starts_with("+++ ") {
2776                continue;
2777            }
2778            output.push_str(line);
2779            output.push('\n');
2780        }
2781    }
2782
2783    output
2784}
2785
2786/// `piece` is a unified diff for a slice of the file; hunk headers use 1-based
2787/// coordinates relative to that slice. Shift them by `delta_old` / `delta_new`
2788/// (0-based offsets of the slice in the full file) so the combined patch applies
2789/// to the whole file.
2790fn shift_unified_hunk_headers_to_full_file(
2791    patch: &str,
2792    delta_old: usize,
2793    delta_new: usize,
2794) -> String {
2795    if delta_old == 0 && delta_new == 0 {
2796        return patch.to_owned();
2797    }
2798    let mut out = String::with_capacity(patch.len());
2799    for line in patch.lines() {
2800        if let Some(shifted) = shift_one_unified_hunk_header(line, delta_old, delta_new) {
2801            out.push_str(&shifted);
2802        } else {
2803            out.push_str(line);
2804        }
2805        out.push('\n');
2806    }
2807    out
2808}
2809
2810fn shift_one_unified_hunk_header(line: &str, delta_old: usize, delta_new: usize) -> Option<String> {
2811    let rest = line.strip_prefix("@@ ")?;
2812    let (old_chunk, after_plus) = rest.split_once(" +")?;
2813    let old_spec = old_chunk.strip_prefix('-')?;
2814    let (new_spec, suffix) = after_plus.split_once(" @@")?;
2815    let shifted_old = shift_unified_range_spec(old_spec, delta_old)?;
2816    let shifted_new = shift_unified_range_spec(new_spec, delta_new)?;
2817    Some(format!("@@ -{shifted_old} +{shifted_new} @@{suffix}"))
2818}
2819
2820fn shift_unified_range_spec(spec: &str, delta: usize) -> Option<String> {
2821    let spec = spec.trim();
2822    if let Some((start_s, count_s)) = spec.split_once(',') {
2823        let start: usize = start_s.parse().ok()?;
2824        let count: usize = count_s.parse().ok()?;
2825        Some(format!("{},{}", start.saturating_add(delta), count))
2826    } else {
2827        let start: usize = spec.parse().ok()?;
2828        Some(format!("{}", start.saturating_add(delta)))
2829    }
2830}
2831
2832/// Re-attach `@@ ... @@ <funcname>` using full-file line indices (inner diffs use slices).
2833fn enrich_unified_hunk_headers_funcname(
2834    patch: &str,
2835    full_old_lines: &[&str],
2836    funcname_matcher: Option<&FuncnameMatcher>,
2837) -> String {
2838    let mut out = String::with_capacity(patch.len());
2839    for line in patch.lines() {
2840        if let Some(fixed) = enrich_one_hunk_header_funcname(line, full_old_lines, funcname_matcher)
2841        {
2842            out.push_str(&fixed);
2843        } else {
2844            out.push_str(line);
2845        }
2846        out.push('\n');
2847    }
2848    out
2849}
2850
2851fn enrich_one_hunk_header_funcname(
2852    line: &str,
2853    full_old_lines: &[&str],
2854    funcname_matcher: Option<&FuncnameMatcher>,
2855) -> Option<String> {
2856    let after_at = line.strip_prefix("@@ ")?;
2857    let idx = after_at.find(" @@")?;
2858    let mid = after_at[..idx].trim();
2859    let tail = after_at[idx + 3..].trim_start();
2860    let header_for_parse = format!("@@ {mid} @@");
2861    let func = extract_function_context(&header_for_parse, full_old_lines, funcname_matcher);
2862    Some(if let Some(f) = func {
2863        format!("@@ {mid} @@ {f}")
2864    } else if !tail.is_empty() {
2865        format!("@@ {mid} @@ {tail}")
2866    } else {
2867        format!("@@ {mid} @@")
2868    })
2869}
2870
2871fn line_slice_for_diff_with_eof_nl(
2872    lines: &[&str],
2873    start: usize,
2874    end: usize,
2875    full_file_ends_with_newline: bool,
2876) -> String {
2877    if start >= end {
2878        return String::new();
2879    }
2880    let mut s = lines[start..end].join("\n");
2881    let slice_is_suffix_of_file = end == lines.len();
2882    let need_trailing_nl = if slice_is_suffix_of_file {
2883        full_file_ends_with_newline
2884    } else {
2885        true
2886    };
2887    if need_trailing_nl && !s.ends_with('\n') {
2888        s.push('\n');
2889    }
2890    s
2891}
2892
2893/// Map a 0-based old line index to the corresponding 0-based new line index using the full-file
2894/// diff ops (Git aligns context across deletions/insertions).
2895fn map_old_line_to_new(ops: &[similar::DiffOp], old_line: usize, n_new: usize) -> usize {
2896    use similar::DiffOp;
2897    let mut n = 0usize;
2898    for op in ops {
2899        match *op {
2900            DiffOp::Equal {
2901                old_index,
2902                new_index,
2903                len,
2904            } => {
2905                if old_index + len <= old_line {
2906                    n = new_index + len;
2907                    continue;
2908                }
2909                if old_index < old_line {
2910                    let take = old_line - old_index;
2911                    return (new_index + take).min(n_new);
2912                }
2913                return new_index.min(n_new);
2914            }
2915            DiffOp::Delete {
2916                old_index,
2917                old_len,
2918                new_index,
2919            } => {
2920                if old_index + old_len <= old_line {
2921                    n = new_index;
2922                    continue;
2923                }
2924                if old_index < old_line {
2925                    return new_index.min(n_new);
2926                }
2927            }
2928            DiffOp::Insert {
2929                old_index,
2930                new_index,
2931                new_len,
2932            } => {
2933                if old_index < old_line {
2934                    n = new_index + new_len;
2935                    continue;
2936                }
2937                if old_index == old_line {
2938                    // `old_line` is an exclusive end or insertion point aligned with this insert
2939                    // (e.g. EOF append maps to after the inserted block).
2940                    return (new_index + new_len).min(n_new);
2941                }
2942                return new_index.min(n_new);
2943            }
2944            DiffOp::Replace {
2945                old_index,
2946                old_len,
2947                new_index,
2948                new_len,
2949            } => {
2950                if old_index + old_len <= old_line {
2951                    n = new_index + new_len;
2952                    continue;
2953                }
2954                if old_index < old_line {
2955                    let into_old = old_line - old_index;
2956                    let mapped = new_index + into_old.min(new_len);
2957                    return mapped.min(n_new);
2958                }
2959                return new_index.min(n_new);
2960            }
2961        }
2962    }
2963    n.min(n_new)
2964}
2965
2966/// Parse `@@ -old +new @@` into 0-based half-open ranges in each file.
2967fn parse_unified_hunk_header_ranges(header: &str) -> Option<(usize, usize, usize, usize)> {
2968    let rest = header.strip_prefix("@@ ")?;
2969    let (old_tok, rest2) = rest.split_once(" +")?;
2970    let old_tok = old_tok.strip_prefix('-')?;
2971    let new_tok = rest2.split_once(" @@").map(|(a, _)| a)?;
2972
2973    fn parse_side(spec: &str) -> Option<(usize, usize)> {
2974        let spec = spec.trim();
2975        let (start_one_based, count) = if let Some((a, b)) = spec.split_once(',') {
2976            (a.parse::<usize>().ok()?, b.parse::<usize>().ok()?)
2977        } else {
2978            let s = spec.parse::<usize>().ok()?;
2979            (s, 1usize)
2980        };
2981        let s0 = start_one_based.saturating_sub(1);
2982        let e0 = s0.saturating_add(count);
2983        Some((s0, e0))
2984    }
2985
2986    let (os, oe) = parse_side(old_tok)?;
2987    let (ns, ne) = parse_side(new_tok)?;
2988    Some((os, oe, ns, ne))
2989}
2990
2991/// Git `xemit.c`: when a hunk only inserts at EOF (first inserted line is `new_index == n_old`)
2992/// and the added text already contains a funcname line, do not pull extra context from the preimage.
2993fn append_with_whole_function_added(
2994    ops: &[similar::DiffOp],
2995    n_old: usize,
2996    n_new: usize,
2997    new_lines: &[&str],
2998    matcher: Option<&FuncnameMatcher>,
2999) -> bool {
3000    use similar::DiffOp;
3001    if n_old == 0 {
3002        return false;
3003    }
3004    let mut only_ins_or_eq = true;
3005    let mut min_new_ins = usize::MAX;
3006    for op in ops {
3007        match *op {
3008            DiffOp::Equal { .. } => {}
3009            DiffOp::Insert {
3010                new_index, new_len, ..
3011            } => {
3012                min_new_ins = min_new_ins.min(new_index);
3013                if new_len == 0 {
3014                    only_ins_or_eq = false;
3015                }
3016            }
3017            DiffOp::Delete { .. } | DiffOp::Replace { .. } => {
3018                only_ins_or_eq = false;
3019            }
3020        }
3021    }
3022    let mut insert_at_eof = false;
3023    for op in ops {
3024        if let DiffOp::Insert { old_index, .. } = *op {
3025            if old_index == n_old {
3026                insert_at_eof = true;
3027                break;
3028            }
3029        }
3030    }
3031    let append_at_eof = min_new_ins == n_old || insert_at_eof;
3032    if !only_ins_or_eq || !append_at_eof || min_new_ins == usize::MAX {
3033        return false;
3034    }
3035    // Git only skips preimage pull when the inserted block is clearly a new logical
3036    // function (see `xemit.c` walking `xdf2` for `is_func_rec`). A loose "any line
3037    // looks like a function" check would match `return` / `printf` and break `-W`
3038    // hunks that still need preimage context (t4051 `extended`).
3039    let mut j = min_new_ins;
3040    while j < n_new {
3041        let line = new_lines[j];
3042        if line.trim().is_empty() {
3043            j += 1;
3044            continue;
3045        }
3046        if let Some(m) = matcher {
3047            if m.match_line(line).is_some() {
3048                return true;
3049            }
3050        } else if inserted_block_starts_with_c_like_function_definition(line) {
3051            return true;
3052        }
3053        j += 1;
3054    }
3055    false
3056}
3057
3058fn inserted_block_starts_with_c_like_function_definition(line: &str) -> bool {
3059    let t = line.trim_start();
3060    let Some(open_paren) = t.find('(') else {
3061        return false;
3062    };
3063    let head = &t[..open_paren];
3064    let tokens: Vec<&str> = head.split_whitespace().collect();
3065    if tokens.len() < 2 {
3066        // `printf(...)`, `return (`, etc. — not `return_type name(`.
3067        return false;
3068    }
3069    let nameish = tokens.last().copied().unwrap_or("");
3070    let name = nameish.trim_end_matches(['*', '&']);
3071    if name.is_empty() || !name.chars().all(|c| c.is_ascii_alphanumeric() || c == '_') {
3072        return false;
3073    }
3074    let type_or_modifier = |tok: &str| {
3075        matches!(
3076            tok,
3077            "static"
3078                | "extern"
3079                | "inline"
3080                | "void"
3081                | "int"
3082                | "char"
3083                | "short"
3084                | "long"
3085                | "float"
3086                | "double"
3087                | "unsigned"
3088                | "signed"
3089                | "struct"
3090                | "enum"
3091                | "union"
3092                | "const"
3093                | "volatile"
3094                | "typedef"
3095        )
3096    };
3097    tokens[..tokens.len() - 1]
3098        .iter()
3099        .any(|tok| type_or_modifier(tok))
3100}
3101
3102fn hunk_old_change_end_exclusive(ops: &[similar::DiffOp]) -> usize {
3103    use similar::DiffOp;
3104    let mut max_o = 0usize;
3105    for op in ops {
3106        match *op {
3107            DiffOp::Delete {
3108                old_index, old_len, ..
3109            } => {
3110                max_o = max_o.max(old_index + old_len);
3111            }
3112            DiffOp::Replace {
3113                old_index, old_len, ..
3114            } => {
3115                max_o = max_o.max(old_index + old_len);
3116            }
3117            DiffOp::Insert { old_index, .. } => {
3118                // Pure insertions do not consume old lines; Git's post-context anchor is the
3119                // insertion point (`old_index`), not 0 (t4051 `extended`).
3120                max_o = max_o.max(old_index);
3121            }
3122            DiffOp::Equal { .. } => {}
3123        }
3124    }
3125    max_o
3126}
3127
3128fn func_context_old_anchor(ops: &[similar::DiffOp], n_old: usize) -> usize {
3129    use similar::DiffOp;
3130    let mut has_delete_or_replace = false;
3131    let mut min_del = usize::MAX;
3132    let mut min_ins_old = usize::MAX;
3133
3134    for op in ops {
3135        match *op {
3136            DiffOp::Delete {
3137                old_index, old_len, ..
3138            } => {
3139                has_delete_or_replace = true;
3140                min_del = min_del.min(old_index);
3141                min_del = min_del.min(old_index + old_len.saturating_sub(1));
3142            }
3143            DiffOp::Replace {
3144                old_index, old_len, ..
3145            } => {
3146                has_delete_or_replace = true;
3147                min_del = min_del.min(old_index);
3148                min_del = min_del.min(old_index + old_len.saturating_sub(1));
3149            }
3150            DiffOp::Insert { old_index, .. } => {
3151                min_ins_old = min_ins_old.min(old_index);
3152            }
3153            DiffOp::Equal { .. } => {}
3154        }
3155    }
3156
3157    let mut i1 = if has_delete_or_replace {
3158        min_del
3159    } else if min_ins_old != usize::MAX {
3160        min_ins_old
3161    } else {
3162        0
3163    };
3164
3165    let pure_insert = ops
3166        .iter()
3167        .all(|op| matches!(op, DiffOp::Insert { .. } | DiffOp::Equal { .. }))
3168        && ops.iter().any(|op| matches!(op, DiffOp::Insert { .. }));
3169
3170    if pure_insert && i1 >= n_old && n_old > 0 {
3171        i1 = n_old - 1;
3172    }
3173
3174    i1.min(n_old.saturating_sub(1))
3175}
3176
3177fn expand_func_pre_start(
3178    s1: usize,
3179    i1: usize,
3180    n_old: usize,
3181    old_lines: &[&str],
3182    matcher: Option<&FuncnameMatcher>,
3183) -> usize {
3184    if n_old == 0 {
3185        return s1;
3186    }
3187    let i1 = i1.min(n_old.saturating_sub(1));
3188    let mut fs1 = get_func_line_backward(old_lines, i1, matcher).unwrap_or(i1);
3189    while fs1 > 0
3190        && !is_line_empty_for_func_context(old_lines[fs1 - 1])
3191        && !is_func_line(old_lines[fs1 - 1], matcher)
3192    {
3193        fs1 -= 1;
3194    }
3195    s1.min(fs1)
3196}
3197
3198fn expand_func_post_end(
3199    e1: usize,
3200    i1_end: usize,
3201    n_old: usize,
3202    old_lines: &[&str],
3203    matcher: Option<&FuncnameMatcher>,
3204) -> usize {
3205    let from = i1_end.min(n_old);
3206    let fe1 = get_func_line_forward(old_lines, from, matcher).unwrap_or(n_old);
3207    let mut fe1_adj = fe1;
3208    while fe1_adj > 0 && is_line_empty_for_func_context(old_lines[fe1_adj - 1]) {
3209        fe1_adj -= 1;
3210    }
3211    e1.max(fe1_adj).min(n_old)
3212}
3213
3214fn is_line_empty_for_func_context(line: &str) -> bool {
3215    line.chars().all(|c| c.is_whitespace())
3216}
3217
3218fn is_func_line(line: &str, matcher: Option<&FuncnameMatcher>) -> bool {
3219    if let Some(m) = matcher {
3220        return m.match_line(line).is_some();
3221    }
3222    let t = line.trim_end_matches(['\n', '\r']);
3223    if t.is_empty() {
3224        return false;
3225    }
3226    let b = t.as_bytes()[0];
3227    b.is_ascii_alphabetic() || b == b'_' || b == b'$'
3228}
3229
3230fn get_func_line_backward(
3231    old_lines: &[&str],
3232    start: usize,
3233    matcher: Option<&FuncnameMatcher>,
3234) -> Option<usize> {
3235    let mut l = start.min(old_lines.len().saturating_sub(1));
3236    if old_lines.is_empty() {
3237        return None;
3238    }
3239    loop {
3240        if is_func_line(old_lines[l], matcher) {
3241            return Some(l);
3242        }
3243        if l == 0 {
3244            break;
3245        }
3246        l -= 1;
3247    }
3248    None
3249}
3250
3251fn get_func_line_forward(
3252    old_lines: &[&str],
3253    start: usize,
3254    matcher: Option<&FuncnameMatcher>,
3255) -> Option<usize> {
3256    let mut l = start;
3257    while l < old_lines.len() {
3258        if is_func_line(old_lines[l], matcher) {
3259            return Some(l);
3260        }
3261        l += 1;
3262    }
3263    None
3264}
3265
3266/// Compute a unified diff with anchored lines.
3267///
3268/// Anchored lines that appear exactly once in both old and new content are
3269/// forced to match, splitting the diff into segments around those anchor points.
3270/// This produces diffs where the anchored text stays as context and surrounding
3271/// lines are shown as additions/removals.
3272///
3273/// Segment diffs use `algorithm`. When `use_git_histogram` is true, histogram uses imara-diff
3274/// (Git-compatible); otherwise `algorithm` is passed to `similar`.
3275pub fn anchored_unified_diff(
3276    old_content: &str,
3277    new_content: &str,
3278    old_path: &str,
3279    new_path: &str,
3280    context_lines: usize,
3281    anchors: &[String],
3282    algorithm: similar::Algorithm,
3283    use_git_histogram: bool,
3284    indent_heuristic: bool,
3285    quote_path_fully: bool,
3286) -> String {
3287    use crate::quote_path::format_diff_path_with_prefix;
3288    use similar::TextDiff;
3289
3290    let old_lines: Vec<&str> = old_content.lines().collect();
3291    let new_lines: Vec<&str> = new_content.lines().collect();
3292
3293    // Find anchored lines that appear exactly once in both old and new
3294    let mut anchor_pairs: Vec<(usize, usize)> = Vec::new(); // (old_idx, new_idx)
3295
3296    for anchor in anchors {
3297        let anchor_str = anchor.as_str();
3298
3299        // Count occurrences in old
3300        let old_positions: Vec<usize> = old_lines
3301            .iter()
3302            .enumerate()
3303            .filter(|(_, l)| l.trim_end() == anchor_str)
3304            .map(|(i, _)| i)
3305            .collect();
3306
3307        // Count occurrences in new
3308        let new_positions: Vec<usize> = new_lines
3309            .iter()
3310            .enumerate()
3311            .filter(|(_, l)| l.trim_end() == anchor_str)
3312            .map(|(i, _)| i)
3313            .collect();
3314
3315        // Only anchor if unique in both
3316        if old_positions.len() == 1 && new_positions.len() == 1 {
3317            anchor_pairs.push((old_positions[0], new_positions[0]));
3318        }
3319    }
3320
3321    // If no valid anchors, fall back to normal diff
3322    if anchor_pairs.is_empty() {
3323        return unified_diff_with_prefix_and_funcname_and_algorithm(
3324            old_content,
3325            new_content,
3326            old_path,
3327            new_path,
3328            context_lines,
3329            0,
3330            "a/",
3331            "b/",
3332            None,
3333            algorithm,
3334            false,
3335            use_git_histogram,
3336            indent_heuristic,
3337            quote_path_fully,
3338        );
3339    }
3340
3341    // Sort anchor pairs by their position in the old file
3342    anchor_pairs.sort_by_key(|&(old_idx, _)| old_idx);
3343
3344    // Filter to only keep pairs where new positions are also increasing
3345    // (longest increasing subsequence of new positions)
3346    let mut filtered: Vec<(usize, usize)> = Vec::new();
3347    for &pair in &anchor_pairs {
3348        if filtered.is_empty() || filtered.last().is_some_and(|last| pair.1 > last.1) {
3349            filtered.push(pair);
3350        }
3351    }
3352    let anchor_pairs = filtered;
3353
3354    // Build a modified version of old/new where we diff segments between anchors.
3355    // We'll construct the diff by processing segments:
3356    // - Before first anchor
3357    // - Between consecutive anchors
3358    // - After last anchor
3359    // Each anchor line itself is a fixed context match.
3360
3361    // Collect all diff operations
3362    struct LineDiffOp {
3363        tag: char, // ' ', '+', '-'
3364        line: String,
3365    }
3366
3367    let append_segment_diff =
3368        |ops: &mut Vec<LineDiffOp>, old_seg_input: &str, new_seg_input: &str| {
3369            use similar::ChangeTag;
3370            let old_ls: Vec<&str> = old_seg_input.lines().collect();
3371            let new_ls: Vec<&str> = new_seg_input.lines().collect();
3372            if old_ls.is_empty() && new_ls.is_empty() {
3373                return;
3374            }
3375            let seg_diff = TextDiff::configure()
3376                .algorithm(algorithm)
3377                .diff_slices(&old_ls, &new_ls);
3378            let raw = seg_diff.ops().to_vec();
3379            let compacted = diff_indent_heuristic::apply_change_compact_to_ops(
3380                &raw,
3381                &old_ls,
3382                &new_ls,
3383                indent_heuristic,
3384            );
3385            for op in &compacted {
3386                for ch in op.iter_changes(&old_ls, &new_ls) {
3387                    let t = match ch.tag() {
3388                        ChangeTag::Equal => ' ',
3389                        ChangeTag::Delete => '-',
3390                        ChangeTag::Insert => '+',
3391                    };
3392                    ops.push(LineDiffOp {
3393                        tag: t,
3394                        line: ch.value().to_string(),
3395                    });
3396                }
3397            }
3398        };
3399
3400    let mut ops: Vec<LineDiffOp> = Vec::new();
3401    let mut old_pos = 0usize;
3402    let mut new_pos = 0usize;
3403
3404    for &(old_anchor, new_anchor) in &anchor_pairs {
3405        // Diff the segment before this anchor
3406        let old_segment: Vec<&str> = old_lines[old_pos..old_anchor].to_vec();
3407        let new_segment: Vec<&str> = new_lines[new_pos..new_anchor].to_vec();
3408
3409        let old_seg_text = old_segment.join("\n");
3410        let new_seg_text = new_segment.join("\n");
3411
3412        if !old_seg_text.is_empty() || !new_seg_text.is_empty() {
3413            let old_seg_input = if old_seg_text.is_empty() {
3414                String::new()
3415            } else {
3416                format!("{}\n", old_seg_text)
3417            };
3418            let new_seg_input = if new_seg_text.is_empty() {
3419                String::new()
3420            } else {
3421                format!("{}\n", new_seg_text)
3422            };
3423            append_segment_diff(&mut ops, &old_seg_input, &new_seg_input);
3424        }
3425
3426        // The anchor line itself is always context
3427        ops.push(LineDiffOp {
3428            tag: ' ',
3429            line: old_lines[old_anchor].to_string(),
3430        });
3431
3432        old_pos = old_anchor + 1;
3433        new_pos = new_anchor + 1;
3434    }
3435
3436    // Diff the remaining segment after the last anchor
3437    let old_segment: Vec<&str> = old_lines[old_pos..].to_vec();
3438    let new_segment: Vec<&str> = new_lines[new_pos..].to_vec();
3439    let old_seg_text = old_segment.join("\n");
3440    let new_seg_text = new_segment.join("\n");
3441
3442    if !old_seg_text.is_empty() || !new_seg_text.is_empty() {
3443        let old_seg_input = if old_seg_text.is_empty() {
3444            String::new()
3445        } else {
3446            format!("{}\n", old_seg_text)
3447        };
3448        let new_seg_input = if new_seg_text.is_empty() {
3449            String::new()
3450        } else {
3451            format!("{}\n", new_seg_text)
3452        };
3453        append_segment_diff(&mut ops, &old_seg_input, &new_seg_input);
3454    }
3455
3456    // Now format as unified diff with hunks
3457    let mut output = String::new();
3458    if old_path == "/dev/null" {
3459        output.push_str("--- /dev/null\n");
3460    } else {
3461        output.push_str("--- ");
3462        output.push_str(&format_diff_path_with_prefix(
3463            "a/",
3464            old_path,
3465            quote_path_fully,
3466        ));
3467        output.push('\n');
3468    }
3469    if new_path == "/dev/null" {
3470        output.push_str("+++ /dev/null\n");
3471    } else {
3472        output.push_str("+++ ");
3473        output.push_str(&format_diff_path_with_prefix(
3474            "b/",
3475            new_path,
3476            quote_path_fully,
3477        ));
3478        output.push('\n');
3479    }
3480
3481    // Group ops into hunks with context
3482    let total_ops = ops.len();
3483    if total_ops == 0 {
3484        return output;
3485    }
3486
3487    // Find ranges of changes
3488    let mut hunks: Vec<(usize, usize)> = Vec::new(); // (start, end) indices into ops
3489    let mut i = 0;
3490    while i < total_ops {
3491        if ops[i].tag != ' ' {
3492            let start = i.saturating_sub(context_lines);
3493            let mut end = i;
3494            // Extend to include consecutive changes and their context
3495            while end < total_ops {
3496                if ops[end].tag != ' ' {
3497                    end += 1;
3498                    continue;
3499                }
3500                // Check if there's another change within context_lines
3501                let mut next_change = end;
3502                while next_change < total_ops && ops[next_change].tag == ' ' {
3503                    next_change += 1;
3504                }
3505                if next_change < total_ops && next_change - end <= context_lines * 2 {
3506                    end = next_change + 1;
3507                } else {
3508                    end = (end + context_lines).min(total_ops);
3509                    break;
3510                }
3511            }
3512            // Merge with previous hunk if overlapping
3513            if let Some(last) = hunks.last_mut() {
3514                if start <= last.1 {
3515                    last.1 = end;
3516                } else {
3517                    hunks.push((start, end));
3518                }
3519            } else {
3520                hunks.push((start, end));
3521            }
3522            i = end;
3523        } else {
3524            i += 1;
3525        }
3526    }
3527
3528    // Output each hunk
3529    for (start, end) in hunks {
3530        // Count old/new lines in this hunk
3531        let mut old_start = 1usize;
3532        let mut new_start = 1usize;
3533        // Calculate line numbers by counting ops before this hunk
3534        for op in &ops[..start] {
3535            match op.tag {
3536                ' ' => {
3537                    old_start += 1;
3538                    new_start += 1;
3539                }
3540                '-' => {
3541                    old_start += 1;
3542                }
3543                '+' => {
3544                    new_start += 1;
3545                }
3546                _ => {}
3547            }
3548        }
3549        let mut old_count = 0usize;
3550        let mut new_count = 0usize;
3551        for op in &ops[start..end] {
3552            match op.tag {
3553                ' ' => {
3554                    old_count += 1;
3555                    new_count += 1;
3556                }
3557                '-' => {
3558                    old_count += 1;
3559                }
3560                '+' => {
3561                    new_count += 1;
3562                }
3563                _ => {}
3564            }
3565        }
3566
3567        output.push_str(&format!(
3568            "@@ -{},{} +{},{} @@\n",
3569            old_start, old_count, new_start, new_count
3570        ));
3571        for op in &ops[start..end] {
3572            output.push(op.tag);
3573            output.push_str(&op.line);
3574            output.push('\n');
3575        }
3576    }
3577
3578    output
3579}
3580
3581/// Extract function context for a hunk header.
3582///
3583/// Given a hunk header like `@@ -8,7 +8,7 @@`, find the last line
3584/// before line 8 in the old content that looks like a function header
3585/// (starts with a non-whitespace character, like Git's default).
3586fn extract_function_context(
3587    header: &str,
3588    old_lines: &[&str],
3589    funcname_matcher: Option<&FuncnameMatcher>,
3590) -> Option<String> {
3591    // Parse the old start line number from "@@ -<start>,<count> ..."
3592    let at_pos = header.find("-")?;
3593    let rest = &header[at_pos + 1..];
3594    let comma_or_space = rest.find([',', ' '])?;
3595    let start_str = &rest[..comma_or_space];
3596    let start_line: usize = start_str.parse().ok()?;
3597
3598    if start_line <= 1 {
3599        return None;
3600    }
3601
3602    // Look backwards from the line before the hunk start for a line that
3603    // starts with a non-whitespace character (Git's default funcname pattern).
3604    // start_line is 1-indexed, so the hunk starts at old_lines[start_line-1].
3605    // We want to look at lines before that: old_lines[0..start_line-1].
3606    let search_end = (start_line - 1).min(old_lines.len());
3607    let truncate = |text: &str| {
3608        if text.len() > 80 {
3609            let mut end = 80;
3610            while end > 0 && !text.is_char_boundary(end) {
3611                end -= 1;
3612            }
3613            text[..end].to_owned()
3614        } else {
3615            text.to_owned()
3616        }
3617    };
3618
3619    for i in (0..search_end).rev() {
3620        let line = old_lines[i];
3621        if line.is_empty() {
3622            continue;
3623        }
3624        if let Some(matcher) = funcname_matcher {
3625            if let Some(matched) = matcher.match_line(line) {
3626                return Some(truncate(&matched));
3627            }
3628            continue;
3629        }
3630
3631        let first = line.as_bytes()[0];
3632        if first.is_ascii_alphabetic() || first == b'_' || first == b'$' {
3633            return Some(truncate(line.trim_end_matches(char::is_whitespace)));
3634        }
3635    }
3636    None
3637}
3638
3639/// Generate diff stat output (file name + insertions/deletions).
3640///
3641/// Returns a single line like: ` file.txt | 5 ++---`
3642pub fn format_stat_line(
3643    path: &str,
3644    insertions: usize,
3645    deletions: usize,
3646    max_path_len: usize,
3647) -> String {
3648    format_stat_line_width(path, insertions, deletions, max_path_len, 0)
3649}
3650
3651pub fn format_stat_line_width(
3652    path: &str,
3653    insertions: usize,
3654    deletions: usize,
3655    max_path_len: usize,
3656    count_width: usize,
3657) -> String {
3658    let total = insertions + deletions;
3659    let plus = "+".repeat(insertions.min(50));
3660    let minus = "-".repeat(deletions.min(50));
3661    let cw = if count_width > 0 {
3662        count_width
3663    } else {
3664        format!("{}", total).len()
3665    };
3666    let bar = format!("{}{}", plus, minus);
3667    if bar.is_empty() {
3668        format!(
3669            " {:<width$} | {:>cw$}",
3670            path,
3671            total,
3672            width = max_path_len,
3673            cw = cw
3674        )
3675    } else {
3676        format!(
3677            " {:<width$} | {:>cw$} {}",
3678            path,
3679            total,
3680            bar,
3681            width = max_path_len,
3682            cw = cw
3683        )
3684    }
3685}
3686
3687/// Normalise one line like Git's `-b` / `--ignore-space-change`.
3688#[must_use]
3689pub fn normalize_ignore_space_change_line(line: &str) -> String {
3690    let mut result = String::with_capacity(line.len());
3691    let mut in_space = false;
3692    for c in line.chars() {
3693        if c.is_whitespace() {
3694            if !in_space {
3695                result.push(' ');
3696                in_space = true;
3697            }
3698        } else {
3699            result.push(c);
3700            in_space = false;
3701        }
3702    }
3703    while result.ends_with(' ') {
3704        result.pop();
3705    }
3706    result
3707}
3708
3709/// Normalise text like Git's `-b` / `--ignore-space-change`: on each line, collapse runs of
3710/// whitespace to a single ASCII space and trim trailing spaces.
3711///
3712/// Line breaks are preserved by splitting on [`str::lines`] and rejoining with `\n` (same approach
3713/// as the porcelain `diff` whitespace handling in `grit`).
3714#[must_use]
3715pub fn normalize_ignore_space_change(content: &str) -> String {
3716    content
3717        .lines()
3718        .map(normalize_ignore_space_change_line)
3719        .collect::<Vec<_>>()
3720        .join("\n")
3721}
3722
3723/// Count insertions and deletions between two strings.
3724///
3725/// Returns `(insertions, deletions)`.
3726pub fn count_changes(old_content: &str, new_content: &str) -> (usize, usize) {
3727    count_changes_with_algorithm(old_content, new_content, similar::Algorithm::Myers, false)
3728}
3729
3730/// Count insertions and deletions using the given line-diff algorithm.
3731///
3732/// Git's `--stat` / `--numstat` follow the configured diff algorithm; this mirrors that by
3733/// running [`similar::TextDiff`] with an explicit [`similar::Algorithm`].
3734#[must_use]
3735pub fn count_changes_with_algorithm(
3736    old_content: &str,
3737    new_content: &str,
3738    algorithm: similar::Algorithm,
3739    use_git_histogram: bool,
3740) -> (usize, usize) {
3741    if use_git_histogram {
3742        use imara_diff::{Algorithm, Diff, InternedInput};
3743        let input = InternedInput::new(old_content, new_content);
3744        let mut d = Diff::compute(Algorithm::Histogram, &input);
3745        d.postprocess_lines(&input);
3746        return (d.count_additions() as usize, d.count_removals() as usize);
3747    }
3748
3749    use similar::{ChangeTag, TextDiff};
3750
3751    let diff = TextDiff::configure()
3752        .algorithm(algorithm)
3753        .diff_lines(old_content, new_content);
3754    let mut ins = 0;
3755    let mut del = 0;
3756
3757    for change in diff.iter_all_changes() {
3758        match change.tag() {
3759            ChangeTag::Insert => ins += 1,
3760            ChangeTag::Delete => del += 1,
3761            ChangeTag::Equal => {}
3762        }
3763    }
3764
3765    (ins, del)
3766}
3767
3768/// Line count for diffstat/`--numstat`, matching Git's `count_lines()` in `diff.c`.
3769///
3770/// Counts newline-terminated lines; a final line without trailing newline still counts as one line.
3771/// An empty buffer yields `0`.
3772#[must_use]
3773pub fn count_git_lines(data: &[u8]) -> usize {
3774    if data.is_empty() {
3775        return 0;
3776    }
3777    let mut count = 0usize;
3778    let mut nl_just_seen = false;
3779    for &ch in data {
3780        if ch == b'\n' {
3781            count += 1;
3782            nl_just_seen = true;
3783        } else {
3784            nl_just_seen = false;
3785        }
3786    }
3787    if !nl_just_seen {
3788        count += 1;
3789    }
3790    count
3791}
3792
3793/// Internal maximum diff score used by Git rename/break heuristics (`MAX_SCORE` in `diffcore.h`).
3794pub const GIT_DIFF_MAX_SCORE: u64 = 60_000;
3795const DIFF_MAX_SCORE: u64 = GIT_DIFF_MAX_SCORE;
3796const DIFF_MINIMUM_BREAK_SIZE: usize = 400;
3797const DIFF_DEFAULT_BREAK_SCORE: u64 = 30_000;
3798/// Default break threshold (`DEFAULT_BREAK_SCORE` in `diffcore.h`), internal 0–[`GIT_DIFF_MAX_SCORE`] scale.
3799pub const GIT_DIFF_DEFAULT_BREAK_SCORE: u64 = DIFF_DEFAULT_BREAK_SCORE;
3800/// Default merge threshold after a break (`DEFAULT_MERGE_SCORE` in `diffcore.h`): pairs broken for
3801/// rename/copy but not consumed are merged back when deletion-weight is below this (60% by default).
3802pub const GIT_DIFF_DEFAULT_MERGE_SCORE_AFTER_BREAK: u64 = 36_000;
3803const DIFF_HASHBASE: u32 = 107_927;
3804
3805#[derive(Clone, Copy, Default)]
3806struct SpanSlot {
3807    hashval: u32,
3808    cnt: u32,
3809}
3810
3811struct SpanHashTop {
3812    alloc_log2: u8,
3813    free_slots: i32,
3814    data: Vec<SpanSlot>,
3815}
3816
3817impl SpanHashTop {
3818    fn new(initial_log2: u8) -> Self {
3819        let cap = 1usize << initial_log2;
3820        Self {
3821            alloc_log2: initial_log2,
3822            free_slots: initial_free(initial_log2),
3823            data: vec![SpanSlot::default(); cap],
3824        }
3825    }
3826
3827    fn len(&self) -> usize {
3828        1usize << self.alloc_log2
3829    }
3830
3831    fn add_span(&mut self, hashval: u32, cnt: u32) {
3832        loop {
3833            let lim = self.len();
3834            let mut bucket = (hashval as usize) & (lim - 1);
3835            loop {
3836                let h = &mut self.data[bucket];
3837                if h.cnt == 0 {
3838                    h.hashval = hashval;
3839                    h.cnt = cnt;
3840                    self.free_slots -= 1;
3841                    if self.free_slots < 0 {
3842                        self.rehash();
3843                        break;
3844                    }
3845                    return;
3846                }
3847                if h.hashval == hashval {
3848                    h.cnt = h.cnt.saturating_add(cnt);
3849                    return;
3850                }
3851                bucket += 1;
3852                if bucket >= lim {
3853                    bucket = 0;
3854                }
3855            }
3856        }
3857    }
3858
3859    fn rehash(&mut self) {
3860        let old = std::mem::take(&mut self.data);
3861        let old_log = self.alloc_log2;
3862        self.alloc_log2 = old_log.saturating_add(1);
3863        let new_len = 1usize << self.alloc_log2;
3864        self.free_slots = initial_free(self.alloc_log2);
3865        self.data = vec![SpanSlot::default(); new_len];
3866        let old_sz = 1usize << old_log;
3867        for o in old.iter().take(old_sz) {
3868            let o = *o;
3869            if o.cnt == 0 {
3870                continue;
3871            }
3872            self.add_span_after_rehash(o.hashval, o.cnt);
3873        }
3874    }
3875
3876    fn add_span_after_rehash(&mut self, hashval: u32, cnt: u32) {
3877        loop {
3878            let lim = self.len();
3879            let mut bucket = (hashval as usize) & (lim - 1);
3880            loop {
3881                let h = &mut self.data[bucket];
3882                if h.cnt == 0 {
3883                    h.hashval = hashval;
3884                    h.cnt = cnt;
3885                    self.free_slots -= 1;
3886                    if self.free_slots < 0 {
3887                        self.rehash();
3888                        break;
3889                    }
3890                    return;
3891                }
3892                if h.hashval == hashval {
3893                    h.cnt = h.cnt.saturating_add(cnt);
3894                    return;
3895                }
3896                bucket += 1;
3897                if bucket >= lim {
3898                    bucket = 0;
3899                }
3900            }
3901        }
3902    }
3903
3904    fn sort_by_hashval(&mut self) {
3905        let sz = self.len();
3906        self.data[..sz].sort_by(|a, b| {
3907            if a.cnt == 0 {
3908                return std::cmp::Ordering::Greater;
3909            }
3910            if b.cnt == 0 {
3911                return std::cmp::Ordering::Less;
3912            }
3913            a.hashval.cmp(&b.hashval)
3914        });
3915    }
3916}
3917
3918fn initial_free(sz_log2: u8) -> i32 {
3919    let sz = sz_log2 as i32;
3920    ((1i32 << sz_log2) * (sz - 3) / sz).max(0)
3921}
3922
3923fn hash_blob_spans(buf: &[u8], is_text: bool) -> SpanHashTop {
3924    let mut hash = SpanHashTop::new(9);
3925    let mut n = 0u32;
3926    let mut accum1: u32 = 0;
3927    let mut accum2: u32 = 0;
3928    let mut i = 0usize;
3929    while i < buf.len() {
3930        let c = buf[i] as u32;
3931        let old_1 = accum1;
3932        i += 1;
3933
3934        if is_text && c == b'\r' as u32 && i < buf.len() && buf[i] == b'\n' {
3935            continue;
3936        }
3937
3938        accum1 = accum1.wrapping_shl(7) ^ accum2.wrapping_shr(25);
3939        accum2 = accum2.wrapping_shl(7) ^ old_1.wrapping_shr(25);
3940        accum1 = accum1.wrapping_add(c);
3941        n += 1;
3942        if n < 64 && c != b'\n' as u32 {
3943            continue;
3944        }
3945        let hashval = (accum1.wrapping_add(accum2.wrapping_mul(0x61))) % DIFF_HASHBASE;
3946        hash.add_span(hashval, n);
3947        n = 0;
3948        accum1 = 0;
3949        accum2 = 0;
3950    }
3951    if n > 0 {
3952        let hashval = (accum1.wrapping_add(accum2.wrapping_mul(0x61))) % DIFF_HASHBASE;
3953        hash.add_span(hashval, n);
3954    }
3955    hash.sort_by_hashval();
3956    hash
3957}
3958
3959/// Approximate copied vs added material between two blobs (Git `diffcore_count_changes`).
3960///
3961/// Returns `(copied_bytes_from_src, literal_added_bytes_in_dst)` matching Git's
3962/// `diffcore_count_changes` semantics (used for `--dirstat=changes` damage).
3963#[must_use]
3964pub fn diffcore_count_changes(old: &[u8], new: &[u8]) -> (u64, u64) {
3965    let src_is_text = !crate::merge_file::is_binary(old);
3966    let dst_is_text = !crate::merge_file::is_binary(new);
3967    let src_count = hash_blob_spans(old, src_is_text);
3968    let dst_count = hash_blob_spans(new, dst_is_text);
3969    let mut sc: u64 = 0;
3970    let mut la: u64 = 0;
3971    let mut si = 0usize;
3972    let mut di = 0usize;
3973    let src_len = src_count.len();
3974    let dst_len = dst_count.len();
3975    loop {
3976        if si >= src_len || src_count.data[si].cnt == 0 {
3977            break;
3978        }
3979        let s_hash = src_count.data[si].hashval;
3980        let s_cnt = u64::from(src_count.data[si].cnt);
3981        while di < dst_len && dst_count.data[di].cnt != 0 && dst_count.data[di].hashval < s_hash {
3982            la += u64::from(dst_count.data[di].cnt);
3983            di += 1;
3984        }
3985        let mut dst_cnt = 0u64;
3986        if di < dst_len && dst_count.data[di].cnt != 0 && dst_count.data[di].hashval == s_hash {
3987            dst_cnt = u64::from(dst_count.data[di].cnt);
3988            di += 1;
3989        }
3990        if s_cnt < dst_cnt {
3991            la += dst_cnt - s_cnt;
3992            sc += s_cnt;
3993        } else {
3994            sc += dst_cnt;
3995        }
3996        si += 1;
3997    }
3998    while di < dst_len && dst_count.data[di].cnt != 0 {
3999        la += u64::from(dst_count.data[di].cnt);
4000        di += 1;
4001    }
4002    (sc, la)
4003}
4004
4005/// Whether this modified blob pair should use Git's "complete rewrite" diffstat path when
4006/// `--break-rewrites` is in effect (`should_break` in `diffcore-break.c`).
4007#[must_use]
4008pub fn should_break_rewrite_for_stat(old: &[u8], new: &[u8]) -> bool {
4009    should_break_rewrite_inner(old, new, DIFF_DEFAULT_BREAK_SCORE)
4010}
4011
4012/// Whether an in-place blob edit should be split into delete+create for rename/copy (`should_break`
4013/// in `diffcore-break.c`). `break_score` is on the internal 0–[`GIT_DIFF_MAX_SCORE`] scale (default
4014/// [`DIFF_DEFAULT_BREAK_SCORE`]).
4015#[must_use]
4016pub fn should_break_rewrite_pair(old: &[u8], new: &[u8], break_score: u64) -> bool {
4017    should_break_rewrite_inner(old, new, break_score)
4018}
4019
4020/// Parse a single Git `parse_rename_score` token (`50`, `50%`, decimal forms) into internal
4021/// 0–[`GIT_DIFF_MAX_SCORE`] units.
4022pub fn parse_diff_rename_score_token(arg: &str) -> Option<u64> {
4023    let mut num: u64 = 0;
4024    let mut scale: u64 = 1;
4025    let mut dot = false;
4026    let mut saw_digit = false;
4027    for ch in arg.chars() {
4028        if !dot && ch == '.' {
4029            scale = 1;
4030            dot = true;
4031            continue;
4032        }
4033        if ch == '%' {
4034            scale = if dot { scale.saturating_mul(100) } else { 100 };
4035            break;
4036        }
4037        if ch.is_ascii_digit() {
4038            saw_digit = true;
4039            if scale < 100_000 {
4040                scale = scale.saturating_mul(10);
4041                num = num.saturating_mul(10) + u64::from(ch as u8 - b'0');
4042            }
4043        } else {
4044            break;
4045        }
4046    }
4047    if !saw_digit {
4048        return None;
4049    }
4050    Some(if num >= scale {
4051        GIT_DIFF_MAX_SCORE
4052    } else {
4053        GIT_DIFF_MAX_SCORE * num / scale
4054    })
4055}
4056
4057/// Git `merge_score` from `diffcore-break.c` when a pair is considered broken: how much of the
4058/// source blob was removed (0–[`DIFF_MAX_SCORE`] scale). Used for `dissimilarity index` metadata.
4059#[must_use]
4060pub fn rewrite_merge_score(old: &[u8], new: &[u8]) -> Option<u64> {
4061    if old.is_empty() {
4062        return None;
4063    }
4064    let max_size = old.len().max(new.len());
4065    if max_size < DIFF_MINIMUM_BREAK_SIZE {
4066        return None;
4067    }
4068    let (src_copied, _) = diffcore_count_changes(old, new);
4069    let src_copied = src_copied.min(old.len() as u64);
4070    let src_removed = (old.len() as u64).saturating_sub(src_copied);
4071    Some(src_removed * DIFF_MAX_SCORE / old.len() as u64)
4072}
4073
4074/// Percentage shown in `dissimilarity index N%` for a rewrite (`similarity_index` in Git's diff.c).
4075#[must_use]
4076pub fn rewrite_dissimilarity_index_percent(old: &[u8], new: &[u8]) -> Option<u32> {
4077    let score = rewrite_merge_score(old, new)?;
4078    Some((score * 100 / DIFF_MAX_SCORE).min(100) as u32)
4079}
4080
4081fn should_break_rewrite_inner(src: &[u8], dst: &[u8], break_score: u64) -> bool {
4082    if src.is_empty() {
4083        return false;
4084    }
4085    let max_size = src.len().max(dst.len());
4086    if max_size < DIFF_MINIMUM_BREAK_SIZE {
4087        return false;
4088    }
4089    let (src_copied, literal_added) = diffcore_count_changes(src, dst);
4090    let src_copied = src_copied.min(src.len() as u64);
4091    let mut literal_added = literal_added;
4092    let dst_len = dst.len() as u64;
4093    if src_copied < dst_len && literal_added + src_copied > dst_len {
4094        literal_added = dst_len.saturating_sub(src_copied);
4095    }
4096    let src_removed = (src.len() as u64).saturating_sub(src_copied);
4097    let merge_score = src_removed * DIFF_MAX_SCORE / src.len() as u64;
4098    if merge_score > break_score {
4099        return true;
4100    }
4101    let delta_size = src_removed.saturating_add(literal_added);
4102    if delta_size * DIFF_MAX_SCORE / (max_size as u64) < break_score {
4103        return false;
4104    }
4105    let s = src.len() as u64;
4106    if (s * break_score < src_removed * DIFF_MAX_SCORE)
4107        && (literal_added * 20 < src_removed)
4108        && (literal_added * 20 < src_copied)
4109    {
4110        return false;
4111    }
4112    true
4113}
4114
4115// ── Helpers ─────────────────────────────────────────────────────────
4116
4117/// Flatten a tree object recursively into a sorted list of (path, mode, oid).
4118struct FlatEntry {
4119    path: String,
4120    mode: u32,
4121    oid: ObjectId,
4122}
4123
4124fn flatten_tree(odb: &Odb, tree_oid: &ObjectId, prefix: &str) -> Result<Vec<FlatEntry>> {
4125    let entries = read_tree(odb, tree_oid)?;
4126    let mut result = Vec::new();
4127
4128    for entry in entries {
4129        let name_str = String::from_utf8_lossy(&entry.name);
4130        let path = format_path(prefix, &name_str);
4131        if is_tree_mode(entry.mode) {
4132            let nested = flatten_tree(odb, &entry.oid, &path)?;
4133            result.extend(nested);
4134        } else {
4135            result.push(FlatEntry {
4136                path,
4137                mode: entry.mode,
4138                oid: entry.oid,
4139            });
4140        }
4141    }
4142
4143    Ok(result)
4144}
4145
4146/// Paths present in `HEAD`'s tree with mode and blob/commit OID (for status porcelain v2).
4147pub fn head_path_states(
4148    odb: &Odb,
4149    head_tree: Option<&ObjectId>,
4150) -> Result<std::collections::BTreeMap<String, (u32, ObjectId)>> {
4151    let mut m = std::collections::BTreeMap::new();
4152    let Some(t) = head_tree else {
4153        return Ok(m);
4154    };
4155    for fe in flatten_tree(odb, t, "")? {
4156        m.insert(fe.path, (fe.mode, fe.oid));
4157    }
4158    Ok(m)
4159}
4160
4161/// Whether a mode represents a tree (directory).
4162fn is_tree_mode(mode: u32) -> bool {
4163    mode == 0o040000
4164}
4165
4166/// Build a path with an optional prefix.
4167fn format_path(prefix: &str, name: &str) -> String {
4168    if prefix.is_empty() {
4169        name.to_owned()
4170    } else {
4171        format!("{prefix}/{name}")
4172    }
4173}
4174
4175/// Format a numeric mode as a zero-padded octal string.
4176pub fn format_mode(mode: u32) -> String {
4177    format!("{mode:06o}")
4178}
4179
4180/// Read the HEAD commit OID from a submodule checkout directory.
4181///
4182/// Returns `None` if the path is missing, not a submodule checkout, or has no resolvable HEAD.
4183#[must_use]
4184pub fn read_submodule_head_for_checkout(sub_dir: &Path) -> Option<ObjectId> {
4185    read_submodule_head(sub_dir)
4186}
4187
4188/// First line of a commit's message for `git diff --submodule=log` output.
4189///
4190/// Honors `encoding` in the commit object (Latin-1 vs UTF-8) using the same
4191/// rules as Git's submodule summary.
4192#[must_use]
4193pub fn submodule_commit_subject_line(c: &CommitData) -> String {
4194    let enc = c.encoding.as_deref().unwrap_or("UTF-8");
4195    let is_latin1 = enc.eq_ignore_ascii_case("ISO8859-1")
4196        || enc.eq_ignore_ascii_case("ISO-8859-1")
4197        || enc.eq_ignore_ascii_case("LATIN1")
4198        || enc.eq_ignore_ascii_case("ISO-8859-15");
4199    if let Some(raw) = c.raw_message.as_deref() {
4200        let line = raw.split(|b| *b == b'\n').next().unwrap_or(raw);
4201        if is_latin1 {
4202            return line
4203                .iter()
4204                .map(|&b| b as char)
4205                .collect::<String>()
4206                .trim()
4207                .to_owned();
4208        }
4209        return String::from_utf8_lossy(line).trim().to_string();
4210    }
4211    c.message.lines().next().unwrap_or("").trim().to_owned()
4212}
4213
4214/// True when `sub_dir` is an empty directory (or missing), i.e. the placeholder left by
4215/// `git apply --index` before `git submodule update`.
4216fn submodule_worktree_is_unpopulated_placeholder(sub_dir: &Path) -> bool {
4217    match fs::read_dir(sub_dir) {
4218        Ok(mut it) => it.next().is_none(),
4219        Err(e) if e.kind() == std::io::ErrorKind::NotFound => true,
4220        Err(_) => false,
4221    }
4222}
4223
4224fn read_submodule_head(sub_dir: &Path) -> Option<ObjectId> {
4225    read_submodule_head_oid(sub_dir)
4226}
4227
4228/// Resolve the embedded git directory for a submodule work tree (`sub_dir/.git`).
4229#[must_use]
4230pub fn submodule_embedded_git_dir(sub_dir: &Path) -> Option<PathBuf> {
4231    let gitfile = sub_dir.join(".git");
4232    if gitfile.is_file() {
4233        let content = fs::read_to_string(&gitfile).ok()?;
4234        let gitdir = content
4235            .lines()
4236            .find_map(|l| l.strip_prefix("gitdir: "))?
4237            .trim();
4238        Some(if Path::new(gitdir).is_absolute() {
4239            PathBuf::from(gitdir)
4240        } else {
4241            sub_dir.join(gitdir)
4242        })
4243    } else if gitfile.is_dir() {
4244        Some(gitfile)
4245    } else {
4246        None
4247    }
4248}
4249
4250/// Walk upward from `sub_dir` to find the nearest containing Git work tree.
4251fn find_superproject_git(sub_dir: &Path) -> Option<(PathBuf, PathBuf)> {
4252    let mut cur = sub_dir.parent()?;
4253    loop {
4254        let git_path = cur.join(".git");
4255        if git_path.exists() {
4256            let gd = if git_path.is_file() {
4257                let content = fs::read_to_string(&git_path).ok()?;
4258                let line = content
4259                    .lines()
4260                    .find_map(|l| l.strip_prefix("gitdir: "))?
4261                    .trim();
4262                if Path::new(line).is_absolute() {
4263                    PathBuf::from(line)
4264                } else {
4265                    cur.join(line)
4266                }
4267            } else {
4268                git_path
4269            };
4270            return Some((cur.to_path_buf(), gd));
4271        }
4272        cur = cur.parent()?;
4273    }
4274}
4275
4276/// Read the HEAD commit OID from a submodule working tree directory.
4277///
4278/// Handles both embedded `.git` directories and `gitdir:` gitfiles pointing at
4279/// `.git/modules/...` (or other locations). Returns `None` if the path is not
4280/// a checkout or has no resolvable HEAD.
4281pub fn read_submodule_head_oid(sub_dir: &Path) -> Option<ObjectId> {
4282    // Submodule `.git` may be a gitfile pointing at `.git/modules/<name>` in another superproject
4283    // after `cp -R`. Prefer the current superproject's module dir when present.
4284    let mut git_dir = submodule_embedded_git_dir(sub_dir)?;
4285    if let Some((super_wt, super_git_dir)) = find_superproject_git(sub_dir) {
4286        let rel = sub_dir.strip_prefix(&super_wt).ok()?;
4287        let rel_str = rel.to_string_lossy().replace('\\', "/");
4288        let local_mod = super_git_dir
4289            .join("modules")
4290            .join(rel_str.trim_start_matches('/'));
4291        if local_mod.join("HEAD").exists() {
4292            let sg = super_git_dir.canonicalize().unwrap_or(super_git_dir);
4293            let cur = git_dir.canonicalize().unwrap_or_else(|_| git_dir.clone());
4294            if !cur.starts_with(&sg) {
4295                git_dir = local_mod;
4296            }
4297        }
4298    }
4299    let head_content = fs::read_to_string(git_dir.join("HEAD")).ok()?;
4300    let head_trimmed = head_content.trim();
4301    if head_trimmed.starts_with("ref: ") {
4302        // Use the full ref resolver so packed-refs and worktrees match Git. If `HEAD` is a stale
4303        // symref (e.g. still `refs/heads/master` while only `main` exists), fall back like
4304        // `resolve_gitlink_ref` / `git add` on embedded repos (`t6437-submodule-merge`).
4305        match crate::refs::resolve_ref(&git_dir, "HEAD") {
4306            Ok(oid) => Some(oid),
4307            Err(_) => {
4308                let mut found = None;
4309                for branch in ["main", "master"] {
4310                    let p = git_dir.join("refs/heads").join(branch);
4311                    if let Ok(s) = fs::read_to_string(&p) {
4312                        if let Ok(o) = ObjectId::from_hex(s.trim()) {
4313                            found = Some(o);
4314                            break;
4315                        }
4316                    }
4317                }
4318                found
4319            }
4320        }
4321    } else {
4322        ObjectId::from_hex(head_trimmed).ok()
4323    }
4324}
4325
4326/// True when a checked-out submodule at `rel_path` has modified or untracked content relative to
4327/// the gitlink `recorded_oid` stored in the superproject (used for `git diff <tree>` parity).
4328fn submodule_has_dirty_worktree_for_super_diff(
4329    super_worktree: &Path,
4330    rel_path: &str,
4331    recorded_oid: &ObjectId,
4332) -> bool {
4333    let flags = submodule_porcelain_flags(super_worktree, rel_path, *recorded_oid);
4334    flags.modified || flags.untracked
4335}
4336
4337/// Submodule dirty bits aligned with Git's `DIRTY_SUBMODULE_*` / porcelain v2 `S???` token.
4338#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
4339pub struct SubmodulePorcelainFlags {
4340    /// Submodule checkout HEAD differs from the gitlink OID recorded in the parent index.
4341    pub new_commits: bool,
4342    /// The submodule has its own staged or unstaged changes (`DIRTY_SUBMODULE_MODIFIED`).
4343    pub modified: bool,
4344    /// The submodule work tree contains paths not in its index (`DIRTY_SUBMODULE_UNTRACKED`).
4345    pub untracked: bool,
4346}
4347
4348/// Inspect a checked-out submodule at `rel_path` (relative to `super_worktree`) and return
4349/// flags used for `git status --porcelain=v2` submodule tokens.
4350///
4351/// `recorded_oid` is the gitlink OID stored in the **parent** index (stage 0). When the
4352/// submodule is not checked out or cannot be opened, returns [`Default::default()`].
4353pub fn submodule_porcelain_flags(
4354    super_worktree: &Path,
4355    rel_path: &str,
4356    recorded_oid: ObjectId,
4357) -> SubmodulePorcelainFlags {
4358    let sub_dir = super_worktree.join(rel_path);
4359    let Some(sub_git_dir) = submodule_embedded_git_dir(&sub_dir) else {
4360        return SubmodulePorcelainFlags::default();
4361    };
4362    let Some(sub_head) = read_submodule_head_oid(&sub_dir) else {
4363        return SubmodulePorcelainFlags::default();
4364    };
4365
4366    let new_commits = sub_head != recorded_oid;
4367
4368    let index_path = sub_git_dir.join("index");
4369    let sub_index = match crate::index::Index::load(&index_path) {
4370        Ok(ix) => ix,
4371        Err(_) => {
4372            return SubmodulePorcelainFlags {
4373                new_commits,
4374                ..Default::default()
4375            }
4376        }
4377    };
4378
4379    let tracked: std::collections::BTreeSet<String> = sub_index
4380        .entries
4381        .iter()
4382        .filter(|e| e.stage() == 0)
4383        .map(|e| String::from_utf8_lossy(&e.path).into_owned())
4384        .collect();
4385    let untracked = submodule_dir_has_untracked_inner(&sub_dir, &sub_dir, &tracked, &sub_index);
4386
4387    let objects_dir = sub_git_dir.join("objects");
4388    let odb = Odb::new(&objects_dir);
4389
4390    let sub_head_tree = (|| -> Option<ObjectId> {
4391        let h = fs::read_to_string(sub_git_dir.join("HEAD")).ok()?;
4392        let h_str = h.trim();
4393        let commit_oid = if let Some(r) = h_str.strip_prefix("ref: ") {
4394            let oid_hex = fs::read_to_string(sub_git_dir.join(r)).ok()?;
4395            ObjectId::from_hex(oid_hex.trim()).ok()?
4396        } else {
4397            ObjectId::from_hex(h_str).ok()?
4398        };
4399        let obj = odb.read(&commit_oid).ok()?;
4400        let commit = parse_commit(&obj.data).ok()?;
4401        Some(commit.tree)
4402    })();
4403
4404    let staged_dirty = sub_head_tree
4405        .as_ref()
4406        .map(|t| diff_index_to_tree(&odb, &sub_index, Some(t), false).map(|v| !v.is_empty()))
4407        .unwrap_or(Ok(false));
4408    let staged_dirty = staged_dirty.unwrap_or(false);
4409
4410    let unstaged_dirty = diff_index_to_worktree(&odb, &sub_index, &sub_dir, false, true)
4411        .map(|v| !v.is_empty())
4412        .unwrap_or(false);
4413
4414    let mut modified = staged_dirty || unstaged_dirty;
4415
4416    // Nested submodule has its own index: OR `modified` from immediate gitlink children so a
4417    // dirty nested checkout (e.g. staged `file` under `sub1/sub2`) marks the parent gitlink as
4418    // modified in the superproject (t7506). Do **not** OR `untracked` — untracked-only inside a
4419    // nested submodule must stay `S..U` on the parent, not `S.U` / `S.M.`.
4420    for e in &sub_index.entries {
4421        if e.stage() != 0 || e.mode != 0o160000 {
4422            continue;
4423        }
4424        let child = String::from_utf8_lossy(&e.path).into_owned();
4425        let full_rel = if rel_path.is_empty() {
4426            child
4427        } else {
4428            format!("{rel_path}/{child}")
4429        };
4430        let nested = submodule_porcelain_flags(super_worktree, &full_rel, e.oid);
4431        modified |= nested.modified;
4432    }
4433
4434    SubmodulePorcelainFlags {
4435        new_commits,
4436        modified,
4437        untracked,
4438    }
4439}
4440
4441fn submodule_dir_has_untracked_inner(
4442    dir: &Path,
4443    root: &Path,
4444    tracked: &std::collections::BTreeSet<String>,
4445    owning_index: &Index,
4446) -> bool {
4447    let entries = match fs::read_dir(dir) {
4448        Ok(e) => e,
4449        Err(_) => return false,
4450    };
4451    let mut sorted: Vec<_> = entries.filter_map(|e| e.ok()).collect();
4452    sorted.sort_by_key(|e| e.file_name());
4453
4454    for entry in sorted {
4455        let name = entry.file_name().to_string_lossy().to_string();
4456        if name == ".git" {
4457            continue;
4458        }
4459        let path = entry.path();
4460        let rel = path
4461            .strip_prefix(root)
4462            .map(|p| p.to_string_lossy().to_string())
4463            .unwrap_or_else(|_| name.clone());
4464
4465        let is_dir = entry.file_type().map(|ft| ft.is_dir()).unwrap_or(false);
4466        if is_dir {
4467            let is_gitlink = owning_index
4468                .get(rel.as_bytes(), 0)
4469                .is_some_and(|e| e.mode == 0o160000);
4470            if is_gitlink {
4471                let Some(nested_git) = submodule_embedded_git_dir(&path) else {
4472                    continue;
4473                };
4474                let nested_index_path = nested_git.join("index");
4475                let Ok(nested_ix) = crate::index::Index::load(&nested_index_path) else {
4476                    continue;
4477                };
4478                let nested_tracked: std::collections::BTreeSet<String> = nested_ix
4479                    .entries
4480                    .iter()
4481                    .filter(|e| e.stage() == 0)
4482                    .map(|e| String::from_utf8_lossy(&e.path).into_owned())
4483                    .collect();
4484                if submodule_dir_has_untracked_inner(&path, &path, &nested_tracked, &nested_ix) {
4485                    return true;
4486                }
4487            } else if submodule_dir_has_untracked_inner(&path, root, tracked, owning_index) {
4488                return true;
4489            }
4490        } else if !tracked.contains(&rel) {
4491            return true;
4492        }
4493    }
4494    false
4495}
grit_lib/diff.rs

grit_lib/
diff.rs