grit_lib/
crlf.rs

1//! CRLF / EOL conversion and clean/smudge filter support.
2//!
3//! This module handles line-ending conversion when staging files (`git add`)
4//! and checking out files (`git checkout`, `read-tree -u`, `checkout-index`).
5//!
6//! Config knobs:
7//!   - `core.autocrlf` (true / input / false)
8//!   - `core.eol` (lf / crlf / native)
9//!   - `core.safecrlf` (true / warn / false)
10//!
11//! Gitattributes:
12//!   - `text` / `text=auto` / `-text` / `binary`
13//!   - `eol=lf` / `eol=crlf`
14//!   - `filter=<name>` (with `filter.<name>.clean` / `filter.<name>.smudge`)
15//!   - `ident` keyword expansion
16
17use std::path::{Path, PathBuf};
18use std::process::{Command, Stdio};
19
20use encoding_rs::UTF_8;
21
22use crate::config::ConfigSet;
23use crate::filter_process::{apply_process_clean, apply_process_smudge, FilterSmudgeMeta};
24
/// Parsed value of the `core.autocrlf` configuration key.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum AutoCrlf {
    /// A truthy setting (`true`, `yes`, `on` or `1`).
    True,
    /// The literal setting `input`.
    Input,
    /// Any other value, or the key is not configured.
    False,
}
32
/// Parsed value of the `core.eol` configuration key.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum CoreEol {
    /// `core.eol=lf`.
    Lf,
    /// `core.eol=crlf`.
    Crlf,
    /// `core.eol=native`; also used for unrecognized values and when the key is absent.
    Native,
}
40
/// Parsed value of the `core.safecrlf` configuration key.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum SafeCrlf {
    /// A truthy setting (`true`, `yes`, `on` or `1`).
    True,
    /// The literal setting `warn`; also the value used when the key is absent.
    Warn,
    /// Any other configured value.
    False,
}
48
/// State of the `text` gitattribute for a single path.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum TextAttr {
    /// Bare `text`: the path is always handled as text.
    Set,
    /// `text=auto`: text vs. binary is decided from the content.
    Auto,
    /// `-text` (or the `binary` macro): never apply EOL conversion.
    Unset,
    /// The attribute is not given for this path.
    Unspecified,
}
61
/// State of the `eol` gitattribute for a single path.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum EolAttr {
    /// `eol=lf`.
    Lf,
    /// `eol=crlf`.
    Crlf,
    /// The attribute is absent or carries an unrecognized value.
    Unspecified,
}
69
/// State of the legacy `crlf` gitattribute (deprecated in Git, but still honored for
/// EOL conversion).
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum CrlfLegacyAttr {
    /// The attribute is not given for this path (the default).
    #[default]
    Unspecified,
    /// `-crlf`: CRLF conversion is disabled.
    Unset,
    /// `crlf=input`: normalize to LF in the object database; never emit CRLF on checkout.
    Input,
    /// Bare `crlf`: force CRLF on checkout for text files.
    Crlf,
}
82
/// State of the `merge` gitattribute for a single path.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum MergeAttr {
    /// The attribute is not given for this path.
    Unspecified,
    /// `-merge`: the path is merged as binary / non-text.
    Unset,
    /// `merge=<driver>`: the named merge driver is used.
    Driver(String),
}
93
/// Effect of the `diff` gitattribute on diff output for a single path.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum DiffAttr {
    /// No `diff` attribute applies (heuristics / defaults decide).
    Unspecified,
    /// `-diff` / `diff=unset` / `binary`: the path is binary for diff purposes.
    Unset,
    /// `diff=<driver>`: the named diff driver is used (e.g. for textconv).
    Driver(String),
}
104
105/// Per-file attributes relevant to conversion.
106#[derive(Debug, Clone)]
107pub struct FileAttrs {
108    pub text: TextAttr,
109    pub eol: EolAttr,
110    /// Effect of the `diff` gitattribute on diff output.
111    pub diff_attr: DiffAttr,
112    /// `export-ignore` — omit from `git archive`.
113    pub export_ignore: bool,
114    /// `export-subst` — expand `$Format:` placeholders using the archived commit.
115    pub export_subst: bool,
116    pub filter_clean: Option<String>,
117    pub filter_smudge: Option<String>,
118    /// `filter.<name>.process` — long-running filter (takes precedence over clean/smudge commands).
119    pub filter_process: Option<String>,
120    /// Driver name from the active `filter=<name>` gitattribute (for error messages).
121    pub filter_driver_name: Option<String>,
122    /// Whether `filter.<name>.required` is set for this path's filter driver.
123    pub filter_smudge_required: bool,
124    /// Same config key as smudge; clean direction fails when unset if true.
125    pub filter_clean_required: bool,
126    pub ident: bool,
127    pub merge: MergeAttr,
128    pub conflict_marker_size: Option<String>,
129    /// Working tree encoding (e.g. "utf-16") — content is converted to UTF-8 on add.
130    pub working_tree_encoding: Option<String>,
131    /// Legacy `crlf` / `-crlf` / `crlf=input` from `.gitattributes`.
132    pub crlf_legacy: CrlfLegacyAttr,
133    /// `whitespace` attribute value: `None` if unset, `Some("set")` for bare `whitespace`,
134    /// `Some("unset")` for `-whitespace`, or `Some("trailing,...")` for `whitespace=...`.
135    pub whitespace: Option<String>,
136}
137
138impl Default for FileAttrs {
139    fn default() -> Self {
140        FileAttrs {
141            text: TextAttr::Unspecified,
142            eol: EolAttr::Unspecified,
143            diff_attr: DiffAttr::Unspecified,
144            export_ignore: false,
145            export_subst: false,
146            filter_clean: None,
147            filter_smudge: None,
148            filter_process: None,
149            filter_driver_name: None,
150            filter_smudge_required: false,
151            filter_clean_required: false,
152            ident: false,
153            merge: MergeAttr::Unspecified,
154            conflict_marker_size: None,
155            working_tree_encoding: None,
156            crlf_legacy: CrlfLegacyAttr::Unspecified,
157            whitespace: None,
158        }
159    }
160}
161
162/// Global conversion settings derived from config.
163#[derive(Debug, Clone)]
164pub struct ConversionConfig {
165    pub autocrlf: AutoCrlf,
166    pub eol: CoreEol,
167    pub safecrlf: SafeCrlf,
168}
169
170impl ConversionConfig {
171    /// Load conversion settings from a ConfigSet.
172    pub fn from_config(config: &ConfigSet) -> Self {
173        let autocrlf = match config.get("core.autocrlf") {
174            Some(v) => match v.to_lowercase().as_str() {
175                "true" | "yes" | "on" | "1" => AutoCrlf::True,
176                "input" => AutoCrlf::Input,
177                _ => AutoCrlf::False,
178            },
179            None => AutoCrlf::False,
180        };
181
182        let eol = match config.get("core.eol") {
183            Some(v) => match v.to_lowercase().as_str() {
184                "crlf" => CoreEol::Crlf,
185                "lf" => CoreEol::Lf,
186                "native" => CoreEol::Native,
187                _ => CoreEol::Native,
188            },
189            None => CoreEol::Native,
190        };
191
192        let safecrlf = match config.get("core.safecrlf") {
193            Some(v) => match v.to_lowercase().as_str() {
194                "true" | "yes" | "on" | "1" => SafeCrlf::True,
195                "warn" => SafeCrlf::Warn,
196                _ => SafeCrlf::False,
197            },
198            // Git warns on round-trip EOL issues by default when unset.
199            None => SafeCrlf::Warn,
200        };
201
202        ConversionConfig {
203            autocrlf,
204            eol,
205            safecrlf,
206        }
207    }
208}
209
/// One parsed line of a gitattributes file: a path pattern plus the attributes it assigns.
#[derive(Clone, Debug)]
pub struct AttrRule {
    /// Glob text used for matching (trailing directory `/` stripped; see [`AttrRule::must_be_dir`]).
    pattern: String,
    /// When true, the source pattern ended with `/` and matches only directories (Git `PATTERN_FLAG_MUSTBEDIR`).
    must_be_dir: bool,
    /// When true, match only the path's final component (Git `PATTERN_FLAG_NODIR` / no `/` in the pattern body).
    basename_only: bool,
    /// `(name, value)` pairs; `value` is `"set"`, `"unset"` or the text after `=`.
    attrs: Vec<(String, String)>,
}
221
222impl AttrRule {
223    /// Diff driver names assigned by this rule (`diff=<driver>`), excluding `set`/`unset`.
224    pub fn diff_drivers(&self) -> impl Iterator<Item = &str> + '_ {
225        self.attrs.iter().filter_map(|(name, value)| {
226            if name == "diff" && !value.is_empty() && value != "unset" && value != "set" {
227                Some(value.as_str())
228            } else {
229                None
230            }
231        })
232    }
233}
234
235/// Load .gitattributes from the worktree root.
236pub fn load_gitattributes(work_tree: &Path) -> Vec<AttrRule> {
237    let mut rules = Vec::new();
238
239    let root_attrs = work_tree.join(".gitattributes");
240    if let Ok(content) = std::fs::read_to_string(&root_attrs) {
241        parse_gitattributes(&content, &mut rules);
242    }
243
244    let info_attrs = work_tree.join(".git/info/attributes");
245    if let Ok(content) = std::fs::read_to_string(&info_attrs) {
246        parse_gitattributes(&content, &mut rules);
247    }
248
249    rules
250}
251
252/// Parse gitattributes content into attribute rules.
253///
254/// This is useful when attributes are sourced from non-worktree inputs
255/// (for example, tree objects selected by `--attr-source`).
256#[must_use]
257pub fn parse_gitattributes_content(content: &str) -> Vec<AttrRule> {
258    let mut rules = Vec::new();
259    parse_gitattributes(content, &mut rules);
260    rules
261}
262
263/// Load .gitattributes from the index (for use during checkout when
264/// the worktree file may not yet exist).
265pub fn load_gitattributes_from_index(
266    index: &crate::index::Index,
267    odb: &crate::odb::Odb,
268) -> Vec<AttrRule> {
269    let mut rules = Vec::new();
270
271    // Look for .gitattributes in the index (stage 0)
272    if let Some(entry) = index.get(b".gitattributes", 0) {
273        if let Ok(obj) = odb.read(&entry.oid) {
274            if let Ok(content) = String::from_utf8(obj.data) {
275                parse_gitattributes(&content, &mut rules);
276            }
277        }
278    }
279
280    rules
281}
282
283/// Load `.gitattributes` rules that apply to `rel_path`, including root and
284/// nested `dir/.gitattributes` along parent directories (Git-consistent order:
285/// root first, then each ancestor directory; later rules win in [`get_file_attrs`]).
286///
287/// Reads from the working tree when present, otherwise from a stage-0 index entry.
288pub fn load_gitattributes_for_checkout(
289    work_tree: &Path,
290    rel_path: &str,
291    index: &crate::index::Index,
292    odb: &crate::odb::Odb,
293) -> Vec<AttrRule> {
294    let mut rules = load_gitattributes(work_tree);
295
296    // Root `.gitattributes` may exist only in the index while the worktree file
297    // is missing (e.g. t0020 in-tree attributes after `rm -rf .gitattributes`).
298    if !work_tree.join(".gitattributes").exists() {
299        if let Some(entry) = index.get(b".gitattributes", 0) {
300            if let Ok(obj) = odb.read(&entry.oid) {
301                if let Ok(content) = String::from_utf8(obj.data) {
302                    parse_gitattributes(&content, &mut rules);
303                }
304            }
305        }
306    }
307
308    let path = Path::new(rel_path);
309    if let Some(parent) = path.parent() {
310        let mut accum = PathBuf::new();
311        for comp in parent.components() {
312            accum.push(comp);
313            let ga_rel = accum.join(".gitattributes");
314            let wt_ga = work_tree.join(&ga_rel);
315            if let Ok(content) = std::fs::read_to_string(&wt_ga) {
316                parse_gitattributes(&content, &mut rules);
317            } else {
318                let key = path_to_index_bytes(&ga_rel);
319                if let Some(entry) = index.get(&key, 0) {
320                    if let Ok(obj) = odb.read(&entry.oid) {
321                        if let Ok(content) = String::from_utf8(obj.data) {
322                            parse_gitattributes(&content, &mut rules);
323                        }
324                    }
325                }
326            }
327        }
328    }
329
330    rules
331}
332
/// Converts a relative path into the byte key used for index lookups.
///
/// On Unix the path's raw bytes are returned unchanged. On other platforms the
/// path is rendered lossily as UTF-8 with `\` separators replaced by `/`.
fn path_to_index_bytes(path: &Path) -> Vec<u8> {
    #[cfg(unix)]
    let bytes = {
        use std::os::unix::ffi::OsStrExt;
        path.as_os_str().as_bytes().to_vec()
    };
    // `std::os::unix` is unavailable off Unix; fall back to a portable rendering.
    #[cfg(not(unix))]
    let bytes = path.to_string_lossy().replace('\\', "/").into_bytes();
    bytes
}
337
338fn parse_gitattributes(content: &str, rules: &mut Vec<AttrRule>) {
339    for line in content.lines() {
340        let line = line.trim();
341        if line.is_empty() || line.starts_with('#') {
342            continue;
343        }
344
345        let mut parts = line.split_whitespace();
346        let raw_pattern = match parts.next() {
347            Some(p) => p,
348            None => continue,
349        };
350
351        let mut pat = raw_pattern.to_owned();
352        let mut must_be_dir = false;
353        if pat.ends_with('/') && pat.len() > 1 {
354            pat.pop();
355            must_be_dir = true;
356        }
357        let basename_only = !pat.contains('/');
358
359        let mut attrs = Vec::new();
360        for part in parts {
361            if part == "binary" {
362                attrs.push(("text".to_owned(), "unset".to_owned()));
363                attrs.push(("diff".to_owned(), "unset".to_owned()));
364            } else if let Some(rest) = part.strip_prefix('-') {
365                attrs.push((rest.to_owned(), "unset".to_owned()));
366            } else if let Some((key, val)) = part.split_once('=') {
367                attrs.push((key.to_owned(), val.to_owned()));
368            } else {
369                attrs.push((part.to_owned(), "set".to_owned()));
370            }
371        }
372
373        if !attrs.is_empty() {
374            rules.push(AttrRule {
375                pattern: pat,
376                must_be_dir,
377                basename_only,
378                attrs,
379            });
380        }
381    }
382}
383
/// Returns true when `value`, ignoring surrounding whitespace and ASCII case, is one
/// of the truthy spellings `true`, `yes`, `on` or `1`.
fn config_bool_truthy(value: &str) -> bool {
    let candidate = value.trim();
    ["true", "yes", "on", "1"]
        .iter()
        .any(|truthy| candidate.eq_ignore_ascii_case(truthy))
}
390
391/// Get file attributes for a given path from .gitattributes rules and config.
392///
393/// `is_dir` should be true when `rel_path` names a directory (Git passes a trailing `/` for
394/// directory paths in some call sites; we accept either trailing `/` or this flag from tree walks).
395pub fn get_file_attrs(
396    rules: &[AttrRule],
397    rel_path: &str,
398    is_dir: bool,
399    config: &ConfigSet,
400) -> FileAttrs {
401    let mut fa = FileAttrs::default();
402
403    // Walk rules; last match wins for each attribute.
404    for rule in rules {
405        if attr_rule_matches(rule, rel_path, is_dir) {
406            for (name, value) in &rule.attrs {
407                match name.as_str() {
408                    "text" => {
409                        fa.text = match value.as_str() {
410                            "set" => TextAttr::Set,
411                            "unset" => TextAttr::Unset,
412                            "auto" => TextAttr::Auto,
413                            _ => TextAttr::Unspecified,
414                        };
415                    }
416                    "eol" => {
417                        fa.eol = match value.as_str() {
418                            "lf" => EolAttr::Lf,
419                            "crlf" => EolAttr::Crlf,
420                            _ => EolAttr::Unspecified,
421                        };
422                    }
423                    "filter" => {
424                        if value == "unset" {
425                            fa.filter_clean = None;
426                            fa.filter_smudge = None;
427                            fa.filter_process = None;
428                            fa.filter_driver_name = None;
429                            fa.filter_smudge_required = false;
430                            fa.filter_clean_required = false;
431                        } else {
432                            let clean_key = format!("filter.{value}.clean");
433                            let smudge_key = format!("filter.{value}.smudge");
434                            let process_key = format!("filter.{value}.process");
435                            let req_key = format!("filter.{value}.required");
436                            fa.filter_driver_name = Some(value.clone());
437                            fa.filter_process = config.get(&process_key).filter(|s| !s.is_empty());
438                            if fa.filter_process.is_some() {
439                                fa.filter_clean = None;
440                                fa.filter_smudge = None;
441                            } else {
442                                fa.filter_clean = config.get(&clean_key);
443                                fa.filter_smudge = config.get(&smudge_key);
444                            }
445                            let required =
446                                config.get(&req_key).is_some_and(|v| config_bool_truthy(&v));
447                            fa.filter_smudge_required = required;
448                            fa.filter_clean_required = required;
449                        }
450                    }
451                    "diff" => {
452                        if value == "unset" {
453                            fa.diff_attr = DiffAttr::Unset;
454                        } else if !value.is_empty() && value != "set" {
455                            fa.diff_attr = DiffAttr::Driver(value.clone());
456                        }
457                    }
458                    "ident" => {
459                        fa.ident = value == "set";
460                    }
461                    "export-ignore" => {
462                        fa.export_ignore = value != "unset";
463                    }
464                    "export-subst" => {
465                        fa.export_subst = value != "unset";
466                    }
467                    "merge" => {
468                        fa.merge = match value.as_str() {
469                            "unset" => MergeAttr::Unset,
470                            "set" => MergeAttr::Unspecified,
471                            other => MergeAttr::Driver(other.to_string()),
472                        };
473                    }
474                    "conflict-marker-size" => {
475                        if value == "unset" {
476                            fa.conflict_marker_size = None;
477                        } else {
478                            fa.conflict_marker_size = Some(value.clone());
479                        }
480                    }
481                    "working-tree-encoding" => {
482                        if value != "unset" && !value.is_empty() {
483                            fa.working_tree_encoding = Some(value.clone());
484                        }
485                    }
486                    "crlf" => {
487                        fa.crlf_legacy = match value.as_str() {
488                            "unset" => CrlfLegacyAttr::Unset,
489                            "input" => CrlfLegacyAttr::Input,
490                            "set" => CrlfLegacyAttr::Crlf,
491                            _ => CrlfLegacyAttr::Unspecified,
492                        };
493                    }
494                    "whitespace" => {
495                        if value == "unset" {
496                            fa.whitespace = Some("unset".to_owned());
497                        } else if !value.is_empty() {
498                            fa.whitespace = Some(value.clone());
499                        }
500                    }
501                    _ => {}
502                }
503            }
504        }
505    }
506
507    fa
508}
509
510/// Returns whether gitattribute `attr_name` is set (last matching rule wins), for arbitrary
511/// attribute names used by pathspec `:(attr:...)`.
512///
513/// `is_dir` is whether `path` refers to a directory (see [`get_file_attrs`]).
514#[must_use]
515pub fn path_has_gitattribute(
516    rules: &[AttrRule],
517    path: &str,
518    is_dir: bool,
519    attr_name: &str,
520) -> bool {
521    let mut last: Option<&str> = None;
522    for rule in rules {
523        if attr_rule_matches(rule, path, is_dir) {
524            for (name, value) in &rule.attrs {
525                if name == attr_name {
526                    last = Some(value.as_str());
527                }
528            }
529        }
530    }
531    match last {
532        None | Some("unset") => false,
533        Some(_) => true,
534    }
535}
536
537/// Whether `rule` matches `rel_path` given directory vs file context (Git `path_matches`).
538#[must_use]
539pub fn attr_rule_matches(rule: &AttrRule, rel_path: &str, is_dir: bool) -> bool {
540    let path_is_dir = is_dir || rel_path.ends_with('/');
541    if rule.must_be_dir && !path_is_dir {
542        return false;
543    }
544    let path_for_glob = rel_path.trim_end_matches('/');
545    if rule.basename_only {
546        let basename = path_for_glob.rsplit('/').next().unwrap_or(path_for_glob);
547        glob_matches(rule.pattern.as_str(), basename)
548    } else {
549        glob_matches(rule.pattern.as_str(), path_for_glob)
550    }
551}
552
553fn glob_matches(pattern: &str, text: &str) -> bool {
554    glob_match_bytes(pattern.as_bytes(), text.as_bytes())
555}
556
/// Byte-wise glob matcher used for gitattributes patterns.
///
/// Supported syntax:
/// - `*` matches any run of bytes (including `/` and the empty run); consecutive
///   stars behave like a single one,
/// - `?` matches exactly one byte,
/// - `[...]` matches one byte from a set; `a-z` ranges and a leading `!` or `^`
///   negation are understood, and a `]` directly after the opening bracket (or the
///   negation) is a literal member. POSIX classes such as `[:alpha:]` are not
///   supported. A `[` without a closing `]` is a literal `[`,
/// - `\x` matches the byte `x` literally; a trailing lone `\` is a literal backslash,
/// - every other byte matches itself.
///
/// The whole of `text` must be consumed for a match. The implementation is iterative
/// and only ever backtracks to the most recent `*`, so it cannot blow up on patterns
/// with many stars.
fn glob_match_bytes(pat: &[u8], text: &[u8]) -> bool {
    /// Parses the bracket expression opening at `pat[open]` and tests `c` against it.
    /// Returns `(matched, index after the closing bracket)`, or `None` when the
    /// expression is never closed.
    fn match_class(pat: &[u8], open: usize, c: u8) -> Option<(bool, usize)> {
        let mut i = open + 1;
        let negated = matches!(pat.get(i), Some(b'!' | b'^'));
        if negated {
            i += 1;
        }
        let first_member = i;
        let mut hit = false;
        loop {
            let lo = *pat.get(i)?;
            if lo == b']' && i > first_member {
                return Some((hit != negated, i + 1));
            }
            // `lo-hi` is a range unless the `-` is the last member before `]`.
            if let (Some(b'-'), Some(&hi)) = (pat.get(i + 1), pat.get(i + 2)) {
                if hi != b']' {
                    if lo <= c && c <= hi {
                        hit = true;
                    }
                    i += 3;
                    continue;
                }
            }
            if lo == c {
                hit = true;
            }
            i += 1;
        }
    }

    /// Tests the single-byte token starting at `pat[p]` (which is not `*`) against `c`.
    /// Returns `(matched, index of the next token)`.
    fn match_token(pat: &[u8], p: usize, c: u8) -> (bool, usize) {
        match pat[p] {
            b'?' => (true, p + 1),
            b'\\' if p + 1 < pat.len() => (pat[p + 1] == c, p + 2),
            b'[' => match_class(pat, p, c).unwrap_or((c == b'[', p + 1)),
            literal => (literal == c, p + 1),
        }
    }

    let mut p = 0usize;
    let mut t = 0usize;
    // (pattern index just after the last `*`, text index that `*` currently ends at).
    let mut resume: Option<(usize, usize)> = None;

    while t < text.len() {
        if p < pat.len() && pat[p] == b'*' {
            while p < pat.len() && pat[p] == b'*' {
                p += 1;
            }
            if p == pat.len() {
                // A trailing star swallows the rest of the text.
                return true;
            }
            resume = Some((p, t));
            continue;
        }

        if p < pat.len() {
            let (matched, next) = match_token(pat, p, text[t]);
            if matched {
                p = next;
                t += 1;
                continue;
            }
        }

        // Mismatch: let the most recent star absorb one more byte and retry.
        match resume {
            Some((star_p, star_t)) => {
                p = star_p;
                t = star_t + 1;
                resume = Some((star_p, star_t + 1));
            }
            None => return false,
        }
    }

    // Text exhausted: only stars may remain in the pattern.
    pat[p..].iter().all(|&b| b == b'*')
}
580
/// Heuristic binary check: true when a NUL byte occurs within the first 8000 bytes of `data`.
pub fn is_binary(data: &[u8]) -> bool {
    data.iter().take(8000).any(|&byte| byte == 0)
}
586
// Bit flags mirroring Git `convert.c` `CONVERT_STAT_BITS_*`, combined by
// `gather_convert_stats` and rendered by `gather_convert_stats_ascii` (for `ls-files --eol`).
const CONVERT_STAT_BITS_TXT_LF: u32 = 1 << 0;
const CONVERT_STAT_BITS_TXT_CRLF: u32 = 1 << 1;
const CONVERT_STAT_BITS_BIN: u32 = 1 << 2;
591
/// Byte-class counters collected over a blob by `gather_text_stat`.
#[derive(Clone, Default)]
struct TextStat {
    /// Number of NUL bytes.
    nul: u32,
    /// Number of CR bytes not immediately followed by LF.
    lonecr: u32,
    /// Number of LF bytes not consumed as part of a CRLF pair.
    lonelf: u32,
    /// Number of CRLF pairs.
    crlf: u32,
    /// Number of bytes counted as printable.
    printable: u32,
    /// Number of bytes counted as non-printable (NUL bytes are included here too).
    nonprintable: u32,
}
601
602fn gather_text_stat(data: &[u8]) -> TextStat {
603    let mut s = TextStat::default();
604    let mut i = 0usize;
605    while i < data.len() {
606        let c = data[i];
607        if c == b'\r' {
608            if i + 1 < data.len() && data[i + 1] == b'\n' {
609                s.crlf += 1;
610                i += 2;
611            } else {
612                s.lonecr += 1;
613                i += 1;
614            }
615            continue;
616        }
617        if c == b'\n' {
618            s.lonelf += 1;
619            i += 1;
620            continue;
621        }
622        if c == 127 {
623            s.nonprintable += 1;
624        } else if c < 32 {
625            match c {
626                b'\t' | b'\x08' | b'\x1b' | b'\x0c' => s.printable += 1,
627                0 => {
628                    s.nul += 1;
629                    s.nonprintable += 1;
630                }
631                _ => s.nonprintable += 1,
632            }
633        } else {
634            s.printable += 1;
635        }
636        i += 1;
637    }
638    s
639}
640
641fn convert_is_binary(stats: &TextStat) -> bool {
642    stats.lonecr > 0 || stats.nul > 0 || (stats.printable >> 7) < stats.nonprintable
643}
644
645fn git_text_stat(data: &[u8]) -> TextStat {
646    let mut stats = gather_text_stat(data);
647    if !data.is_empty() && data[data.len() - 1] == 0x1a {
648        stats.nonprintable = stats.nonprintable.saturating_sub(1);
649    }
650    stats
651}
652
653/// Git `will_convert_lf_to_crlf` using [`TextStat`] (same rules as [`should_convert_to_crlf`] on bytes).
654fn will_convert_lf_to_crlf_from_stats(
655    stats: &TextStat,
656    conv: &ConversionConfig,
657    attrs: &FileAttrs,
658) -> bool {
659    let has_lone_lf = stats.lonelf > 0;
660    let is_bin = convert_is_binary(stats);
661
662    match attrs.crlf_legacy {
663        CrlfLegacyAttr::Unset | CrlfLegacyAttr::Input => return false,
664        CrlfLegacyAttr::Crlf => {
665            if attrs.text == TextAttr::Unset {
666                return false;
667            }
668            return has_lone_lf;
669        }
670        CrlfLegacyAttr::Unspecified => {}
671    }
672
673    if attrs.text == TextAttr::Unset {
674        return false;
675    }
676
677    if attrs.eol != EolAttr::Unspecified {
678        if attrs.text == TextAttr::Auto && is_bin {
679            return false;
680        }
681        if attrs.eol != EolAttr::Crlf {
682            return false;
683        }
684        if attrs.text == TextAttr::Auto {
685            return auto_crlf_should_smudge_lf_to_crlf_from_stats(stats);
686        }
687        return has_lone_lf;
688    }
689
690    if attrs.text == TextAttr::Set {
691        if !output_eol_is_crlf(conv) {
692            return false;
693        }
694        return has_lone_lf;
695    }
696
697    if attrs.text == TextAttr::Auto {
698        if is_bin || !output_eol_is_crlf(conv) {
699            return false;
700        }
701        return auto_crlf_should_smudge_lf_to_crlf_from_stats(stats);
702    }
703
704    match conv.autocrlf {
705        AutoCrlf::True => {
706            if is_bin {
707                return false;
708            }
709            auto_crlf_should_smudge_lf_to_crlf_from_stats(stats)
710        }
711        AutoCrlf::Input | AutoCrlf::False => false,
712    }
713}
714
715fn auto_crlf_should_smudge_lf_to_crlf_from_stats(stats: &TextStat) -> bool {
716    if stats.lonelf == 0 {
717        return false;
718    }
719    if stats.lonecr > 0 || stats.crlf > 0 {
720        return false;
721    }
722    !convert_is_binary(stats)
723}
724
725fn gather_convert_stats(data: &[u8]) -> u32 {
726    if data.is_empty() {
727        return 0;
728    }
729    let mut stats = gather_text_stat(data);
730    if !data.is_empty() && data[data.len() - 1] == 0x1a {
731        stats.nonprintable = stats.nonprintable.saturating_sub(1);
732    }
733    let mut ret = 0u32;
734    if convert_is_binary(&stats) {
735        ret |= CONVERT_STAT_BITS_BIN;
736    }
737    if stats.crlf > 0 {
738        ret |= CONVERT_STAT_BITS_TXT_CRLF;
739    }
740    if stats.lonelf > 0 {
741        ret |= CONVERT_STAT_BITS_TXT_LF;
742    }
743    ret
744}
745
746/// Git `convert.c` `gather_convert_stats_ascii` — worktree/index blob EOL stats for `ls-files --eol`.
747#[must_use]
748pub fn gather_convert_stats_ascii(data: &[u8]) -> &'static str {
749    let convert_stats = gather_convert_stats(data);
750    if convert_stats & CONVERT_STAT_BITS_BIN != 0 {
751        return "-text";
752    }
753    match convert_stats {
754        CONVERT_STAT_BITS_TXT_LF => "lf",
755        CONVERT_STAT_BITS_TXT_CRLF => "crlf",
756        x if x == (CONVERT_STAT_BITS_TXT_LF | CONVERT_STAT_BITS_TXT_CRLF) => "mixed",
757        _ => "none",
758    }
759}
760
761/// Git `convert.c` `get_convert_attr_ascii` — ASCII summary of EOL-related attributes for
762/// `git ls-files --eol` (matches `attr_action` after attribute merge, before clean/smudge).
763#[must_use]
764pub fn convert_attr_ascii_for_ls_files(
765    rules: &[AttrRule],
766    rel_path: &str,
767    config: &ConfigSet,
768) -> String {
769    let fa = get_file_attrs(rules, rel_path, false, config);
770    // Mirror `git_path_check_crlf` for `text` then legacy `crlf` (Git checks `text` first).
771    let mut action = match fa.text {
772        TextAttr::Set => 1,   // CRLF_TEXT
773        TextAttr::Unset => 2, // CRLF_BINARY
774        TextAttr::Auto => 5,  // CRLF_AUTO
775        TextAttr::Unspecified => 0,
776    };
777    if action == 0 {
778        action = match fa.crlf_legacy {
779            CrlfLegacyAttr::Crlf => 1,
780            CrlfLegacyAttr::Unset => 2,
781            CrlfLegacyAttr::Input => 3, // CRLF_TEXT_INPUT
782            CrlfLegacyAttr::Unspecified => 0,
783        };
784    }
785    if action == 2 {
786        return "-text".to_string();
787    }
788    // Bare `eol=lf` / `eol=crlf` without `text` still implies text mode (`convert_attrs`).
789    if action == 0 {
790        if fa.eol == EolAttr::Unspecified {
791            return String::new();
792        }
793        action = 1; // CRLF_TEXT
794    }
795
796    // Merge `eol=` like `convert_attrs` (only when not already binary).
797    if fa.eol == EolAttr::Lf {
798        if action == 5 {
799            action = 7; // CRLF_AUTO_INPUT
800        } else {
801            action = 3; // CRLF_TEXT_INPUT
802        }
803    } else if fa.eol == EolAttr::Crlf {
804        if action == 5 {
805            action = 6; // CRLF_AUTO_CRLF
806        } else {
807            action = 4; // CRLF_TEXT_CRLF
808        }
809    }
810
811    // `attr_action` snapshot (Git assigns before splitting bare `text` / applying autocrlf).
812    let attr_action = action;
813
814    match attr_action {
815        1 => "text".to_string(),
816        3 => "text eol=lf".to_string(),
817        4 => "text eol=crlf".to_string(),
818        5 => "text=auto".to_string(),
819        6 => "text=auto eol=crlf".to_string(),
820        7 => "text=auto eol=lf".to_string(),
821        _ => String::new(),
822    }
823}
824
/// Reports whether `data` contains at least one CR immediately followed by LF.
pub fn has_crlf(data: &[u8]) -> bool {
    data.iter()
        .zip(data.iter().skip(1))
        .any(|(&first, &second)| first == b'\r' && second == b'\n')
}
829
/// Reports whether `data` contains an LF that is not directly preceded by a CR.
pub fn has_lone_lf(data: &[u8]) -> bool {
    // An LF in first position has no predecessor and is therefore lone.
    let leading_lf = data.first() == Some(&b'\n');
    leading_lf
        || data
            .windows(2)
            .any(|pair| pair[1] == b'\n' && pair[0] != b'\r')
}
839
/// Reports whether `data` contains a CR that is not directly followed by an LF
/// (Git `text_stat.lonecr`).
fn has_lone_cr(data: &[u8]) -> bool {
    // A CR in last position has no successor and is therefore lone.
    let trailing_cr = data.last() == Some(&b'\r');
    trailing_cr
        || data
            .windows(2)
            .any(|pair| pair[0] == b'\r' && pair[1] != b'\n')
}
849
850/// Git `convert.c` `will_convert_lf_to_crlf` for `CRLF_AUTO` / `CRLF_AUTO_INPUT` / `CRLF_AUTO_CRLF`:
851/// if the blob already has CRLF pairs or lone CRs, do not convert lone LFs to CRLF on checkout.
852fn auto_crlf_should_smudge_lf_to_crlf(data: &[u8]) -> bool {
853    if !has_lone_lf(data) {
854        return false;
855    }
856    if has_lone_cr(data) || has_crlf(data) {
857        return false;
858    }
859    if is_binary(data) {
860        return false;
861    }
862    true
863}
864
/// Reports whether `data` has at least one line ending and every LF in it belongs to
/// a CRLF pair (no lone LF).
pub fn is_all_crlf(data: &[u8]) -> bool {
    let mut previous: Option<u8> = None;
    let mut saw_crlf = false;
    for &byte in data {
        if byte == b'\n' {
            if previous != Some(b'\r') {
                // A lone LF disqualifies the content immediately.
                return false;
            }
            saw_crlf = true;
        }
        previous = Some(byte);
    }
    saw_crlf
}
869
/// Reports whether `data` has at least one line ending and every LF in it is a lone
/// LF (no CRLF pair).
pub fn is_all_lf(data: &[u8]) -> bool {
    let mut previous: Option<u8> = None;
    let mut saw_lf = false;
    for &byte in data {
        if byte == b'\n' {
            if previous == Some(b'\r') {
                // A CRLF pair disqualifies the content immediately.
                return false;
            }
            saw_lf = true;
        }
        previous = Some(byte);
    }
    saw_lf
}
874
875/// Git `convert.c` `has_crlf_in_index`: index blob already contains CRLF pairs (non-binary).
876#[must_use]
877pub fn has_crlf_in_index_blob(data: &[u8]) -> bool {
878    if !data.contains(&b'\r') {
879        return false;
880    }
881    let st = gather_convert_stats(data);
882    st & CONVERT_STAT_BITS_BIN == 0 && (st & CONVERT_STAT_BITS_TXT_CRLF) != 0
883}
884
885/// Whether clean conversion uses Git's `has_crlf_in_index` guard (`convert.c` only for
886/// `CRLF_AUTO`, `CRLF_AUTO_INPUT`, `CRLF_AUTO_CRLF`). Bare `eol=` without `text=auto` becomes
887/// `CRLF_TEXT_*` and must not use this guard.
888#[must_use]
889pub fn clean_uses_autocrlf_index_guard(attrs: &FileAttrs, conv: &ConversionConfig) -> bool {
890    if attrs.text == TextAttr::Unset || attrs.crlf_legacy == CrlfLegacyAttr::Unset {
891        return false;
892    }
893    if attrs.eol != EolAttr::Unspecified && attrs.text != TextAttr::Auto {
894        return false;
895    }
896    attrs.text == TextAttr::Auto
897        || (attrs.text == TextAttr::Unspecified
898            && matches!(conv.autocrlf, AutoCrlf::True | AutoCrlf::Input))
899}
900
/// Extra knobs accepted by [`convert_to_git_with_opts`] (Git `CONV_EOL_RENORMALIZE` and the
/// index blob used by the safer-autocrlf check).
#[derive(Debug, Clone, Copy)]
pub struct ConvertToGitOpts<'a> {
    /// Bytes of the stage-0 blob recorded for this path before the current add, if any.
    pub index_blob: Option<&'a [u8]>,
    /// Renormalizing callers (merge / cherry-pick) set this so CRLF→LF is applied whenever it
    /// is configured, bypassing the index-blob guard.
    pub renormalize: bool,
    /// Set to `false` to skip the `core.safecrlf` simulation, e.g. for internal diff/hashing
    /// where warnings on stderr would be noise.
    pub check_safecrlf: bool,
}

impl Default for ConvertToGitOpts<'_> {
    /// No index blob, no renormalization, safecrlf checking enabled.
    fn default() -> Self {
        ConvertToGitOpts {
            index_blob: None,
            renormalize: false,
            check_safecrlf: true,
        }
    }
}
921
922// ---------------------------------------------------------------------------
923// working-tree-encoding (Git `convert.c` `encode_to_git` / `encode_to_worktree`)
924// ---------------------------------------------------------------------------
925
/// Serializes UTF-16 code units as little-endian byte pairs, in iteration order.
fn utf16_scalar_iter_to_le_bytes(chars: impl Iterator<Item = u16>) -> Vec<u8> {
    chars.flat_map(u16::to_le_bytes).collect()
}
933
/// Serializes UTF-16 code units as big-endian byte pairs, in iteration order.
fn utf16_scalar_iter_to_be_bytes(chars: impl Iterator<Item = u16>) -> Vec<u8> {
    chars.flat_map(u16::to_be_bytes).collect()
}
941
/// Serializes every Unicode scalar of `s` as four big-endian bytes (UTF-32BE, no BOM).
fn utf32_chars_to_be_bytes(s: &str) -> Vec<u8> {
    s.chars()
        .flat_map(|scalar| u32::from(scalar).to_be_bytes())
        .collect()
}
949
/// Serializes every Unicode scalar of `s` as four little-endian bytes (UTF-32LE, no BOM).
fn utf32_chars_to_le_bytes(s: &str) -> Vec<u8> {
    s.chars()
        .flat_map(|scalar| u32::from(scalar).to_le_bytes())
        .collect()
}
957
/// Decodes a BOM-less UTF-32 payload into UTF-8 bytes.
///
/// `big_endian` selects the byte order of each 4-byte unit; `rel_path` is only used in error
/// messages.
///
/// # Errors
/// Fails when `body` is not a whole number of 4-byte units, or when a unit is not a valid
/// Unicode scalar value (the first offending unit is reported).
fn decode_utf32_body_to_utf8_bytes(
    body: &[u8],
    rel_path: &str,
    big_endian: bool,
) -> Result<Vec<u8>, String> {
    if !body.len().is_multiple_of(4) {
        return Err(format!(
            "invalid UTF-32 length for working tree file '{rel_path}'"
        ));
    }
    let text = body
        .chunks_exact(4)
        .map(|unit| {
            let quad = [unit[0], unit[1], unit[2], unit[3]];
            let cp = if big_endian {
                u32::from_be_bytes(quad)
            } else {
                u32::from_le_bytes(quad)
            };
            char::from_u32(cp).ok_or_else(|| {
                format!("invalid UTF-32 scalar U+{cp:X} in working tree file '{rel_path}'")
            })
        })
        .collect::<Result<String, String>>()?;
    Ok(text.into_bytes())
}
984
985fn decode_working_tree_bytes_to_utf8(
986    src: &[u8],
987    rel_path: &str,
988    enc_label: &str,
989) -> Result<Vec<u8>, String> {
990    let label = enc_label.trim();
991    if label.is_empty() {
992        return Ok(src.to_vec());
993    }
994    let lower = label.replace('_', "-").to_ascii_lowercase();
995
996    let (cow, _used_enc, had_errors) = match lower.as_str() {
997        "utf-16le-bom" => {
998            let body = if src.len() >= 2 && src.starts_with(&[0xFF, 0xFE]) {
999                &src[2..]
1000            } else {
1001                src
1002            };
1003            encoding_rs::UTF_16LE.decode(body)
1004        }
1005        // Git `UTF-16` requires a BOM; `UTF-16BE` / `UTF-16LE` are raw (BOM prohibited on add).
1006        "utf-16" => {
1007            if src.len() >= 2 && src.starts_with(&[0xFE, 0xFF]) {
1008                encoding_rs::UTF_16BE.decode(&src[2..])
1009            } else if src.len() >= 2 && src.starts_with(&[0xFF, 0xFE]) {
1010                encoding_rs::UTF_16LE.decode(&src[2..])
1011            } else {
1012                return Err(format!(
1013                    "missing byte order mark for UTF-16 working tree file '{rel_path}'"
1014                ));
1015            }
1016        }
1017        "utf-16be" => encoding_rs::UTF_16BE.decode(src),
1018        "utf-16le" => encoding_rs::UTF_16LE.decode(src),
1019        "utf-32" => {
1020            let (body, big_endian) = if src.len() >= 4 && src.starts_with(&[0, 0, 0xFE, 0xFF]) {
1021                (&src[4..], true)
1022            } else if src.len() >= 4 && src.starts_with(&[0xFF, 0xFE, 0, 0]) {
1023                (&src[4..], false)
1024            } else {
1025                return Err(format!(
1026                    "missing byte order mark for UTF-32 working tree file '{rel_path}'"
1027                ));
1028            };
1029            return decode_utf32_body_to_utf8_bytes(body, rel_path, big_endian);
1030        }
1031        "utf-32be" => return decode_utf32_body_to_utf8_bytes(src, rel_path, true),
1032        "utf-32le" => return decode_utf32_body_to_utf8_bytes(src, rel_path, false),
1033        _ => {
1034            let Some(enc) = crate::commit_encoding::resolve(label) else {
1035                return Err(format!(
1036                    "unknown working-tree-encoding '{label}' for '{rel_path}'"
1037                ));
1038            };
1039            if enc == UTF_8 {
1040                return Ok(src.to_vec());
1041            }
1042            enc.decode(src)
1043        }
1044    };
1045
1046    if had_errors {
1047        return Err(format!(
1048            "failed to decode '{rel_path}' from working-tree-encoding {label}"
1049        ));
1050    }
1051    Ok(cow.into_owned().into_bytes())
1052}
1053
1054fn encode_utf8_blob_to_working_tree_bytes(
1055    src: &[u8],
1056    rel_path: &str,
1057    enc_label: &str,
1058) -> Result<Vec<u8>, String> {
1059    let label = enc_label.trim();
1060    if label.is_empty() {
1061        return Ok(src.to_vec());
1062    }
1063    let s = std::str::from_utf8(src).map_err(|_| {
1064        format!("failed to encode '{rel_path}' from UTF-8: blob is not valid UTF-8")
1065    })?;
1066    let lower = label.replace('_', "-").to_ascii_lowercase();
1067
1068    match lower.as_str() {
1069        "utf-16le-bom" => {
1070            let mut out = vec![0xFF_u8, 0xFE_u8];
1071            out.extend(utf16_scalar_iter_to_le_bytes(s.encode_utf16()));
1072            Ok(out)
1073        }
1074        // Bare `UTF-16` in Git is BOM + UTF-16; GNU iconv `-t UTF-16` emits UTF-16LE + LE BOM
1075        // (`FF FE`), which upstream tests expect (t0028 / t2082).
1076        "utf-16" => {
1077            let mut out = vec![0xFF_u8, 0xFE_u8];
1078            out.extend(utf16_scalar_iter_to_le_bytes(s.encode_utf16()));
1079            Ok(out)
1080        }
1081        "utf-16be" => {
1082            let mut out = vec![0xFE_u8, 0xFF_u8];
1083            out.extend(utf16_scalar_iter_to_be_bytes(s.encode_utf16()));
1084            Ok(out)
1085        }
1086        "utf-16le" => Ok(utf16_scalar_iter_to_le_bytes(s.encode_utf16())),
1087        "utf-32" | "utf-32be" => {
1088            let mut out = vec![0_u8, 0_u8, 0xFE_u8, 0xFF_u8];
1089            out.extend(utf32_chars_to_be_bytes(s));
1090            Ok(out)
1091        }
1092        "utf-32le" => {
1093            let mut out = vec![0xFF_u8, 0xFE_u8, 0_u8, 0_u8];
1094            out.extend(utf32_chars_to_le_bytes(s));
1095            Ok(out)
1096        }
1097        _ => {
1098            let Some(enc) = crate::commit_encoding::resolve(label) else {
1099                return Err(format!(
1100                    "unknown working-tree-encoding '{label}' for '{rel_path}'"
1101                ));
1102            };
1103            if enc == UTF_8 {
1104                return Ok(src.to_vec());
1105            }
1106            let (cow, _, had_errors) = enc.encode(s);
1107            if had_errors {
1108                return Err(format!(
1109                    "failed to encode '{rel_path}' from UTF-8 to {label}"
1110                ));
1111            }
1112            Ok(cow.into_owned())
1113        }
1114    }
1115}
1116
1117// ---------------------------------------------------------------------------
1118// Input (add / clean) direction
1119// ---------------------------------------------------------------------------
1120
1121/// Convert data for storage in the index/object database (the "clean" direction).
1122///
1123/// This handles:
1124/// 1. Clean filter execution
1125/// 2. CRLF → LF conversion based on config + attributes
1126/// 3. safecrlf checking
1127///
1128/// Returns `Ok(data)` on success, or an error if safecrlf rejects it.
1129pub fn convert_to_git(
1130    data: &[u8],
1131    rel_path: &str,
1132    conv: &ConversionConfig,
1133    file_attrs: &FileAttrs,
1134) -> Result<Vec<u8>, String> {
1135    convert_to_git_with_opts(
1136        data,
1137        rel_path,
1138        conv,
1139        file_attrs,
1140        ConvertToGitOpts::default(),
1141    )
1142}
1143
1144/// Like [`convert_to_git`] with Git-compatible safer-autocrlf index handling.
1145pub fn convert_to_git_with_opts(
1146    data: &[u8],
1147    rel_path: &str,
1148    conv: &ConversionConfig,
1149    file_attrs: &FileAttrs,
1150    opts: ConvertToGitOpts<'_>,
1151) -> Result<Vec<u8>, String> {
1152    let mut buf = data.to_vec();
1153
1154    // 1. Run clean filter if configured (long-running `process` overrides clean command)
1155    if let Some(ref proc_cmd) = file_attrs.filter_process {
1156        let name = file_attrs.filter_driver_name.as_deref().unwrap_or_default();
1157        buf = apply_process_clean(proc_cmd, rel_path, &buf).map_err(|_e| {
1158            if file_attrs.filter_clean_required {
1159                format!("fatal: {rel_path}: clean filter '{name}' failed")
1160            } else {
1161                format!("clean filter failed: {_e}")
1162            }
1163        })?;
1164    } else {
1165        match file_attrs.filter_clean.as_ref() {
1166            Some(clean_cmd) => {
1167                buf = run_filter(clean_cmd, &buf, rel_path).map_err(|e| {
1168                    let name = file_attrs.filter_driver_name.as_deref().unwrap_or_default();
1169                    if file_attrs.filter_clean_required {
1170                        format!("fatal: {rel_path}: clean filter '{name}' failed")
1171                    } else {
1172                        format!("clean filter failed: {e}")
1173                    }
1174                })?;
1175            }
1176            None => {
1177                if file_attrs.filter_clean_required {
1178                    let name = file_attrs.filter_driver_name.as_deref().unwrap_or_default();
1179                    return Err(format!("fatal: {rel_path}: clean filter '{name}' failed"));
1180                }
1181            }
1182        }
1183    }
1184
1185    // 2. working-tree-encoding: working tree bytes → UTF-8 for the object DB (Git `encode_to_git`).
1186    if let Some(ref enc) = file_attrs.working_tree_encoding {
1187        buf = decode_working_tree_bytes_to_utf8(&buf, rel_path, enc)?;
1188    }
1189
1190    // 3. Determine if we should do CRLF→LF conversion
1191    let would_convert = would_convert_on_input(conv, file_attrs, &buf);
1192
1193    let mut convert_crlf_into_lf = would_convert && has_crlf(&buf);
1194    if convert_crlf_into_lf
1195        && clean_uses_autocrlf_index_guard(file_attrs, conv)
1196        && !opts.renormalize
1197        && opts.index_blob.is_some_and(has_crlf_in_index_blob)
1198    {
1199        convert_crlf_into_lf = false;
1200    }
1201
1202    // 4. safecrlf check — Git simulates clean then smudge (`check_global_conv_flags_eol`).
1203    if would_convert && opts.check_safecrlf {
1204        check_safecrlf_roundtrip(conv, file_attrs, &buf, rel_path, convert_crlf_into_lf)?;
1205    }
1206
1207    // 5. Actually convert CRLF → LF if the file has CRLFs
1208    if convert_crlf_into_lf {
1209        buf = crlf_to_lf(&buf);
1210    }
1211
1212    Ok(buf)
1213}
1214
1215/// Decide whether CRLF/LF conversion is configured for this file on input.
1216/// Returns true if the file *would* be subject to conversion (even if no
1217/// actual bytes need changing).
1218fn would_convert_on_input(conv: &ConversionConfig, attrs: &FileAttrs, data: &[u8]) -> bool {
1219    match attrs.crlf_legacy {
1220        CrlfLegacyAttr::Unset => return false,
1221        CrlfLegacyAttr::Input => {
1222            if is_binary(data) {
1223                return false;
1224            }
1225            return true;
1226        }
1227        CrlfLegacyAttr::Crlf => {
1228            if attrs.text == TextAttr::Unset {
1229                return false;
1230            }
1231            if is_binary(data) {
1232                return false;
1233            }
1234            return true;
1235        }
1236        CrlfLegacyAttr::Unspecified => {}
1237    }
1238
1239    // If text is explicitly unset (-text or binary), never convert
1240    if attrs.text == TextAttr::Unset {
1241        return false;
1242    }
1243
1244    // If eol attr is set, this implies text mode
1245    if attrs.eol != EolAttr::Unspecified {
1246        if attrs.text == TextAttr::Auto && is_binary(data) {
1247            return false;
1248        }
1249        return true;
1250    }
1251
1252    // If text is explicitly set, always convert
1253    if attrs.text == TextAttr::Set {
1254        return true;
1255    }
1256
1257    if attrs.text == TextAttr::Auto {
1258        if is_binary(data) {
1259            return false;
1260        }
1261        return true;
1262    }
1263
1264    // No text attribute: fall back to core.autocrlf
1265    match conv.autocrlf {
1266        AutoCrlf::True | AutoCrlf::Input => {
1267            if is_binary(data) {
1268                return false;
1269            }
1270            true
1271        }
1272        AutoCrlf::False => false,
1273    }
1274}
1275
/// Prints the Git-compatible `core.safecrlf=warn` message for the CRLF→LF case to stderr.
fn eprint_safecrlf_warn_crlf_to_lf(rel_path: &str) {
    let message = format!(
        "warning: in the working copy of '{rel_path}', CRLF will be replaced by LF the next time Git touches it"
    );
    eprintln!("{message}");
}
1282
/// Prints the Git-compatible `core.safecrlf=warn` message for the LF→CRLF case to stderr.
fn eprint_safecrlf_warn_lf_to_crlf(rel_path: &str) {
    let message = format!(
        "warning: in the working copy of '{rel_path}', LF will be replaced by CRLF the next time Git touches it"
    );
    eprintln!("{message}");
}
1289
1290/// Git `convert.c` `check_global_conv_flags_eol` after simulating clean + smudge.
1291fn check_safecrlf_roundtrip(
1292    conv: &ConversionConfig,
1293    file_attrs: &FileAttrs,
1294    data: &[u8],
1295    rel_path: &str,
1296    convert_crlf_into_lf: bool,
1297) -> Result<(), String> {
1298    if conv.safecrlf == SafeCrlf::False {
1299        return Ok(());
1300    }
1301
1302    let old_stats = git_text_stat(data);
1303
1304    let mut new_stats = old_stats.clone();
1305    if convert_crlf_into_lf && new_stats.crlf > 0 {
1306        new_stats.lonelf += new_stats.crlf;
1307        new_stats.crlf = 0;
1308    }
1309    if will_convert_lf_to_crlf_from_stats(&new_stats, conv, file_attrs) {
1310        new_stats.crlf += new_stats.lonelf;
1311        new_stats.lonelf = 0;
1312    }
1313
1314    if old_stats.crlf > 0 && new_stats.crlf == 0 {
1315        let msg = format!("fatal: CRLF would be replaced by LF in {rel_path}");
1316        if conv.safecrlf == SafeCrlf::True {
1317            return Err(msg);
1318        }
1319        eprint_safecrlf_warn_crlf_to_lf(rel_path);
1320    } else if old_stats.lonelf > 0 && new_stats.lonelf == 0 {
1321        let msg = format!("fatal: LF would be replaced by CRLF in {rel_path}");
1322        if conv.safecrlf == SafeCrlf::True {
1323            return Err(msg);
1324        }
1325        eprint_safecrlf_warn_lf_to_crlf(rel_path);
1326    }
1327
1328    Ok(())
1329}
1330
/// Rewrites every CRLF pair as a single LF; lone CRs and lone LFs pass through untouched.
pub fn crlf_to_lf(data: &[u8]) -> Vec<u8> {
    let mut out = Vec::with_capacity(data.len());
    let mut bytes = data.iter().copied().peekable();
    while let Some(byte) = bytes.next() {
        // Drop a CR that is directly followed by LF; the LF is emitted on the next turn.
        if byte == b'\r' && bytes.peek() == Some(&b'\n') {
            continue;
        }
        out.push(byte);
    }
    out
}
1346
/// Rewrites every LF that is not already preceded by a CR as CRLF; existing CRLF pairs and
/// lone CRs pass through untouched.
pub fn lf_to_crlf(data: &[u8]) -> Vec<u8> {
    // Leave some headroom for the inserted CRs.
    let mut out = Vec::with_capacity(data.len() + data.len() / 10);
    let mut previous: Option<u8> = None;
    for &byte in data {
        if byte == b'\n' && previous != Some(b'\r') {
            out.push(b'\r');
        }
        out.push(byte);
        previous = Some(byte);
    }
    out
}
1362
1363// ---------------------------------------------------------------------------
1364// Output (checkout / smudge) direction
1365// ---------------------------------------------------------------------------
1366
1367/// Convert data from the object database for writing to the working tree
1368/// (the "smudge" direction).
1369///
1370/// This handles (Git `convert_to_working_tree_ca_internal` order):
1371/// 1. Ident keyword expansion
1372/// 2. LF → CRLF conversion based on config + attributes
1373/// 3. `working-tree-encoding` (UTF-8 blob → working tree bytes)
1374/// 4. Smudge filter execution
1375///
1376/// Returns `Ok(None)` when the process filter returned `status=delayed` and `delayed_checkout` was
1377/// provided (Git `delayed_checkout`); the path is queued for [`crate::filter_process::DelayedProcessCheckout::finish`].
1378pub fn convert_to_worktree(
1379    data: &[u8],
1380    rel_path: &str,
1381    conv: &ConversionConfig,
1382    file_attrs: &FileAttrs,
1383    oid_hex: Option<&str>,
1384    smudge_meta: Option<&FilterSmudgeMeta>,
1385    delayed_checkout: Option<&mut crate::filter_process::DelayedProcessCheckout>,
1386) -> Result<Option<Vec<u8>>, String> {
1387    let mut buf = data.to_vec();
1388
1389    // 1. Ident expansion
1390    if file_attrs.ident {
1391        if let Some(oid) = oid_hex {
1392            buf = expand_ident(&buf, oid);
1393        }
1394    }
1395
1396    let can_delay_smudge = delayed_checkout.is_some()
1397        && file_attrs.working_tree_encoding.is_none()
1398        && !file_attrs.ident
1399        && file_attrs
1400            .filter_process
1401            .as_deref()
1402            .is_some_and(|c| !c.is_empty())
1403        && !should_convert_to_crlf(conv, file_attrs, &buf)
1404        && file_attrs
1405            .filter_process
1406            .as_deref()
1407            .is_some_and(crate::filter_process::process_filter_supports_delay);
1408
1409    // 2. LF→CRLF for working tree
1410    let should_convert = should_convert_to_crlf(conv, file_attrs, &buf);
1411    if should_convert {
1412        buf = lf_to_crlf(&buf);
1413    }
1414
1415    // 3. working-tree-encoding (Git `encode_to_worktree`)
1416    if let Some(ref enc) = file_attrs.working_tree_encoding {
1417        buf = encode_utf8_blob_to_working_tree_bytes(&buf, rel_path, enc)?;
1418    }
1419
1420    // 4. Smudge filter — process driver overrides shell smudge
1421    let driver = file_attrs.filter_driver_name.as_deref().unwrap_or("");
1422    if let Some(ref proc_cmd) = file_attrs.filter_process {
1423        let smudge_out =
1424            apply_process_smudge(proc_cmd, rel_path, &buf, smudge_meta, can_delay_smudge).map_err(
1425                |_e| {
1426                    if file_attrs.filter_smudge_required {
1427                        format!("fatal: {rel_path}: smudge filter {driver} failed")
1428                    } else {
1429                        _e
1430                    }
1431                },
1432            )?;
1433        let Some(out) = smudge_out else {
1434            let Some(q) = delayed_checkout else {
1435                return Err(format!(
1436                    "internal error: delayed smudge without checkout queue for {rel_path}"
1437                ));
1438            };
1439            q.push_delayed(
1440                proc_cmd.clone(),
1441                rel_path.to_string(),
1442                smudge_meta.cloned().unwrap_or_default(),
1443            );
1444            return Ok(None);
1445        };
1446        buf = out;
1447    } else {
1448        match file_attrs.filter_smudge.as_ref() {
1449            Some(smudge_cmd) => match run_filter(smudge_cmd, &buf, rel_path) {
1450                Ok(filtered) => buf = filtered,
1451                Err(_e) => {
1452                    if file_attrs.filter_smudge_required {
1453                        return Err(format!("fatal: {rel_path}: smudge filter {driver} failed"));
1454                    }
1455                }
1456            },
1457            None => {
1458                if file_attrs.filter_smudge_required {
1459                    return Err(format!("fatal: {rel_path}: smudge filter {driver} failed"));
1460                }
1461            }
1462        }
1463    }
1464
1465    Ok(Some(buf))
1466}
1467
1468/// Like [`convert_to_worktree`] without delayed-checkout queueing (always materializes or errors).
1469#[must_use]
1470pub fn convert_to_worktree_eager(
1471    data: &[u8],
1472    rel_path: &str,
1473    conv: &ConversionConfig,
1474    file_attrs: &FileAttrs,
1475    oid_hex: Option<&str>,
1476    smudge_meta: Option<&FilterSmudgeMeta>,
1477) -> Result<Vec<u8>, String> {
1478    match convert_to_worktree(data, rel_path, conv, file_attrs, oid_hex, smudge_meta, None)? {
1479        Some(v) => Ok(v),
1480        None => Err(format!(
1481            "internal error: unexpected delayed smudge for {rel_path}"
1482        )),
1483    }
1484}
1485
1486/// Decide whether to convert LF→CRLF on output (working tree / smudge direction).
1487#[must_use]
1488pub fn should_convert_to_crlf(conv: &ConversionConfig, attrs: &FileAttrs, data: &[u8]) -> bool {
1489    match attrs.crlf_legacy {
1490        CrlfLegacyAttr::Unset | CrlfLegacyAttr::Input => return false,
1491        CrlfLegacyAttr::Crlf => {
1492            if attrs.text == TextAttr::Unset {
1493                return false;
1494            }
1495            // Legacy `crlf` (set) forces CRLF on checkout (even for paths Git
1496            // would otherwise treat as binary; see t0020 "t* crlf" + `three`).
1497            return true;
1498        }
1499        CrlfLegacyAttr::Unspecified => {}
1500    }
1501
1502    // If text is explicitly unset, never convert
1503    if attrs.text == TextAttr::Unset {
1504        return false;
1505    }
1506
1507    // If there's an explicit eol attribute
1508    if attrs.eol != EolAttr::Unspecified {
1509        if attrs.text == TextAttr::Auto && is_binary(data) {
1510            return false;
1511        }
1512        if attrs.eol != EolAttr::Crlf {
1513            return false;
1514        }
1515        // `text=auto` + `eol=crlf` → Git `CRLF_AUTO_CRLF` (safe mixed handling).
1516        if attrs.text == TextAttr::Auto {
1517            return auto_crlf_should_smudge_lf_to_crlf(data);
1518        }
1519        // Explicit `eol=crlf` with `text` set, etc. → `CRLF_TEXT_CRLF` (always normalize).
1520        return true;
1521    }
1522
1523    // If text is explicitly set, use eol config
1524    if attrs.text == TextAttr::Set {
1525        return output_eol_is_crlf(conv);
1526    }
1527
1528    if attrs.text == TextAttr::Auto {
1529        if is_binary(data) {
1530            return false;
1531        }
1532        if !output_eol_is_crlf(conv) {
1533            return false;
1534        }
1535        return auto_crlf_should_smudge_lf_to_crlf(data);
1536    }
1537
1538    // No text attribute: fall back to core.autocrlf
1539    match conv.autocrlf {
1540        AutoCrlf::True => {
1541            if is_binary(data) {
1542                return false;
1543            }
1544            auto_crlf_should_smudge_lf_to_crlf(data)
1545        }
1546        AutoCrlf::Input | AutoCrlf::False => false,
1547    }
1548}
1549
1550/// Whether the output EOL should be CRLF based on config.
1551fn output_eol_is_crlf(conv: &ConversionConfig) -> bool {
1552    // Git `text_eol_is_crlf`: autocrlf=input forces LF output before `core.eol` is consulted.
1553    if conv.autocrlf == AutoCrlf::Input {
1554        return false;
1555    }
1556    if conv.autocrlf == AutoCrlf::True {
1557        return true;
1558    }
1559    match conv.eol {
1560        CoreEol::Crlf => true,
1561        CoreEol::Lf => false,
1562        CoreEol::Native => {
1563            // On Unix, native is LF
1564            cfg!(windows)
1565        }
1566    }
1567}
1568
/// Expand `$Id$` (and re-expand `$Id: ... $`) to `$Id: <oid> $` in `data`.
///
/// Follows Git's `ident_to_worktree` in `convert.c`: the closing `$` must be on the same line
/// as the keyword, and foreign idents (a space inside the payload that is not its last
/// character) are left unchanged. Unlike the C code, a `\r` also ends the line here.
fn expand_ident(data: &[u8], oid: &str) -> Vec<u8> {
    // Cheap pre-scan: without any candidate region the input is returned as a plain copy.
    if !count_ident_regions(data) {
        return data.to_vec();
    }
    let replacement = format!("$Id: {oid} $");
    let mut out = Vec::with_capacity(data.len() + 60);
    let mut i = 0;
    while i < data.len() {
        match rewritable_ident_len(&data[i..]) {
            Some(keyword_len) => {
                out.extend_from_slice(replacement.as_bytes());
                i += keyword_len;
            }
            None => {
                out.push(data[i]);
                i += 1;
            }
        }
    }
    out
}

/// Whether the buffer contains any `$Id$` / `$Id: ... $` region (Git `count_ident`).
///
/// Foreign idents count as regions here too; [`expand_ident`] filters them out afterwards.
fn count_ident_regions(data: &[u8]) -> bool {
    (0..data.len()).any(|start| {
        let Some(after_id) = data[start..].strip_prefix(b"$Id") else {
            return false;
        };
        match after_id.first() {
            Some(b'$') => true,
            Some(b':') => ident_payload_len(&after_id[1..]).is_some(),
            _ => false,
        }
    })
}

/// Length in bytes of the ident keyword at the very start of `at` that `expand_ident` should
/// replace, or `None` when `at` does not start with a rewritable keyword.
fn rewritable_ident_len(at: &[u8]) -> Option<usize> {
    let after_id = at.strip_prefix(b"$Id")?;
    match after_id.first() {
        // Collapsed form: `$Id$`.
        Some(b'$') => Some(4),
        // Expanded form: `$Id:` + payload + `$`.
        Some(b':') => {
            let payload_len = ident_payload_len(&after_id[1..])?;
            let payload = &after_id[1..1 + payload_len];
            // Foreign ident (Git `ident_to_worktree`): the first space after the byte that
            // follows `:` must be the last payload byte, otherwise the keyword is kept.
            let foreign = payload
                .iter()
                .skip(1)
                .position(|&b| b == b' ')
                .is_some_and(|rel| rel + 2 < payload.len());
            if foreign {
                None
            } else {
                // `$Id` + `:` + payload + closing `$`.
                Some(4 + payload_len + 1)
            }
        }
        _ => None,
    }
}

/// Number of bytes between the `:` of `$Id:` and the closing `$`, provided that `$` appears
/// before the first `\n` or `\r` of `tail`.
fn ident_payload_len(tail: &[u8]) -> Option<usize> {
    tail.iter()
        .take_while(|&&b| b != b'\n' && b != b'\r')
        .position(|&b| b == b'$')
}
1681
/// Collapse every `$Id: ... $` back to `$Id$`.
///
/// The closing `$` must appear before the next `\n` or `\r`; otherwise the text is copied
/// through unchanged. No foreign-ident check is applied in this direction.
pub fn collapse_ident(data: &[u8]) -> Vec<u8> {
    let mut out = Vec::with_capacity(data.len());
    let mut rest = data;
    while let Some((&first, tail)) = rest.split_first() {
        if let Some(after_colon) = rest.strip_prefix(b"$Id:") {
            let closing = after_colon
                .iter()
                .take_while(|&&b| b != b'\n' && b != b'\r')
                .position(|&b| b == b'$');
            if let Some(end) = closing {
                out.extend_from_slice(b"$Id$");
                // Resume right after the closing `$`.
                rest = &after_colon[end + 1..];
                continue;
            }
        }
        out.push(first);
        rest = tail;
    }
    out
}
1705
/// Shell-quote `s` with single quotes for `sh`, rewriting each embedded `'` as `'\''` (the
/// escape Git's `sq_quote_buf` uses for quotes).
fn sq_quote_buf(s: &str) -> String {
    let mut out = String::with_capacity(s.len() + 2);
    out.push('\'');
    for ch in s.chars() {
        if ch == '\'' {
            out.push_str("'\\''");
        } else {
            out.push(ch);
        }
    }
    out.push('\'');
    out
}

/// Expand Git filter command placeholders: `%%` → `%`, `%f` → quoted repository-relative path.
///
/// Any other `%` (including a trailing one) is copied through literally.
fn expand_filter_command(cmd: &str, rel_path: &str) -> String {
    let mut out = String::with_capacity(cmd.len() + rel_path.len() + 8);
    let mut chars = cmd.chars().peekable();
    while let Some(c) = chars.next() {
        if c != '%' {
            out.push(c);
            continue;
        }
        match chars.peek() {
            Some('%') => {
                chars.next();
                out.push('%');
            }
            Some('f') => {
                chars.next();
                out.push_str(&sq_quote_buf(rel_path));
            }
            // Not a placeholder: keep the `%`; the following character is handled normally.
            _ => out.push('%'),
        }
    }
    out
}

/// Run a filter command through `sh -c`, piping `data` to its stdin and returning its stdout.
///
/// Stdin is fed from a helper thread while this thread collects the output. Writing all of
/// stdin before reading stdout would deadlock on any filter that streams output while it is
/// still reading input (e.g. `cat`) once the payload exceeds the OS pipe buffers.
///
/// # Errors
/// Returns an error when the command cannot be spawned, when writing to its stdin fails for
/// a reason other than a broken pipe, when waiting for it fails, or when it exits with a
/// non-success status.
fn run_filter(cmd: &str, data: &[u8], rel_path: &str) -> Result<Vec<u8>, std::io::Error> {
    use std::io::{ErrorKind, Write};

    let expanded = expand_filter_command(cmd, rel_path);
    let mut child = Command::new("sh")
        .arg("-c")
        .arg(&expanded)
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .stderr(Stdio::inherit())
        .spawn()?;

    // Move the pipe out of `child` so the writer thread owns (and eventually closes) it.
    let stdin = child.stdin.take();

    let (feed_result, output) = std::thread::scope(|scope| {
        let feeder = scope.spawn(move || -> std::io::Result<()> {
            let Some(mut pipe) = stdin else {
                return Ok(());
            };
            let written = pipe.write_all(data);
            // Close stdin so the filter sees EOF.
            drop(pipe);
            match written {
                // Match Git: if the filter exits without reading stdin, ignore EPIPE.
                Err(e) if e.kind() != ErrorKind::BrokenPipe => Err(e),
                _ => Ok(()),
            }
        });
        // Drain stdout (and reap the child) concurrently with the writer.
        let output = child.wait_with_output();
        let feed_result = feeder.join().unwrap_or_else(|_| {
            Err(std::io::Error::other("filter stdin writer thread panicked"))
        });
        (feed_result, output)
    });

    feed_result?;
    let output = output?;
    if !output.status.success() {
        return Err(std::io::Error::other(format!(
            "filter command exited with status {}",
            output.status
        )));
    }

    Ok(output.stdout)
}
1777
// The `AttrRule` type is internal, but the vec of rules is exposed through
// `load_gitattributes`. The public API uses that opaque `Vec` from
// `load_gitattributes` together with `get_file_attrs`.
1780
1781/// Opaque type alias for loaded gitattributes rules.
1782pub type GitAttributes = Vec<AttrRule>;
1783
#[cfg(test)]
mod tests {
    use super::*;

    /// Conversion settings shared by the smudge tests:
    /// `autocrlf = True`, `eol = Lf`, `safecrlf = False`.
    fn autocrlf_true_config() -> ConversionConfig {
        ConversionConfig {
            autocrlf: AutoCrlf::True,
            eol: CoreEol::Lf,
            safecrlf: SafeCrlf::False,
        }
    }

    /// CRLF pairs collapse to LF; LF-only input is left alone.
    #[test]
    fn test_crlf_to_lf() {
        assert_eq!(crlf_to_lf(b"hello\r\nworld\r\n"), b"hello\nworld\n");
        assert_eq!(crlf_to_lf(b"hello\nworld\n"), b"hello\nworld\n");
        assert_eq!(crlf_to_lf(b"hello\r\n"), b"hello\n");
    }

    /// Bare LF becomes CRLF; existing CRLF is not doubled.
    #[test]
    fn test_lf_to_crlf() {
        assert_eq!(lf_to_crlf(b"hello\nworld\n"), b"hello\r\nworld\r\n");
        assert_eq!(lf_to_crlf(b"hello\r\nworld\r\n"), b"hello\r\nworld\r\n");
    }

    /// `has_crlf` reports whether a CRLF pair is present.
    #[test]
    fn test_has_crlf() {
        assert!(has_crlf(b"hello\r\nworld"));
        assert!(!has_crlf(b"hello\nworld"));
    }

    /// A blob mixing LF and CRLF lines is checked out unchanged under these settings.
    #[test]
    fn smudge_mixed_line_endings_unchanged_with_autocrlf_true() {
        let blob = b"Oh\nhere\nis\nCRLF\r\nin\ntext\n".to_vec();
        let conv = autocrlf_true_config();
        let attrs = FileAttrs::default();
        let out = convert_to_worktree_eager(&blob, "mixed", &conv, &attrs, None, None).unwrap();
        assert_eq!(out, blob);
    }

    /// An LF-only blob is converted to CRLF on checkout under these settings.
    #[test]
    fn smudge_lf_only_gets_crlf_with_autocrlf_true() {
        let blob = b"a\nb\n";
        let conv = autocrlf_true_config();
        let attrs = FileAttrs::default();
        let out = convert_to_worktree_eager(blob, "x", &conv, &attrs, None, None).unwrap();
        assert_eq!(out, b"a\r\nb\r\n");
    }

    /// A NUL byte marks content as binary.
    #[test]
    fn test_is_binary() {
        assert!(is_binary(b"hello\0world"));
        assert!(!is_binary(b"hello world"));
    }

    /// A trailing-slash pattern matches a directory of that name, not a file.
    #[test]
    fn attr_dir_only_pattern_does_not_match_same_named_file() {
        let rules = parse_gitattributes_content("ignored-only-if-dir/ export-ignore\n");
        let rule = &rules[0];
        assert!(rule.must_be_dir);
        assert!(rule.basename_only);

        let matches_nested_file =
            attr_rule_matches(rule, "not-ignored-dir/ignored-only-if-dir", false);
        assert!(!matches_nested_file);

        let matches_directory = attr_rule_matches(rule, "ignored-only-if-dir", true);
        assert!(matches_directory);
    }

    /// `$Id$` expands to `$Id: <oid> $` and collapses back to `$Id$`.
    #[test]
    fn test_expand_collapse_ident() {
        let data = b"$Id$";
        let expanded = expand_ident(data, "abc123");
        assert_eq!(expanded, b"$Id: abc123 $");
        let collapsed = collapse_ident(&expanded);
        assert_eq!(collapsed, b"$Id$");
    }

    /// An unterminated `$Id:` on one line must not swallow the keyword on the next line.
    #[test]
    fn expand_ident_does_not_span_lines_for_partial_keyword() {
        let data = b"$Id: NoTerminatingSymbol\n$Id: deadbeef $\n";
        let expanded = expand_ident(data, "newoid");
        assert_eq!(expanded, b"$Id: NoTerminatingSymbol\n$Id: newoid $\n");
    }

    /// A foreign `$Id: … $` whose payload contains spaces is left untouched.
    #[test]
    fn expand_ident_preserves_foreign_id_with_internal_spaces() {
        let data = b"$Id: Foreign Commit With Spaces $\n";
        let expanded = expand_ident(data, "abc");
        assert_eq!(expanded, data);
    }

    /// `%f` is replaced by the single-quoted path and `%%` by a literal `%`.
    #[test]
    fn expand_filter_command_percent_f_quotes_path() {
        let with_path = expand_filter_command("sh ./x.sh %f --extra", "name  with 'sq'");
        assert_eq!(with_path, "sh ./x.sh 'name  with '\\''sq'\\''' --extra");

        let with_percent = expand_filter_command("a %% b", "p");
        assert_eq!(with_percent, "a % b");
    }
}
grit_lib/crlf.rs

grit_lib/
crlf.rs