grit_lib/
crlf.rs

1//! CRLF / EOL conversion and clean/smudge filter support.
2//!
3//! This module handles line-ending conversion when staging files (`git add`)
4//! and checking out files (`git checkout`, `read-tree -u`, `checkout-index`).
5//!
6//! Config knobs:
7//!   - `core.autocrlf` (true / input / false)
8//!   - `core.eol` (lf / crlf / native)
9//!   - `core.safecrlf` (true / warn / false)
10//!
11//! Gitattributes:
12//!   - `text` / `text=auto` / `-text` / `binary`
13//!   - `eol=lf` / `eol=crlf`
14//!   - `filter=<name>` (with `filter.<name>.clean` / `filter.<name>.smudge`)
15//!   - `ident` keyword expansion
16
17use std::path::{Path, PathBuf};
18use std::process::{Command, Stdio};
19
20use encoding_rs::UTF_8;
21
22use crate::config::ConfigSet;
23use crate::filter_process::{apply_process_clean, apply_process_smudge, FilterSmudgeMeta};
24
/// Parsed value of the `core.autocrlf` configuration key.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum AutoCrlf {
    /// A truthy setting (`true`, `yes`, `on`, `1`).
    True,
    /// The special `input` setting.
    Input,
    /// Any other value, or the key is not configured.
    False,
}
32
/// Parsed value of the `core.eol` configuration key.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum CoreEol {
    /// `core.eol=lf`.
    Lf,
    /// `core.eol=crlf`.
    Crlf,
    /// `core.eol=native`, an unrecognised value, or the key is not configured.
    Native,
}
40
/// Parsed value of the `core.safecrlf` configuration key.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum SafeCrlf {
    /// A truthy setting (`true`, `yes`, `on`, `1`).
    True,
    /// `core.safecrlf=warn`; also used when the key is not configured.
    Warn,
    /// Any other configured value.
    False,
}
48
/// State of the `text` gitattribute for one path.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum TextAttr {
    /// Bare `text`: the path is always treated as text.
    Set,
    /// `text=auto`: content is inspected to decide whether it is text.
    Auto,
    /// `-text` (also produced by the `binary` macro): never convert.
    Unset,
    /// No `text` attribute applies to the path.
    Unspecified,
}
61
/// State of the `eol` gitattribute for one path.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum EolAttr {
    /// `eol=lf`.
    Lf,
    /// `eol=crlf`.
    Crlf,
    /// No (recognised) `eol` attribute applies to the path.
    Unspecified,
}
69
/// State of the legacy `crlf` gitattribute (deprecated in Git, but still honored here for
/// EOL conversion).
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum CrlfLegacyAttr {
    /// The attribute is absent; this is the default state.
    #[default]
    Unspecified,
    /// `-crlf`: CRLF conversion is disabled.
    Unset,
    /// `crlf=input`: normalize to LF in the object database; no CRLF on checkout.
    Input,
    /// Bare `crlf`: force CRLF on checkout for text files.
    Crlf,
}
82
/// State of the `merge` gitattribute for one path.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum MergeAttr {
    /// No `merge` attribute applies to the path.
    Unspecified,
    /// `-merge`: treat the path as binary / non-text when merging.
    Unset,
    /// `merge=<driver>`: use the merge driver with this name.
    Driver(String),
}
93
/// Effect of the `diff` gitattribute on diff output for one path.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum DiffAttr {
    /// No `diff` attribute applies; heuristics / defaults decide.
    Unspecified,
    /// `-diff` / `diff=unset` / `binary`: treat the path as binary for diff purposes.
    Unset,
    /// `diff=<driver>`: use the named diff driver (e.g. for textconv).
    Driver(String),
}
104
105/// Per-file attributes relevant to conversion.
106#[derive(Debug, Clone)]
107pub struct FileAttrs {
108    pub text: TextAttr,
109    pub eol: EolAttr,
110    /// Effect of the `diff` gitattribute on diff output.
111    pub diff_attr: DiffAttr,
112    /// `export-ignore` — omit from `git archive`.
113    pub export_ignore: bool,
114    /// `export-subst` — expand `$Format:` placeholders using the archived commit.
115    pub export_subst: bool,
116    pub filter_clean: Option<String>,
117    pub filter_smudge: Option<String>,
118    /// `filter.<name>.process` — long-running filter (takes precedence over clean/smudge commands).
119    pub filter_process: Option<String>,
120    /// Driver name from the active `filter=<name>` gitattribute (for error messages).
121    pub filter_driver_name: Option<String>,
122    /// Whether `filter.<name>.required` is set for this path's filter driver.
123    pub filter_smudge_required: bool,
124    /// Same config key as smudge; clean direction fails when unset if true.
125    pub filter_clean_required: bool,
126    pub ident: bool,
127    pub merge: MergeAttr,
128    pub conflict_marker_size: Option<String>,
129    /// Working tree encoding (e.g. "utf-16") — content is converted to UTF-8 on add.
130    pub working_tree_encoding: Option<String>,
131    /// Legacy `crlf` / `-crlf` / `crlf=input` from `.gitattributes`.
132    pub crlf_legacy: CrlfLegacyAttr,
133    /// `whitespace` attribute value: `None` if unset, `Some("set")` for bare `whitespace`,
134    /// `Some("unset")` for `-whitespace`, or `Some("trailing,...")` for `whitespace=...`.
135    pub whitespace: Option<String>,
136}
137
138impl Default for FileAttrs {
139    fn default() -> Self {
140        FileAttrs {
141            text: TextAttr::Unspecified,
142            eol: EolAttr::Unspecified,
143            diff_attr: DiffAttr::Unspecified,
144            export_ignore: false,
145            export_subst: false,
146            filter_clean: None,
147            filter_smudge: None,
148            filter_process: None,
149            filter_driver_name: None,
150            filter_smudge_required: false,
151            filter_clean_required: false,
152            ident: false,
153            merge: MergeAttr::Unspecified,
154            conflict_marker_size: None,
155            working_tree_encoding: None,
156            crlf_legacy: CrlfLegacyAttr::Unspecified,
157            whitespace: None,
158        }
159    }
160}
161
162/// Global conversion settings derived from config.
163#[derive(Debug, Clone)]
164pub struct ConversionConfig {
165    pub autocrlf: AutoCrlf,
166    pub eol: CoreEol,
167    pub safecrlf: SafeCrlf,
168}
169
170impl ConversionConfig {
171    /// Load conversion settings from a ConfigSet.
172    pub fn from_config(config: &ConfigSet) -> Self {
173        let autocrlf = match config.get("core.autocrlf") {
174            Some(v) => match v.to_lowercase().as_str() {
175                "true" | "yes" | "on" | "1" => AutoCrlf::True,
176                "input" => AutoCrlf::Input,
177                _ => AutoCrlf::False,
178            },
179            None => AutoCrlf::False,
180        };
181
182        let eol = match config.get("core.eol") {
183            Some(v) => match v.to_lowercase().as_str() {
184                "crlf" => CoreEol::Crlf,
185                "lf" => CoreEol::Lf,
186                "native" => CoreEol::Native,
187                _ => CoreEol::Native,
188            },
189            None => CoreEol::Native,
190        };
191
192        let safecrlf = match config.get("core.safecrlf") {
193            Some(v) => match v.to_lowercase().as_str() {
194                "true" | "yes" | "on" | "1" => SafeCrlf::True,
195                "warn" => SafeCrlf::Warn,
196                _ => SafeCrlf::False,
197            },
198            // Git warns on round-trip EOL issues by default when unset.
199            None => SafeCrlf::Warn,
200        };
201
202        ConversionConfig {
203            autocrlf,
204            eol,
205            safecrlf,
206        }
207    }
208}
209
/// One parsed line of a gitattributes file: a path pattern plus the attributes it assigns.
#[derive(Clone, Debug)]
pub struct AttrRule {
    /// Glob text used for matching, with any trailing directory `/` already removed
    /// (that fact is recorded in `must_be_dir`).
    pattern: String,
    /// True when the source pattern ended with `/`, so it matches directories only
    /// (Git `PATTERN_FLAG_MUSTBEDIR`).
    must_be_dir: bool,
    /// True when the pattern body contains no `/`, so only the final path component is
    /// matched (Git `PATTERN_FLAG_NODIR`).
    basename_only: bool,
    /// `(name, value)` pairs; `value` is `"set"`, `"unset"`, or the text after `=`.
    attrs: Vec<(String, String)>,
}
221
222impl AttrRule {
223    /// Diff driver names assigned by this rule (`diff=<driver>`), excluding `set`/`unset`.
224    pub fn diff_drivers(&self) -> impl Iterator<Item = &str> + '_ {
225        self.attrs.iter().filter_map(|(name, value)| {
226            if name == "diff" && !value.is_empty() && value != "unset" && value != "set" {
227                Some(value.as_str())
228            } else {
229                None
230            }
231        })
232    }
233}
234
235/// Load .gitattributes from the worktree root.
236pub fn load_gitattributes(work_tree: &Path) -> Vec<AttrRule> {
237    let mut rules = Vec::new();
238
239    let root_attrs = work_tree.join(".gitattributes");
240    if let Ok(content) = std::fs::read_to_string(&root_attrs) {
241        parse_gitattributes(&content, &mut rules);
242    }
243
244    let info_attrs = work_tree.join(".git/info/attributes");
245    if let Ok(content) = std::fs::read_to_string(&info_attrs) {
246        parse_gitattributes(&content, &mut rules);
247    }
248
249    rules
250}
251
252/// Parse gitattributes content into attribute rules.
253///
254/// This is useful when attributes are sourced from non-worktree inputs
255/// (for example, tree objects selected by `--attr-source`).
256#[must_use]
257pub fn parse_gitattributes_content(content: &str) -> Vec<AttrRule> {
258    let mut rules = Vec::new();
259    parse_gitattributes(content, &mut rules);
260    rules
261}
262
263/// Load .gitattributes from the index (for use during checkout when
264/// the worktree file may not yet exist).
265pub fn load_gitattributes_from_index(
266    index: &crate::index::Index,
267    odb: &crate::odb::Odb,
268) -> Vec<AttrRule> {
269    let mut rules = Vec::new();
270
271    // Look for .gitattributes in the index (stage 0)
272    if let Some(entry) = index.get(b".gitattributes", 0) {
273        if let Ok(obj) = odb.read(&entry.oid) {
274            if let Ok(content) = String::from_utf8(obj.data) {
275                parse_gitattributes(&content, &mut rules);
276            }
277        }
278    }
279
280    rules
281}
282
283/// Load `.gitattributes` rules that apply to `rel_path`, including root and
284/// nested `dir/.gitattributes` along parent directories (Git-consistent order:
285/// root first, then each ancestor directory; later rules win in [`get_file_attrs`]).
286///
287/// Reads from the working tree when present, otherwise from a stage-0 index entry.
288pub fn load_gitattributes_for_checkout(
289    work_tree: &Path,
290    rel_path: &str,
291    index: &crate::index::Index,
292    odb: &crate::odb::Odb,
293) -> Vec<AttrRule> {
294    let mut rules = load_gitattributes(work_tree);
295
296    // Root `.gitattributes` may exist only in the index while the worktree file
297    // is missing (e.g. t0020 in-tree attributes after `rm -rf .gitattributes`).
298    if !work_tree.join(".gitattributes").exists() {
299        if let Some(entry) = index.get(b".gitattributes", 0) {
300            if let Ok(obj) = odb.read(&entry.oid) {
301                if let Ok(content) = String::from_utf8(obj.data) {
302                    parse_gitattributes(&content, &mut rules);
303                }
304            }
305        }
306    }
307
308    let path = Path::new(rel_path);
309    if let Some(parent) = path.parent() {
310        let mut accum = PathBuf::new();
311        for comp in parent.components() {
312            accum.push(comp);
313            let ga_rel = accum.join(".gitattributes");
314            let wt_ga = work_tree.join(&ga_rel);
315            if let Ok(content) = std::fs::read_to_string(&wt_ga) {
316                parse_gitattributes(&content, &mut rules);
317            } else {
318                let key = path_to_index_bytes(&ga_rel);
319                if let Some(entry) = index.get(&key, 0) {
320                    if let Ok(obj) = odb.read(&entry.oid) {
321                        if let Ok(content) = String::from_utf8(obj.data) {
322                            parse_gitattributes(&content, &mut rules);
323                        }
324                    }
325                }
326            }
327        }
328    }
329
330    rules
331}
332
/// Converts a relative path into the byte string used as an index key.
///
/// On Unix the raw OS bytes are returned unchanged. On other platforms the path is
/// converted lossily to UTF-8 and the platform separator is replaced with `/`, because
/// index paths always use forward slashes (a key such as `dir\.gitattributes` would never
/// be found).
fn path_to_index_bytes(path: &Path) -> Vec<u8> {
    #[cfg(unix)]
    {
        use std::os::unix::ffi::OsStrExt;
        path.as_os_str().as_bytes().to_vec()
    }
    #[cfg(not(unix))]
    {
        let text = path.to_string_lossy();
        if std::path::MAIN_SEPARATOR == '/' {
            text.as_bytes().to_vec()
        } else {
            text.replace(std::path::MAIN_SEPARATOR, "/").into_bytes()
        }
    }
}
344
345fn parse_gitattributes(content: &str, rules: &mut Vec<AttrRule>) {
346    for line in content.lines() {
347        let line = line.trim();
348        if line.is_empty() || line.starts_with('#') {
349            continue;
350        }
351
352        let mut parts = line.split_whitespace();
353        let raw_pattern = match parts.next() {
354            Some(p) => p,
355            None => continue,
356        };
357
358        let mut pat = raw_pattern.to_owned();
359        let mut must_be_dir = false;
360        if pat.ends_with('/') && pat.len() > 1 {
361            pat.pop();
362            must_be_dir = true;
363        }
364        let basename_only = !pat.contains('/');
365
366        let mut attrs = Vec::new();
367        for part in parts {
368            if part == "binary" {
369                attrs.push(("text".to_owned(), "unset".to_owned()));
370                attrs.push(("diff".to_owned(), "unset".to_owned()));
371            } else if let Some(rest) = part.strip_prefix('-') {
372                attrs.push((rest.to_owned(), "unset".to_owned()));
373            } else if let Some((key, val)) = part.split_once('=') {
374                attrs.push((key.to_owned(), val.to_owned()));
375            } else {
376                attrs.push((part.to_owned(), "set".to_owned()));
377            }
378        }
379
380        if !attrs.is_empty() {
381            rules.push(AttrRule {
382                pattern: pat,
383                must_be_dir,
384                basename_only,
385                attrs,
386            });
387        }
388    }
389}
390
/// Returns true when `value`, ignoring surrounding whitespace and ASCII case, is one of
/// `true`, `yes`, `on` or `1`.
fn config_bool_truthy(value: &str) -> bool {
    let candidate = value.trim();
    ["true", "yes", "on", "1"]
        .iter()
        .any(|truthy| candidate.eq_ignore_ascii_case(truthy))
}
397
398/// Get file attributes for a given path from .gitattributes rules and config.
399///
400/// `is_dir` should be true when `rel_path` names a directory (Git passes a trailing `/` for
401/// directory paths in some call sites; we accept either trailing `/` or this flag from tree walks).
402pub fn get_file_attrs(
403    rules: &[AttrRule],
404    rel_path: &str,
405    is_dir: bool,
406    config: &ConfigSet,
407) -> FileAttrs {
408    let mut fa = FileAttrs::default();
409
410    // Walk rules; last match wins for each attribute.
411    for rule in rules {
412        if attr_rule_matches(rule, rel_path, is_dir) {
413            for (name, value) in &rule.attrs {
414                match name.as_str() {
415                    "text" => {
416                        fa.text = match value.as_str() {
417                            "set" => TextAttr::Set,
418                            "unset" => TextAttr::Unset,
419                            "auto" => TextAttr::Auto,
420                            _ => TextAttr::Unspecified,
421                        };
422                    }
423                    "eol" => {
424                        fa.eol = match value.as_str() {
425                            "lf" => EolAttr::Lf,
426                            "crlf" => EolAttr::Crlf,
427                            _ => EolAttr::Unspecified,
428                        };
429                    }
430                    "filter" => {
431                        if value == "unset" {
432                            fa.filter_clean = None;
433                            fa.filter_smudge = None;
434                            fa.filter_process = None;
435                            fa.filter_driver_name = None;
436                            fa.filter_smudge_required = false;
437                            fa.filter_clean_required = false;
438                        } else {
439                            let clean_key = format!("filter.{value}.clean");
440                            let smudge_key = format!("filter.{value}.smudge");
441                            let process_key = format!("filter.{value}.process");
442                            let req_key = format!("filter.{value}.required");
443                            fa.filter_driver_name = Some(value.clone());
444                            fa.filter_process = config.get(&process_key).filter(|s| !s.is_empty());
445                            if fa.filter_process.is_some() {
446                                fa.filter_clean = None;
447                                fa.filter_smudge = None;
448                            } else {
449                                fa.filter_clean = config.get(&clean_key);
450                                fa.filter_smudge = config.get(&smudge_key);
451                            }
452                            let required =
453                                config.get(&req_key).is_some_and(|v| config_bool_truthy(&v));
454                            fa.filter_smudge_required = required;
455                            fa.filter_clean_required = required;
456                        }
457                    }
458                    "diff" => {
459                        if value == "unset" {
460                            fa.diff_attr = DiffAttr::Unset;
461                        } else if !value.is_empty() && value != "set" {
462                            fa.diff_attr = DiffAttr::Driver(value.clone());
463                        }
464                    }
465                    "ident" => {
466                        fa.ident = value == "set";
467                    }
468                    "export-ignore" => {
469                        fa.export_ignore = value != "unset";
470                    }
471                    "export-subst" => {
472                        fa.export_subst = value != "unset";
473                    }
474                    "merge" => {
475                        fa.merge = match value.as_str() {
476                            "unset" => MergeAttr::Unset,
477                            "set" => MergeAttr::Unspecified,
478                            other => MergeAttr::Driver(other.to_string()),
479                        };
480                    }
481                    "conflict-marker-size" => {
482                        if value == "unset" {
483                            fa.conflict_marker_size = None;
484                        } else {
485                            fa.conflict_marker_size = Some(value.clone());
486                        }
487                    }
488                    "working-tree-encoding" => {
489                        if value != "unset" && !value.is_empty() {
490                            fa.working_tree_encoding = Some(value.clone());
491                        }
492                    }
493                    "crlf" => {
494                        fa.crlf_legacy = match value.as_str() {
495                            "unset" => CrlfLegacyAttr::Unset,
496                            "input" => CrlfLegacyAttr::Input,
497                            "set" => CrlfLegacyAttr::Crlf,
498                            _ => CrlfLegacyAttr::Unspecified,
499                        };
500                    }
501                    "whitespace" => {
502                        if value == "unset" {
503                            fa.whitespace = Some("unset".to_owned());
504                        } else if !value.is_empty() {
505                            fa.whitespace = Some(value.clone());
506                        }
507                    }
508                    _ => {}
509                }
510            }
511        }
512    }
513
514    fa
515}
516
517/// Returns whether gitattribute `attr_name` is set (last matching rule wins), for arbitrary
518/// attribute names used by pathspec `:(attr:...)`.
519///
520/// `is_dir` is whether `path` refers to a directory (see [`get_file_attrs`]).
521#[must_use]
522pub fn path_has_gitattribute(
523    rules: &[AttrRule],
524    path: &str,
525    is_dir: bool,
526    attr_name: &str,
527) -> bool {
528    let mut last: Option<&str> = None;
529    for rule in rules {
530        if attr_rule_matches(rule, path, is_dir) {
531            for (name, value) in &rule.attrs {
532                if name == attr_name {
533                    last = Some(value.as_str());
534                }
535            }
536        }
537    }
538    match last {
539        None | Some("unset") => false,
540        Some(_) => true,
541    }
542}
543
544/// Whether `rule` matches `rel_path` given directory vs file context (Git `path_matches`).
545#[must_use]
546pub fn attr_rule_matches(rule: &AttrRule, rel_path: &str, is_dir: bool) -> bool {
547    let path_is_dir = is_dir || rel_path.ends_with('/');
548    if rule.must_be_dir && !path_is_dir {
549        return false;
550    }
551    let path_for_glob = rel_path.trim_end_matches('/');
552    if rule.basename_only {
553        let basename = path_for_glob.rsplit('/').next().unwrap_or(path_for_glob);
554        glob_matches(rule.pattern.as_str(), basename)
555    } else {
556        glob_matches(rule.pattern.as_str(), path_for_glob)
557    }
558}
559
560fn glob_matches(pattern: &str, text: &str) -> bool {
561    glob_match_bytes(pattern.as_bytes(), text.as_bytes())
562}
563
/// Matches `text` against the glob `pat` using gitignore-style pathname rules.
///
/// Supported syntax:
/// * `?` — any single byte except `/`;
/// * `*` — any run of bytes that does not contain `/`;
/// * `**` — when it forms a whole path component (pattern start or `/` before it, pattern
///   end or `/` after it) it matches across directories, and `**/` may match nothing at
///   all; anywhere else it behaves like a single `*`;
/// * `[...]` — a set of bytes with `a-z` ranges and `!` / `^` negation; never matches `/`.
///   POSIX classes such as `[:alpha:]` are not recognised;
/// * `\x` — the byte `x` taken literally.
///
/// An unterminated `[` or a trailing `\` makes the pattern match nothing.
fn glob_match_bytes(pat: &[u8], text: &[u8]) -> bool {
    // Tests `c` against the bracket expression whose body starts at `pat[start]` (just past
    // the `[`). Returns the verdict and the index after the closing `]`, or `None` when the
    // expression is unterminated.
    fn bracket(pat: &[u8], start: usize, c: u8) -> Option<(bool, usize)> {
        let mut i = start;
        let negated = matches!(pat.get(i), Some(b'!' | b'^'));
        if negated {
            i += 1;
        }
        let mut hit = false;
        let mut first = true;
        loop {
            let mut lo = *pat.get(i)?;
            // A `]` in first position is a literal member, not the terminator.
            if lo == b']' && !first {
                return Some((hit != negated && c != b'/', i + 1));
            }
            first = false;
            if lo == b'\\' {
                i += 1;
                lo = *pat.get(i)?;
            }
            i += 1;
            let is_range =
                pat.get(i) == Some(&b'-') && pat.get(i + 1).is_some_and(|&b| b != b']');
            if is_range {
                i += 1;
                let mut hi = pat[i];
                if hi == b'\\' {
                    i += 1;
                    hi = *pat.get(i)?;
                }
                i += 1;
                hit |= (lo..=hi).contains(&c);
            } else {
                hit |= lo == c;
            }
        }
    }

    // Matches `pat[pi..]` against `text`; the whole pattern stays available so a `*` run
    // can inspect the byte in front of it.
    fn wild(pat: &[u8], mut pi: usize, mut text: &[u8]) -> bool {
        while let Some(&p) = pat.get(pi) {
            match p {
                b'*' => {
                    let run_start = pi;
                    while pat.get(pi) == Some(&b'*') {
                        pi += 1;
                    }
                    let spans_dirs = pi - run_start >= 2
                        && (run_start == 0 || pat[run_start - 1] == b'/')
                        && matches!(pat.get(pi), None | Some(b'/'));
                    if pi == pat.len() {
                        // Trailing `**` takes everything; trailing `*` stops at a slash.
                        return spans_dirs || !text.contains(&b'/');
                    }
                    // `**/` may stand for zero directories.
                    if spans_dirs && wild(pat, pi + 1, text) {
                        return true;
                    }
                    for split in 0..=text.len() {
                        if wild(pat, pi, &text[split..]) {
                            return true;
                        }
                        // A plain `*` cannot swallow a `/`.
                        if !spans_dirs && text.get(split) == Some(&b'/') {
                            return false;
                        }
                    }
                    return false;
                }
                b'?' => match text.split_first() {
                    Some((&c, tail)) if c != b'/' => {
                        text = tail;
                        pi += 1;
                    }
                    _ => return false,
                },
                b'[' => {
                    let Some((&c, tail)) = text.split_first() else {
                        return false;
                    };
                    match bracket(pat, pi + 1, c) {
                        Some((true, next)) => {
                            text = tail;
                            pi = next;
                        }
                        _ => return false,
                    }
                }
                _ => {
                    let literal = if p == b'\\' {
                        pi += 1;
                        match pat.get(pi) {
                            Some(&escaped) => escaped,
                            None => return false,
                        }
                    } else {
                        p
                    };
                    match text.split_first() {
                        Some((&c, tail)) if c == literal => {
                            text = tail;
                            pi += 1;
                        }
                        _ => return false,
                    }
                }
            }
        }
        text.is_empty()
    }

    wild(pat, 0, text)
}
587
/// Heuristic binary check: true when a NUL byte occurs within the first 8000 bytes of
/// `data`. Bytes beyond that window are not inspected.
pub fn is_binary(data: &[u8]) -> bool {
    data.iter().take(8000).any(|&byte| byte == 0)
}
593
// Bit flags mirroring Git `convert.c` `CONVERT_STAT_BITS_*`, produced by
// `gather_convert_stats` and rendered by `gather_convert_stats_ascii` (for `ls-files --eol`).
/// Set when the content has at least one LF that is not part of a CRLF pair.
const CONVERT_STAT_BITS_TXT_LF: u32 = 1 << 0;
/// Set when the content has at least one CRLF pair.
const CONVERT_STAT_BITS_TXT_CRLF: u32 = 1 << 1;
/// Set when the content is classified as binary.
const CONVERT_STAT_BITS_BIN: u32 = 1 << 2;
598
/// Byte-class counters gathered over a buffer by `gather_text_stat`.
#[derive(Clone, Default)]
struct TextStat {
    /// Number of NUL bytes.
    nul: u32,
    /// Number of CR bytes not immediately followed by LF.
    lonecr: u32,
    /// Number of LF bytes not immediately preceded by CR.
    lonelf: u32,
    /// Number of CR LF pairs.
    crlf: u32,
    /// Number of bytes counted as printable.
    printable: u32,
    /// Number of bytes counted as non-printable (NUL bytes are included here too).
    nonprintable: u32,
}
608
609fn gather_text_stat(data: &[u8]) -> TextStat {
610    let mut s = TextStat::default();
611    let mut i = 0usize;
612    while i < data.len() {
613        let c = data[i];
614        if c == b'\r' {
615            if i + 1 < data.len() && data[i + 1] == b'\n' {
616                s.crlf += 1;
617                i += 2;
618            } else {
619                s.lonecr += 1;
620                i += 1;
621            }
622            continue;
623        }
624        if c == b'\n' {
625            s.lonelf += 1;
626            i += 1;
627            continue;
628        }
629        if c == 127 {
630            s.nonprintable += 1;
631        } else if c < 32 {
632            match c {
633                b'\t' | b'\x08' | b'\x1b' | b'\x0c' => s.printable += 1,
634                0 => {
635                    s.nul += 1;
636                    s.nonprintable += 1;
637                }
638                _ => s.nonprintable += 1,
639            }
640        } else {
641            s.printable += 1;
642        }
643        i += 1;
644    }
645    s
646}
647
648fn convert_is_binary(stats: &TextStat) -> bool {
649    stats.lonecr > 0 || stats.nul > 0 || (stats.printable >> 7) < stats.nonprintable
650}
651
652fn git_text_stat(data: &[u8]) -> TextStat {
653    let mut stats = gather_text_stat(data);
654    if !data.is_empty() && data[data.len() - 1] == 0x1a {
655        stats.nonprintable = stats.nonprintable.saturating_sub(1);
656    }
657    stats
658}
659
660/// Git `will_convert_lf_to_crlf` using [`TextStat`] (same rules as [`should_convert_to_crlf`] on bytes).
661fn will_convert_lf_to_crlf_from_stats(
662    stats: &TextStat,
663    conv: &ConversionConfig,
664    attrs: &FileAttrs,
665) -> bool {
666    let has_lone_lf = stats.lonelf > 0;
667    let is_bin = convert_is_binary(stats);
668
669    match attrs.crlf_legacy {
670        CrlfLegacyAttr::Unset | CrlfLegacyAttr::Input => return false,
671        CrlfLegacyAttr::Crlf => {
672            if attrs.text == TextAttr::Unset {
673                return false;
674            }
675            return has_lone_lf;
676        }
677        CrlfLegacyAttr::Unspecified => {}
678    }
679
680    if attrs.text == TextAttr::Unset {
681        return false;
682    }
683
684    if attrs.eol != EolAttr::Unspecified {
685        if attrs.text == TextAttr::Auto && is_bin {
686            return false;
687        }
688        if attrs.eol != EolAttr::Crlf {
689            return false;
690        }
691        if attrs.text == TextAttr::Auto {
692            return auto_crlf_should_smudge_lf_to_crlf_from_stats(stats);
693        }
694        return has_lone_lf;
695    }
696
697    if attrs.text == TextAttr::Set {
698        if !output_eol_is_crlf(conv) {
699            return false;
700        }
701        return has_lone_lf;
702    }
703
704    if attrs.text == TextAttr::Auto {
705        if is_bin || !output_eol_is_crlf(conv) {
706            return false;
707        }
708        return auto_crlf_should_smudge_lf_to_crlf_from_stats(stats);
709    }
710
711    match conv.autocrlf {
712        AutoCrlf::True => {
713            if is_bin {
714                return false;
715            }
716            auto_crlf_should_smudge_lf_to_crlf_from_stats(stats)
717        }
718        AutoCrlf::Input | AutoCrlf::False => false,
719    }
720}
721
722fn auto_crlf_should_smudge_lf_to_crlf_from_stats(stats: &TextStat) -> bool {
723    if stats.lonelf == 0 {
724        return false;
725    }
726    if stats.lonecr > 0 || stats.crlf > 0 {
727        return false;
728    }
729    !convert_is_binary(stats)
730}
731
732fn gather_convert_stats(data: &[u8]) -> u32 {
733    if data.is_empty() {
734        return 0;
735    }
736    let mut stats = gather_text_stat(data);
737    if !data.is_empty() && data[data.len() - 1] == 0x1a {
738        stats.nonprintable = stats.nonprintable.saturating_sub(1);
739    }
740    let mut ret = 0u32;
741    if convert_is_binary(&stats) {
742        ret |= CONVERT_STAT_BITS_BIN;
743    }
744    if stats.crlf > 0 {
745        ret |= CONVERT_STAT_BITS_TXT_CRLF;
746    }
747    if stats.lonelf > 0 {
748        ret |= CONVERT_STAT_BITS_TXT_LF;
749    }
750    ret
751}
752
753/// Git `convert.c` `gather_convert_stats_ascii` — worktree/index blob EOL stats for `ls-files --eol`.
754#[must_use]
755pub fn gather_convert_stats_ascii(data: &[u8]) -> &'static str {
756    let convert_stats = gather_convert_stats(data);
757    if convert_stats & CONVERT_STAT_BITS_BIN != 0 {
758        return "-text";
759    }
760    match convert_stats {
761        CONVERT_STAT_BITS_TXT_LF => "lf",
762        CONVERT_STAT_BITS_TXT_CRLF => "crlf",
763        x if x == (CONVERT_STAT_BITS_TXT_LF | CONVERT_STAT_BITS_TXT_CRLF) => "mixed",
764        _ => "none",
765    }
766}
767
768/// Git `convert.c` `get_convert_attr_ascii` — ASCII summary of EOL-related attributes for
769/// `git ls-files --eol` (matches `attr_action` after attribute merge, before clean/smudge).
770#[must_use]
771pub fn convert_attr_ascii_for_ls_files(
772    rules: &[AttrRule],
773    rel_path: &str,
774    config: &ConfigSet,
775) -> String {
776    let fa = get_file_attrs(rules, rel_path, false, config);
777    // Mirror `git_path_check_crlf` for `text` then legacy `crlf` (Git checks `text` first).
778    let mut action = match fa.text {
779        TextAttr::Set => 1,   // CRLF_TEXT
780        TextAttr::Unset => 2, // CRLF_BINARY
781        TextAttr::Auto => 5,  // CRLF_AUTO
782        TextAttr::Unspecified => 0,
783    };
784    if action == 0 {
785        action = match fa.crlf_legacy {
786            CrlfLegacyAttr::Crlf => 1,
787            CrlfLegacyAttr::Unset => 2,
788            CrlfLegacyAttr::Input => 3, // CRLF_TEXT_INPUT
789            CrlfLegacyAttr::Unspecified => 0,
790        };
791    }
792    if action == 2 {
793        return "-text".to_string();
794    }
795    // Bare `eol=lf` / `eol=crlf` without `text` still implies text mode (`convert_attrs`).
796    if action == 0 {
797        if fa.eol == EolAttr::Unspecified {
798            return String::new();
799        }
800        action = 1; // CRLF_TEXT
801    }
802
803    // Merge `eol=` like `convert_attrs` (only when not already binary).
804    if fa.eol == EolAttr::Lf {
805        if action == 5 {
806            action = 7; // CRLF_AUTO_INPUT
807        } else {
808            action = 3; // CRLF_TEXT_INPUT
809        }
810    } else if fa.eol == EolAttr::Crlf {
811        if action == 5 {
812            action = 6; // CRLF_AUTO_CRLF
813        } else {
814            action = 4; // CRLF_TEXT_CRLF
815        }
816    }
817
818    // `attr_action` snapshot (Git assigns before splitting bare `text` / applying autocrlf).
819    let attr_action = action;
820
821    match attr_action {
822        1 => "text".to_string(),
823        3 => "text eol=lf".to_string(),
824        4 => "text eol=crlf".to_string(),
825        5 => "text=auto".to_string(),
826        6 => "text=auto eol=crlf".to_string(),
827        7 => "text=auto eol=lf".to_string(),
828        _ => String::new(),
829    }
830}
831
/// Returns true when `data` contains at least one CR immediately followed by LF.
pub fn has_crlf(data: &[u8]) -> bool {
    data.iter()
        .zip(data.iter().skip(1))
        .any(|(&first, &second)| first == b'\r' && second == b'\n')
}
836
/// Returns true when `data` contains an LF that is not immediately preceded by CR
/// (an LF at the very start counts).
pub fn has_lone_lf(data: &[u8]) -> bool {
    let mut previous: Option<u8> = None;
    for &byte in data {
        if byte == b'\n' && previous != Some(b'\r') {
            return true;
        }
        previous = Some(byte);
    }
    false
}
846
/// Returns true when `data` contains a CR that is not immediately followed by LF
/// (Git `text_stat.lonecr`); a CR at the very end counts.
fn has_lone_cr(data: &[u8]) -> bool {
    data.iter()
        .enumerate()
        .any(|(pos, &byte)| byte == b'\r' && data.get(pos + 1) != Some(&b'\n'))
}
856
857/// Git `convert.c` `will_convert_lf_to_crlf` for `CRLF_AUTO` / `CRLF_AUTO_INPUT` / `CRLF_AUTO_CRLF`:
858/// if the blob already has CRLF pairs or lone CRs, do not convert lone LFs to CRLF on checkout.
859fn auto_crlf_should_smudge_lf_to_crlf(data: &[u8]) -> bool {
860    if !has_lone_lf(data) {
861        return false;
862    }
863    if has_lone_cr(data) || has_crlf(data) {
864        return false;
865    }
866    if is_binary(data) {
867        return false;
868    }
869    true
870}
871
872/// Returns true if ALL line endings are CRLF (no lone LF).
873pub fn is_all_crlf(data: &[u8]) -> bool {
874    has_crlf(data) && !has_lone_lf(data)
875}
876
877/// Returns true if ALL line endings are LF (no CRLF).
878pub fn is_all_lf(data: &[u8]) -> bool {
879    has_lone_lf(data) && !has_crlf(data)
880}
881
882/// Git `convert.c` `has_crlf_in_index`: index blob already contains CRLF pairs (non-binary).
883#[must_use]
884pub fn has_crlf_in_index_blob(data: &[u8]) -> bool {
885    if !data.contains(&b'\r') {
886        return false;
887    }
888    let st = gather_convert_stats(data);
889    st & CONVERT_STAT_BITS_BIN == 0 && (st & CONVERT_STAT_BITS_TXT_CRLF) != 0
890}
891
892/// Whether clean conversion uses Git's `has_crlf_in_index` guard (`convert.c` only for
893/// `CRLF_AUTO`, `CRLF_AUTO_INPUT`, `CRLF_AUTO_CRLF`). Bare `eol=` without `text=auto` becomes
894/// `CRLF_TEXT_*` and must not use this guard.
895#[must_use]
896pub fn clean_uses_autocrlf_index_guard(attrs: &FileAttrs, conv: &ConversionConfig) -> bool {
897    if attrs.text == TextAttr::Unset || attrs.crlf_legacy == CrlfLegacyAttr::Unset {
898        return false;
899    }
900    if attrs.eol != EolAttr::Unspecified && attrs.text != TextAttr::Auto {
901        return false;
902    }
903    attrs.text == TextAttr::Auto
904        || (attrs.text == TextAttr::Unspecified
905            && matches!(conv.autocrlf, AutoCrlf::True | AutoCrlf::Input))
906}
907
/// Extra knobs for [`convert_to_git_with_opts`] (Git `CONV_EOL_RENORMALIZE` / index blob).
#[derive(Debug, Clone, Copy)]
pub struct ConvertToGitOpts<'a> {
    /// Bytes of the stage-0 blob recorded for this path before the current add; consulted by
    /// the safer-autocrlf guard to see whether the index already holds CRLF.
    pub index_blob: Option<&'a [u8]>,
    /// When true, the index-blob guard is bypassed so CRLF→LF is applied whenever it is
    /// configured (merge/cherry-pick renormalize).
    pub renormalize: bool,
    /// When false, the `core.safecrlf` simulation is skipped (internal diff/hashing callers
    /// that must not spam stderr).
    pub check_safecrlf: bool,
}

impl<'a> Default for ConvertToGitOpts<'a> {
    /// No index blob, no renormalization, safecrlf checking enabled.
    fn default() -> Self {
        ConvertToGitOpts {
            index_blob: None,
            renormalize: false,
            check_safecrlf: true,
        }
    }
}
928
929// ---------------------------------------------------------------------------
930// working-tree-encoding (Git `convert.c` `encode_to_git` / `encode_to_worktree`)
931// ---------------------------------------------------------------------------
932
/// Serializes a stream of UTF-16 code units as little-endian bytes (two bytes per unit,
/// no BOM).
fn utf16_scalar_iter_to_le_bytes(chars: impl Iterator<Item = u16>) -> Vec<u8> {
    chars.flat_map(u16::to_le_bytes).collect()
}
940
/// Serializes a stream of UTF-16 code units as big-endian bytes (two bytes per unit,
/// no BOM).
fn utf16_scalar_iter_to_be_bytes(chars: impl Iterator<Item = u16>) -> Vec<u8> {
    chars.flat_map(u16::to_be_bytes).collect()
}
948
/// Encodes every `char` of `s` as a big-endian 32-bit code point (four bytes each, no BOM).
fn utf32_chars_to_be_bytes(s: &str) -> Vec<u8> {
    s.chars()
        .flat_map(|ch| u32::from(ch).to_be_bytes())
        .collect()
}
956
/// Encodes every `char` of `s` as a little-endian 32-bit code point (four bytes each, no BOM).
fn utf32_chars_to_le_bytes(s: &str) -> Vec<u8> {
    s.chars()
        .flat_map(|ch| u32::from(ch).to_le_bytes())
        .collect()
}
964
/// Decodes a BOM-less UTF-32 byte sequence into UTF-8 bytes.
///
/// `big_endian` selects the byte order of each 4-byte unit. `rel_path` is only used in
/// error messages.
///
/// # Errors
///
/// Fails when the length is not a multiple of four, or when a unit is not a valid Unicode
/// scalar value (the first offending unit is reported).
fn decode_utf32_body_to_utf8_bytes(
    body: &[u8],
    rel_path: &str,
    big_endian: bool,
) -> Result<Vec<u8>, String> {
    if !body.len().is_multiple_of(4) {
        return Err(format!(
            "invalid UTF-32 length for working tree file '{rel_path}'"
        ));
    }
    let read_unit: fn([u8; 4]) -> u32 = if big_endian {
        u32::from_be_bytes
    } else {
        u32::from_le_bytes
    };
    let text = body
        .chunks_exact(4)
        .map(|unit| {
            let cp = read_unit([unit[0], unit[1], unit[2], unit[3]]);
            char::from_u32(cp).ok_or_else(|| {
                format!("invalid UTF-32 scalar U+{cp:X} in working tree file '{rel_path}'")
            })
        })
        .collect::<Result<String, String>>()?;
    Ok(text.into_bytes())
}
991
992fn decode_working_tree_bytes_to_utf8(
993    src: &[u8],
994    rel_path: &str,
995    enc_label: &str,
996) -> Result<Vec<u8>, String> {
997    let label = enc_label.trim();
998    if label.is_empty() {
999        return Ok(src.to_vec());
1000    }
1001    let lower = label.replace('_', "-").to_ascii_lowercase();
1002
1003    let (cow, _used_enc, had_errors) = match lower.as_str() {
1004        "utf-16le-bom" => {
1005            let body = if src.len() >= 2 && src.starts_with(&[0xFF, 0xFE]) {
1006                &src[2..]
1007            } else {
1008                src
1009            };
1010            encoding_rs::UTF_16LE.decode(body)
1011        }
1012        // Git `UTF-16` requires a BOM; `UTF-16BE` / `UTF-16LE` are raw (BOM prohibited on add).
1013        "utf-16" => {
1014            if src.len() >= 2 && src.starts_with(&[0xFE, 0xFF]) {
1015                encoding_rs::UTF_16BE.decode(&src[2..])
1016            } else if src.len() >= 2 && src.starts_with(&[0xFF, 0xFE]) {
1017                encoding_rs::UTF_16LE.decode(&src[2..])
1018            } else {
1019                return Err(format!(
1020                    "missing byte order mark for UTF-16 working tree file '{rel_path}'"
1021                ));
1022            }
1023        }
1024        "utf-16be" => encoding_rs::UTF_16BE.decode(src),
1025        "utf-16le" => encoding_rs::UTF_16LE.decode(src),
1026        "utf-32" => {
1027            let (body, big_endian) = if src.len() >= 4 && src.starts_with(&[0, 0, 0xFE, 0xFF]) {
1028                (&src[4..], true)
1029            } else if src.len() >= 4 && src.starts_with(&[0xFF, 0xFE, 0, 0]) {
1030                (&src[4..], false)
1031            } else {
1032                return Err(format!(
1033                    "missing byte order mark for UTF-32 working tree file '{rel_path}'"
1034                ));
1035            };
1036            return decode_utf32_body_to_utf8_bytes(body, rel_path, big_endian);
1037        }
1038        "utf-32be" => return decode_utf32_body_to_utf8_bytes(src, rel_path, true),
1039        "utf-32le" => return decode_utf32_body_to_utf8_bytes(src, rel_path, false),
1040        _ => {
1041            let Some(enc) = crate::commit_encoding::resolve(label) else {
1042                return Err(format!(
1043                    "unknown working-tree-encoding '{label}' for '{rel_path}'"
1044                ));
1045            };
1046            if enc == UTF_8 {
1047                return Ok(src.to_vec());
1048            }
1049            enc.decode(src)
1050        }
1051    };
1052
1053    if had_errors {
1054        return Err(format!(
1055            "failed to decode '{rel_path}' from working-tree-encoding {label}"
1056        ));
1057    }
1058    Ok(cow.into_owned().into_bytes())
1059}
1060
1061fn encode_utf8_blob_to_working_tree_bytes(
1062    src: &[u8],
1063    rel_path: &str,
1064    enc_label: &str,
1065) -> Result<Vec<u8>, String> {
1066    let label = enc_label.trim();
1067    if label.is_empty() {
1068        return Ok(src.to_vec());
1069    }
1070    let s = std::str::from_utf8(src).map_err(|_| {
1071        format!("failed to encode '{rel_path}' from UTF-8: blob is not valid UTF-8")
1072    })?;
1073    let lower = label.replace('_', "-").to_ascii_lowercase();
1074
1075    match lower.as_str() {
1076        "utf-16le-bom" => {
1077            let mut out = vec![0xFF_u8, 0xFE_u8];
1078            out.extend(utf16_scalar_iter_to_le_bytes(s.encode_utf16()));
1079            Ok(out)
1080        }
1081        // Bare `UTF-16` in Git is BOM + UTF-16; GNU iconv `-t UTF-16` emits UTF-16LE + LE BOM
1082        // (`FF FE`), which upstream tests expect (t0028 / t2082).
1083        "utf-16" => {
1084            let mut out = vec![0xFF_u8, 0xFE_u8];
1085            out.extend(utf16_scalar_iter_to_le_bytes(s.encode_utf16()));
1086            Ok(out)
1087        }
1088        "utf-16be" => {
1089            let mut out = vec![0xFE_u8, 0xFF_u8];
1090            out.extend(utf16_scalar_iter_to_be_bytes(s.encode_utf16()));
1091            Ok(out)
1092        }
1093        "utf-16le" => Ok(utf16_scalar_iter_to_le_bytes(s.encode_utf16())),
1094        "utf-32" | "utf-32be" => {
1095            let mut out = vec![0_u8, 0_u8, 0xFE_u8, 0xFF_u8];
1096            out.extend(utf32_chars_to_be_bytes(s));
1097            Ok(out)
1098        }
1099        "utf-32le" => {
1100            let mut out = vec![0xFF_u8, 0xFE_u8, 0_u8, 0_u8];
1101            out.extend(utf32_chars_to_le_bytes(s));
1102            Ok(out)
1103        }
1104        _ => {
1105            let Some(enc) = crate::commit_encoding::resolve(label) else {
1106                return Err(format!(
1107                    "unknown working-tree-encoding '{label}' for '{rel_path}'"
1108                ));
1109            };
1110            if enc == UTF_8 {
1111                return Ok(src.to_vec());
1112            }
1113            let (cow, _, had_errors) = enc.encode(s);
1114            if had_errors {
1115                return Err(format!(
1116                    "failed to encode '{rel_path}' from UTF-8 to {label}"
1117                ));
1118            }
1119            Ok(cow.into_owned())
1120        }
1121    }
1122}
1123
1124// ---------------------------------------------------------------------------
1125// Input (add / clean) direction
1126// ---------------------------------------------------------------------------
1127
1128/// Convert data for storage in the index/object database (the "clean" direction).
1129///
1130/// This handles:
1131/// 1. Clean filter execution
1132/// 2. CRLF → LF conversion based on config + attributes
1133/// 3. safecrlf checking
1134///
1135/// Returns `Ok(data)` on success, or an error if safecrlf rejects it.
1136pub fn convert_to_git(
1137    data: &[u8],
1138    rel_path: &str,
1139    conv: &ConversionConfig,
1140    file_attrs: &FileAttrs,
1141) -> Result<Vec<u8>, String> {
1142    convert_to_git_with_opts(
1143        data,
1144        rel_path,
1145        conv,
1146        file_attrs,
1147        ConvertToGitOpts::default(),
1148    )
1149}
1150
1151/// Like [`convert_to_git`] with Git-compatible safer-autocrlf index handling.
1152pub fn convert_to_git_with_opts(
1153    data: &[u8],
1154    rel_path: &str,
1155    conv: &ConversionConfig,
1156    file_attrs: &FileAttrs,
1157    opts: ConvertToGitOpts<'_>,
1158) -> Result<Vec<u8>, String> {
1159    let mut buf = data.to_vec();
1160
1161    // 1. Run clean filter if configured (long-running `process` overrides clean command)
1162    if let Some(ref proc_cmd) = file_attrs.filter_process {
1163        let name = file_attrs.filter_driver_name.as_deref().unwrap_or_default();
1164        buf = apply_process_clean(proc_cmd, rel_path, &buf).map_err(|_e| {
1165            if file_attrs.filter_clean_required {
1166                format!("fatal: {rel_path}: clean filter '{name}' failed")
1167            } else {
1168                format!("clean filter failed: {_e}")
1169            }
1170        })?;
1171    } else {
1172        match file_attrs.filter_clean.as_ref() {
1173            Some(clean_cmd) => {
1174                buf = run_filter(clean_cmd, &buf, rel_path).map_err(|e| {
1175                    let name = file_attrs.filter_driver_name.as_deref().unwrap_or_default();
1176                    if file_attrs.filter_clean_required {
1177                        format!("fatal: {rel_path}: clean filter '{name}' failed")
1178                    } else {
1179                        format!("clean filter failed: {e}")
1180                    }
1181                })?;
1182            }
1183            None => {
1184                if file_attrs.filter_clean_required {
1185                    let name = file_attrs.filter_driver_name.as_deref().unwrap_or_default();
1186                    return Err(format!("fatal: {rel_path}: clean filter '{name}' failed"));
1187                }
1188            }
1189        }
1190    }
1191
1192    // 2. working-tree-encoding: working tree bytes → UTF-8 for the object DB (Git `encode_to_git`).
1193    if let Some(ref enc) = file_attrs.working_tree_encoding {
1194        buf = decode_working_tree_bytes_to_utf8(&buf, rel_path, enc)?;
1195    }
1196
1197    // 3. Determine if we should do CRLF→LF conversion
1198    let would_convert = would_convert_on_input(conv, file_attrs, &buf);
1199
1200    let mut convert_crlf_into_lf = would_convert && has_crlf(&buf);
1201    if convert_crlf_into_lf
1202        && clean_uses_autocrlf_index_guard(file_attrs, conv)
1203        && !opts.renormalize
1204        && opts.index_blob.is_some_and(has_crlf_in_index_blob)
1205    {
1206        convert_crlf_into_lf = false;
1207    }
1208
1209    // 4. safecrlf check — Git simulates clean then smudge (`check_global_conv_flags_eol`).
1210    if would_convert && opts.check_safecrlf {
1211        check_safecrlf_roundtrip(conv, file_attrs, &buf, rel_path, convert_crlf_into_lf)?;
1212    }
1213
1214    // 5. Actually convert CRLF → LF if the file has CRLFs
1215    if convert_crlf_into_lf {
1216        buf = crlf_to_lf(&buf);
1217    }
1218
1219    Ok(buf)
1220}
1221
1222/// Decide whether CRLF/LF conversion is configured for this file on input.
1223/// Returns true if the file *would* be subject to conversion (even if no
1224/// actual bytes need changing).
1225fn would_convert_on_input(conv: &ConversionConfig, attrs: &FileAttrs, data: &[u8]) -> bool {
1226    match attrs.crlf_legacy {
1227        CrlfLegacyAttr::Unset => return false,
1228        CrlfLegacyAttr::Input => {
1229            if is_binary(data) {
1230                return false;
1231            }
1232            return true;
1233        }
1234        CrlfLegacyAttr::Crlf => {
1235            if attrs.text == TextAttr::Unset {
1236                return false;
1237            }
1238            if is_binary(data) {
1239                return false;
1240            }
1241            return true;
1242        }
1243        CrlfLegacyAttr::Unspecified => {}
1244    }
1245
1246    // If text is explicitly unset (-text or binary), never convert
1247    if attrs.text == TextAttr::Unset {
1248        return false;
1249    }
1250
1251    // If eol attr is set, this implies text mode
1252    if attrs.eol != EolAttr::Unspecified {
1253        if attrs.text == TextAttr::Auto && is_binary(data) {
1254            return false;
1255        }
1256        return true;
1257    }
1258
1259    // If text is explicitly set, always convert
1260    if attrs.text == TextAttr::Set {
1261        return true;
1262    }
1263
1264    if attrs.text == TextAttr::Auto {
1265        if is_binary(data) {
1266            return false;
1267        }
1268        return true;
1269    }
1270
1271    // No text attribute: fall back to core.autocrlf
1272    match conv.autocrlf {
1273        AutoCrlf::True | AutoCrlf::Input => {
1274            if is_binary(data) {
1275                return false;
1276            }
1277            true
1278        }
1279        AutoCrlf::False => false,
1280    }
1281}
1282
/// Prints Git's `core.safecrlf=warn` message for the CRLF→LF case to stderr.
fn eprint_safecrlf_warn_crlf_to_lf(rel_path: &str) {
    let warning = format!(
        "warning: in the working copy of '{rel_path}', CRLF will be replaced by LF the next time Git touches it"
    );
    eprintln!("{warning}");
}
1289
/// Prints Git's `core.safecrlf=warn` message for the LF→CRLF case to stderr.
fn eprint_safecrlf_warn_lf_to_crlf(rel_path: &str) {
    let warning = format!(
        "warning: in the working copy of '{rel_path}', LF will be replaced by CRLF the next time Git touches it"
    );
    eprintln!("{warning}");
}
1296
1297/// Git `convert.c` `check_global_conv_flags_eol` after simulating clean + smudge.
1298fn check_safecrlf_roundtrip(
1299    conv: &ConversionConfig,
1300    file_attrs: &FileAttrs,
1301    data: &[u8],
1302    rel_path: &str,
1303    convert_crlf_into_lf: bool,
1304) -> Result<(), String> {
1305    if conv.safecrlf == SafeCrlf::False {
1306        return Ok(());
1307    }
1308
1309    let old_stats = git_text_stat(data);
1310
1311    let mut new_stats = old_stats.clone();
1312    if convert_crlf_into_lf && new_stats.crlf > 0 {
1313        new_stats.lonelf += new_stats.crlf;
1314        new_stats.crlf = 0;
1315    }
1316    if will_convert_lf_to_crlf_from_stats(&new_stats, conv, file_attrs) {
1317        new_stats.crlf += new_stats.lonelf;
1318        new_stats.lonelf = 0;
1319    }
1320
1321    if old_stats.crlf > 0 && new_stats.crlf == 0 {
1322        let msg = format!("fatal: CRLF would be replaced by LF in {rel_path}");
1323        if conv.safecrlf == SafeCrlf::True {
1324            return Err(msg);
1325        }
1326        eprint_safecrlf_warn_crlf_to_lf(rel_path);
1327    } else if old_stats.lonelf > 0 && new_stats.lonelf == 0 {
1328        let msg = format!("fatal: LF would be replaced by CRLF in {rel_path}");
1329        if conv.safecrlf == SafeCrlf::True {
1330            return Err(msg);
1331        }
1332        eprint_safecrlf_warn_lf_to_crlf(rel_path);
1333    }
1334
1335    Ok(())
1336}
1337
/// Returns a copy of `data` with every CRLF pair collapsed to a single LF.
///
/// Lone CR and lone LF bytes are copied through unchanged.
pub fn crlf_to_lf(data: &[u8]) -> Vec<u8> {
    let mut out = Vec::with_capacity(data.len());
    let mut bytes = data.iter().copied().peekable();
    while let Some(byte) = bytes.next() {
        // Drop a CR only when the very next byte is LF; that LF is pushed on the next turn.
        let cr_of_pair = byte == b'\r' && bytes.peek() == Some(&b'\n');
        if !cr_of_pair {
            out.push(byte);
        }
    }
    out
}
1353
/// Returns a copy of `data` with every lone LF (one not preceded by CR) expanded to CRLF.
///
/// Existing CRLF pairs and lone CR bytes are copied through unchanged.
pub fn lf_to_crlf(data: &[u8]) -> Vec<u8> {
    let mut out = Vec::with_capacity(data.len() + data.len() / 10);
    let mut previous: Option<u8> = None;
    for &byte in data {
        if byte == b'\n' && previous != Some(b'\r') {
            out.push(b'\r');
        }
        out.push(byte);
        previous = Some(byte);
    }
    out
}
1369
1370// ---------------------------------------------------------------------------
1371// Output (checkout / smudge) direction
1372// ---------------------------------------------------------------------------
1373
1374/// Convert data from the object database for writing to the working tree
1375/// (the "smudge" direction).
1376///
1377/// This handles (Git `convert_to_working_tree_ca_internal` order):
1378/// 1. Ident keyword expansion
1379/// 2. LF → CRLF conversion based on config + attributes
1380/// 3. `working-tree-encoding` (UTF-8 blob → working tree bytes)
1381/// 4. Smudge filter execution
1382///
1383/// Returns `Ok(None)` when the process filter returned `status=delayed` and `delayed_checkout` was
1384/// provided (Git `delayed_checkout`); the path is queued for [`crate::filter_process::DelayedProcessCheckout::finish`].
1385pub fn convert_to_worktree(
1386    data: &[u8],
1387    rel_path: &str,
1388    conv: &ConversionConfig,
1389    file_attrs: &FileAttrs,
1390    oid_hex: Option<&str>,
1391    smudge_meta: Option<&FilterSmudgeMeta>,
1392    delayed_checkout: Option<&mut crate::filter_process::DelayedProcessCheckout>,
1393) -> Result<Option<Vec<u8>>, String> {
1394    let mut buf = data.to_vec();
1395
1396    // 1. Ident expansion
1397    if file_attrs.ident {
1398        if let Some(oid) = oid_hex {
1399            buf = expand_ident(&buf, oid);
1400        }
1401    }
1402
1403    let can_delay_smudge = delayed_checkout.is_some()
1404        && file_attrs.working_tree_encoding.is_none()
1405        && !file_attrs.ident
1406        && file_attrs
1407            .filter_process
1408            .as_deref()
1409            .is_some_and(|c| !c.is_empty())
1410        && !should_convert_to_crlf(conv, file_attrs, &buf)
1411        && file_attrs
1412            .filter_process
1413            .as_deref()
1414            .is_some_and(crate::filter_process::process_filter_supports_delay);
1415
1416    // 2. LF→CRLF for working tree
1417    let should_convert = should_convert_to_crlf(conv, file_attrs, &buf);
1418    if should_convert {
1419        buf = lf_to_crlf(&buf);
1420    }
1421
1422    // 3. working-tree-encoding (Git `encode_to_worktree`)
1423    if let Some(ref enc) = file_attrs.working_tree_encoding {
1424        buf = encode_utf8_blob_to_working_tree_bytes(&buf, rel_path, enc)?;
1425    }
1426
1427    // 4. Smudge filter — process driver overrides shell smudge
1428    let driver = file_attrs.filter_driver_name.as_deref().unwrap_or("");
1429    if let Some(ref proc_cmd) = file_attrs.filter_process {
1430        let smudge_out =
1431            apply_process_smudge(proc_cmd, rel_path, &buf, smudge_meta, can_delay_smudge).map_err(
1432                |_e| {
1433                    if file_attrs.filter_smudge_required {
1434                        format!("fatal: {rel_path}: smudge filter {driver} failed")
1435                    } else {
1436                        _e
1437                    }
1438                },
1439            )?;
1440        let Some(out) = smudge_out else {
1441            let Some(q) = delayed_checkout else {
1442                return Err(format!(
1443                    "internal error: delayed smudge without checkout queue for {rel_path}"
1444                ));
1445            };
1446            q.push_delayed(
1447                proc_cmd.clone(),
1448                rel_path.to_string(),
1449                smudge_meta.cloned().unwrap_or_default(),
1450            );
1451            return Ok(None);
1452        };
1453        buf = out;
1454    } else {
1455        match file_attrs.filter_smudge.as_ref() {
1456            Some(smudge_cmd) => match run_filter(smudge_cmd, &buf, rel_path) {
1457                Ok(filtered) => buf = filtered,
1458                Err(_e) => {
1459                    if file_attrs.filter_smudge_required {
1460                        return Err(format!("fatal: {rel_path}: smudge filter {driver} failed"));
1461                    }
1462                }
1463            },
1464            None => {
1465                if file_attrs.filter_smudge_required {
1466                    return Err(format!("fatal: {rel_path}: smudge filter {driver} failed"));
1467                }
1468            }
1469        }
1470    }
1471
1472    Ok(Some(buf))
1473}
1474
1475/// Like [`convert_to_worktree`] without delayed-checkout queueing (always materializes or errors).
1476#[must_use]
1477pub fn convert_to_worktree_eager(
1478    data: &[u8],
1479    rel_path: &str,
1480    conv: &ConversionConfig,
1481    file_attrs: &FileAttrs,
1482    oid_hex: Option<&str>,
1483    smudge_meta: Option<&FilterSmudgeMeta>,
1484) -> Result<Vec<u8>, String> {
1485    match convert_to_worktree(data, rel_path, conv, file_attrs, oid_hex, smudge_meta, None)? {
1486        Some(v) => Ok(v),
1487        None => Err(format!(
1488            "internal error: unexpected delayed smudge for {rel_path}"
1489        )),
1490    }
1491}
1492
1493/// Decide whether to convert LF→CRLF on output (working tree / smudge direction).
1494#[must_use]
1495pub fn should_convert_to_crlf(conv: &ConversionConfig, attrs: &FileAttrs, data: &[u8]) -> bool {
1496    match attrs.crlf_legacy {
1497        CrlfLegacyAttr::Unset | CrlfLegacyAttr::Input => return false,
1498        CrlfLegacyAttr::Crlf => {
1499            if attrs.text == TextAttr::Unset {
1500                return false;
1501            }
1502            // Legacy `crlf` (set) forces CRLF on checkout (even for paths Git
1503            // would otherwise treat as binary; see t0020 "t* crlf" + `three`).
1504            return true;
1505        }
1506        CrlfLegacyAttr::Unspecified => {}
1507    }
1508
1509    // If text is explicitly unset, never convert
1510    if attrs.text == TextAttr::Unset {
1511        return false;
1512    }
1513
1514    // If there's an explicit eol attribute
1515    if attrs.eol != EolAttr::Unspecified {
1516        if attrs.text == TextAttr::Auto && is_binary(data) {
1517            return false;
1518        }
1519        if attrs.eol != EolAttr::Crlf {
1520            return false;
1521        }
1522        // `text=auto` + `eol=crlf` → Git `CRLF_AUTO_CRLF` (safe mixed handling).
1523        if attrs.text == TextAttr::Auto {
1524            return auto_crlf_should_smudge_lf_to_crlf(data);
1525        }
1526        // Explicit `eol=crlf` with `text` set, etc. → `CRLF_TEXT_CRLF` (always normalize).
1527        return true;
1528    }
1529
1530    // If text is explicitly set, use eol config
1531    if attrs.text == TextAttr::Set {
1532        return output_eol_is_crlf(conv);
1533    }
1534
1535    if attrs.text == TextAttr::Auto {
1536        if is_binary(data) {
1537            return false;
1538        }
1539        if !output_eol_is_crlf(conv) {
1540            return false;
1541        }
1542        return auto_crlf_should_smudge_lf_to_crlf(data);
1543    }
1544
1545    // No text attribute: fall back to core.autocrlf
1546    match conv.autocrlf {
1547        AutoCrlf::True => {
1548            if is_binary(data) {
1549                return false;
1550            }
1551            auto_crlf_should_smudge_lf_to_crlf(data)
1552        }
1553        AutoCrlf::Input | AutoCrlf::False => false,
1554    }
1555}
1556
1557/// Whether the output EOL should be CRLF based on config.
1558fn output_eol_is_crlf(conv: &ConversionConfig) -> bool {
1559    // Git `text_eol_is_crlf`: autocrlf=input forces LF output before `core.eol` is consulted.
1560    if conv.autocrlf == AutoCrlf::Input {
1561        return false;
1562    }
1563    if conv.autocrlf == AutoCrlf::True {
1564        return true;
1565    }
1566    match conv.eol {
1567        CoreEol::Crlf => true,
1568        CoreEol::Lf => false,
1569        CoreEol::Native => {
1570            // On Unix, native is LF
1571            cfg!(windows)
1572        }
1573    }
1574}
1575
1576/// Expand `$Id$` → `$Id: <oid>$` in data.
1577///
1578/// Matches Git's `ident_to_worktree` in `convert.c`: same-line `$` terminator, and foreign
1579/// idents (internal spaces before the closing `$`) are left unchanged.
1580fn expand_ident(data: &[u8], oid: &str) -> Vec<u8> {
1581    if !count_ident_regions(data) {
1582        return data.to_vec();
1583    }
1584    let replacement = format!("$Id: {oid} $");
1585    let mut out = Vec::with_capacity(data.len() + 60);
1586    let mut i = 0;
1587    while i < data.len() {
1588        if data[i] != b'$' {
1589            out.push(data[i]);
1590            i += 1;
1591            continue;
1592        }
1593        if i + 3 > data.len() || data[i + 1] != b'I' || data[i + 2] != b'd' {
1594            out.push(data[i]);
1595            i += 1;
1596            continue;
1597        }
1598        let after_id = i + 3;
1599        let ch = data.get(after_id).copied();
1600        match ch {
1601            Some(b'$') => {
1602                out.extend_from_slice(replacement.as_bytes());
1603                i = after_id + 1;
1604            }
1605            Some(b':') => {
1606                let rest = &data[after_id + 1..];
1607                let line_end = rest
1608                    .iter()
1609                    .position(|&b| b == b'\n' || b == b'\r')
1610                    .unwrap_or(rest.len());
1611                let line = &rest[..line_end];
1612                let Some(dollar_rel) = line.iter().position(|&b| b == b'$') else {
1613                    out.push(data[i]);
1614                    i += 1;
1615                    continue;
1616                };
1617                if line[..dollar_rel].contains(&b'\n') {
1618                    out.push(data[i]);
1619                    i += 1;
1620                    continue;
1621                }
1622                // Foreign ident (Git `ident_to_worktree`): first space in the payload after the
1623                // byte following `:` must not be the last character before `$`.
1624                let payload = &line[..dollar_rel];
1625                let foreign = payload.len() > 1
1626                    && payload[1..]
1627                        .iter()
1628                        .position(|&b| b == b' ')
1629                        .is_some_and(|rel| {
1630                            let pos = 1 + rel;
1631                            pos < payload.len().saturating_sub(1)
1632                        });
1633                if foreign {
1634                    out.push(data[i]);
1635                    i += 1;
1636                    continue;
1637                }
1638                out.extend_from_slice(replacement.as_bytes());
1639                i = after_id + 1 + dollar_rel + 1;
1640            }
1641            _ => {
1642                out.push(data[i]);
1643                i += 1;
1644            }
1645        }
1646    }
1647    out
1648}
1649
/// Whether the buffer contains at least one `$Id$` or same-line `$Id: ... $` region
/// (Git `count_ident`).
///
/// For the `$Id:` form the closing `$` must appear before any LF or CR.
fn count_ident_regions(data: &[u8]) -> bool {
    data.iter()
        .enumerate()
        .filter(|&(_, &byte)| byte == b'$')
        .any(|(start, _)| {
            let tail = &data[start..];
            if !tail.starts_with(b"$Id") {
                return false;
            }
            match tail.get(3).copied() {
                Some(b'$') => true,
                // The first of `$`, LF or CR after the colon must be the closing `$`.
                Some(b':') => {
                    tail[4..]
                        .iter()
                        .find(|&&b| matches!(b, b'$' | b'\n' | b'\r'))
                        == Some(&b'$')
                }
                _ => false,
            }
        })
}
1688
/// Collapses every same-line `$Id: ... $` back to the bare `$Id$` keyword.
///
/// A `$Id:` whose closing `$` does not appear before the next LF or CR is copied unchanged.
pub fn collapse_ident(data: &[u8]) -> Vec<u8> {
    let mut out = Vec::with_capacity(data.len());
    let mut rest = data;
    while let Some((&first, tail)) = rest.split_first() {
        if let Some(after) = rest.strip_prefix(b"$Id:") {
            // The first of `$`, LF or CR decides whether the keyword is closed on this line.
            let first_stop = after
                .iter()
                .position(|&b| matches!(b, b'$' | b'\n' | b'\r'));
            if let Some(end) = first_stop.filter(|&idx| after[idx] == b'$') {
                out.extend_from_slice(b"$Id$");
                rest = &after[end + 1..];
                continue;
            }
        }
        out.push(first);
        rest = tail;
    }
    out
}
1712
/// Wraps `s` in single quotes for the shell, in the manner of Git's `sq_quote_buf`: each
/// embedded `'` is written as `'\''` (close quote, escaped quote, reopen quote).
fn sq_quote_buf(s: &str) -> String {
    let escaped = s.replace('\'', "'\\''");
    format!("'{escaped}'")
}
1727
1728/// Expand Git filter command placeholders: `%%` → `%`, `%f` → quoted repository-relative path.
1729fn expand_filter_command(cmd: &str, rel_path: &str) -> String {
1730    let mut out = String::with_capacity(cmd.len() + rel_path.len() + 8);
1731    let mut chars = cmd.chars().peekable();
1732    while let Some(c) = chars.next() {
1733        if c == '%' {
1734            match chars.peek() {
1735                Some('%') => {
1736                    chars.next();
1737                    out.push('%');
1738                }
1739                Some('f') => {
1740                    chars.next();
1741                    out.push_str(&sq_quote_buf(rel_path));
1742                }
1743                _ => out.push('%'),
1744            }
1745        } else {
1746            out.push(c);
1747        }
1748    }
1749    out
1750}
1751
1752/// Run a filter command, piping data through stdin→stdout.
1753fn run_filter(cmd: &str, data: &[u8], rel_path: &str) -> Result<Vec<u8>, std::io::Error> {
1754    let expanded = expand_filter_command(cmd, rel_path);
1755    let mut child = Command::new("sh")
1756        .arg("-c")
1757        .arg(&expanded)
1758        .stdin(Stdio::piped())
1759        .stdout(Stdio::piped())
1760        .stderr(Stdio::inherit())
1761        .spawn()?;
1762
1763    use std::io::{ErrorKind, Write};
1764    if let Some(ref mut stdin) = child.stdin {
1765        if let Err(e) = stdin.write_all(data) {
1766            // Match Git: if the filter exits without reading stdin, ignore EPIPE.
1767            if e.kind() != ErrorKind::BrokenPipe {
1768                return Err(e);
1769            }
1770        }
1771    }
1772    drop(child.stdin.take());
1773
1774    let output = child.wait_with_output()?;
1775    if !output.status.success() {
1776        return Err(std::io::Error::other(format!(
1777            "filter command exited with status {}",
1778            output.status
1779        )));
1780    }
1781
1782    Ok(output.stdout)
1783}
1784
// `AttrRule` itself is treated as an internal type; the rule list is exposed through
// `load_gitattributes`, and the public API passes that `Vec` on to `get_file_attrs`.
1787
/// Type alias for a loaded set of gitattributes rules (a `Vec<AttrRule>`).
///
/// Callers are meant to treat the value as opaque, even though the alias itself is
/// transparent to the compiler.
pub type GitAttributes = Vec<AttrRule>;
1790
#[cfg(test)]
mod tests {
    use super::*;

    /// Conversion settings shared by the smudge tests: `core.autocrlf=true`,
    /// `core.eol=lf`, `core.safecrlf=false`.
    fn autocrlf_true_config() -> ConversionConfig {
        ConversionConfig {
            autocrlf: AutoCrlf::True,
            eol: CoreEol::Lf,
            safecrlf: SafeCrlf::False,
        }
    }

    #[test]
    fn test_crlf_to_lf() {
        // CRLF pairs collapse to a single LF.
        let two_lines = crlf_to_lf(b"hello\r\nworld\r\n");
        assert_eq!(two_lines, b"hello\nworld\n");

        // Input that is already LF-only passes through unchanged.
        let already_lf = crlf_to_lf(b"hello\nworld\n");
        assert_eq!(already_lf, b"hello\nworld\n");

        let single_line = crlf_to_lf(b"hello\r\n");
        assert_eq!(single_line, b"hello\n");
    }

    #[test]
    fn test_lf_to_crlf() {
        // Bare LFs gain a CR.
        let from_lf = lf_to_crlf(b"hello\nworld\n");
        assert_eq!(from_lf, b"hello\r\nworld\r\n");

        // Existing CRLF pairs are not doubled up.
        let from_crlf = lf_to_crlf(b"hello\r\nworld\r\n");
        assert_eq!(from_crlf, b"hello\r\nworld\r\n");
    }

    #[test]
    fn test_has_crlf() {
        let with_crlf = has_crlf(b"hello\r\nworld");
        let without_crlf = has_crlf(b"hello\nworld");
        assert!(with_crlf);
        assert!(!without_crlf);
    }

    #[test]
    fn smudge_mixed_line_endings_unchanged_with_autocrlf_true() {
        // A blob that mixes LF and CRLF lines must come out of the smudge path untouched.
        let mixed: Vec<u8> = [
            b"Oh\n".as_slice(),
            b"here\n",
            b"is\n",
            b"CRLF\r\n",
            b"in\n",
            b"text\n",
        ]
        .concat();
        let settings = autocrlf_true_config();
        let file_attrs = FileAttrs::default();

        let smudged =
            convert_to_worktree_eager(&mixed, "mixed", &settings, &file_attrs, None, None)
                .unwrap();

        assert_eq!(smudged, mixed);
    }

    #[test]
    fn smudge_lf_only_gets_crlf_with_autocrlf_true() {
        // A pure-LF blob is converted to CRLF on checkout.
        let lf_only = b"a\nb\n";
        let settings = autocrlf_true_config();
        let file_attrs = FileAttrs::default();

        let smudged =
            convert_to_worktree_eager(lf_only, "x", &settings, &file_attrs, None, None).unwrap();

        assert_eq!(smudged, b"a\r\nb\r\n");
    }

    #[test]
    fn test_is_binary() {
        // A NUL byte marks the content as binary.
        let with_nul = is_binary(b"hello\0world");
        let plain_text = is_binary(b"hello world");
        assert!(with_nul);
        assert!(!plain_text);
    }

    #[test]
    fn attr_dir_only_pattern_does_not_match_same_named_file() {
        let parsed = parse_gitattributes_content("ignored-only-if-dir/ export-ignore\n");
        let dir_rule = &parsed[0];

        // The trailing slash makes this a directory-only, basename-only rule.
        assert!(dir_rule.must_be_dir);
        assert!(dir_rule.basename_only);

        // A regular file with the same basename must not match...
        let matches_nested_file =
            attr_rule_matches(dir_rule, "not-ignored-dir/ignored-only-if-dir", false);
        assert!(!matches_nested_file);

        // ...while a directory of that name does.
        let matches_directory = attr_rule_matches(dir_rule, "ignored-only-if-dir", true);
        assert!(matches_directory);
    }

    #[test]
    fn test_expand_collapse_ident() {
        // Expanding and then collapsing `$Id$` round-trips.
        let collapsed_form = b"$Id$";

        let expanded_form = expand_ident(collapsed_form, "abc123");
        assert_eq!(expanded_form, b"$Id: abc123 $");

        let round_tripped = collapse_ident(&expanded_form);
        assert_eq!(round_tripped, b"$Id$");
    }

    #[test]
    fn expand_ident_does_not_span_lines_for_partial_keyword() {
        // The unterminated keyword on the first line is left alone; only the second expands.
        let input = b"$Id: NoTerminatingSymbol\n$Id: deadbeef $\n";
        let output = expand_ident(input, "newoid");
        assert_eq!(output, b"$Id: NoTerminatingSymbol\n$Id: newoid $\n");
    }

    #[test]
    fn expand_ident_preserves_foreign_id_with_internal_spaces() {
        // An `$Id: ... $` whose payload contains spaces is kept as-is.
        let input = b"$Id: Foreign Commit With Spaces $\n";
        let output = expand_ident(input, "abc");
        assert_eq!(output, input);
    }

    #[test]
    fn expand_filter_command_percent_f_quotes_path() {
        // `%f` is replaced by the shell-quoted path, including embedded single quotes.
        let with_path = expand_filter_command("sh ./x.sh %f --extra", "name  with 'sq'");
        assert_eq!(with_path, r"sh ./x.sh 'name  with '\''sq'\''' --extra");

        // `%%` collapses to a literal percent sign.
        let with_percent = expand_filter_command("a %% b", "p");
        assert_eq!(with_percent, "a % b");
    }
}
grit_lib/crlf.rs

grit_lib/
crlf.rs