// grit_lib/crlf.rs

//! CRLF / EOL conversion and clean/smudge filter support.
//!
//! This module handles line-ending conversion when staging files (`git add`)
//! and checking out files (`git checkout`, `read-tree -u`, `checkout-index`).
//!
//! Config knobs:
//!   - `core.autocrlf` (true / input / false)
//!   - `core.eol` (lf / crlf / native)
//!   - `core.safecrlf` (true / warn / false)
//!
//! Gitattributes:
//!   - `text` / `text=auto` / `-text` / `binary`
//!   - `eol=lf` / `eol=crlf`
//!   - `filter=<name>` (with `filter.<name>.clean` / `filter.<name>.smudge`)
//!   - `ident` keyword expansion
16
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};

use encoding_rs::UTF_8;

use crate::config::ConfigSet;
use crate::filter_process::{apply_process_clean, apply_process_smudge, FilterSmudgeMeta};
use crate::objects::{parse_tree, ObjectId, ObjectKind};
use crate::odb::Odb;
26
/// Parsed value of the `core.autocrlf` configuration key.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AutoCrlf {
    /// A truthy setting (`true` / `yes` / `on` / `1`).
    True,
    /// The literal value `input`.
    Input,
    /// Any other value, or the key is not configured at all.
    False,
}
34
/// Parsed value of the `core.eol` configuration key.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CoreEol {
    /// `core.eol=lf`.
    Lf,
    /// `core.eol=crlf`.
    Crlf,
    /// `core.eol=native`, an unrecognized value, or the key is unset.
    Native,
}
42
/// Parsed value of the `core.safecrlf` configuration key.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SafeCrlf {
    /// A truthy setting (`true` / `yes` / `on` / `1`).
    True,
    /// The literal value `warn`; also used when the key is unset.
    Warn,
    /// Any other configured value.
    False,
}
50
/// State of the `text` gitattribute for one path.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TextAttr {
    /// `text` — always treat the file as text.
    Set,
    /// `text=auto` — decide from the content.
    Auto,
    /// `-text` or `binary` — never convert line endings.
    Unset,
    /// No `text` attribute applies (or its value was not recognized).
    Unspecified,
}
63
/// State of the `eol` gitattribute for one path.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EolAttr {
    /// `eol=lf`.
    Lf,
    /// `eol=crlf`.
    Crlf,
    /// No `eol` attribute applies (or its value was not recognized).
    Unspecified,
}
71
/// Legacy `crlf` gitattribute (deprecated in Git; still honored for EOL conversion).
///
/// The default is [`CrlfLegacyAttr::Unspecified`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum CrlfLegacyAttr {
    /// No `crlf` attribute applies.
    #[default]
    Unspecified,
    /// `-crlf` — disable CRLF conversion.
    Unset,
    /// `crlf=input` — normalize to LF in the object database; no CRLF on checkout.
    Input,
    /// Bare `crlf` (set) — force CRLF on checkout for text files.
    Crlf,
}
84
/// State of the `merge` gitattribute for one path.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum MergeAttr {
    /// No merge attribute specified (a bare `merge` is also reported this way).
    Unspecified,
    /// `-merge` — treat as binary/non-text merge.
    Unset,
    /// `merge=<driver>` — use the named merge driver.
    Driver(String),
}
95
/// How the `diff` gitattribute affects diff output for one path.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DiffAttr {
    /// No `diff` attribute (use heuristics / default).
    Unspecified,
    /// `-diff` / `diff=unset` / `binary` — treat as binary for diff purposes.
    Unset,
    /// Bare `diff` (set) — force textual diff even when the blob contains NUL.
    Set,
    /// `diff=<driver>` — use the named driver (e.g. for textconv).
    Driver(String),
}
108
109/// Per-file attributes relevant to conversion.
110#[derive(Debug, Clone)]
111pub struct FileAttrs {
112    pub text: TextAttr,
113    pub eol: EolAttr,
114    /// Effect of the `diff` gitattribute on diff output.
115    pub diff_attr: DiffAttr,
116    /// `export-ignore` — omit from `git archive`.
117    pub export_ignore: bool,
118    /// `export-subst` — expand `$Format:` placeholders using the archived commit.
119    pub export_subst: bool,
120    pub filter_clean: Option<String>,
121    pub filter_smudge: Option<String>,
122    /// `filter.<name>.process` — long-running filter (takes precedence over clean/smudge commands).
123    pub filter_process: Option<String>,
124    /// Driver name from the active `filter=<name>` gitattribute (for error messages).
125    pub filter_driver_name: Option<String>,
126    /// Whether `filter.<name>.required` is set for this path's filter driver.
127    pub filter_smudge_required: bool,
128    /// Same config key as smudge; clean direction fails when unset if true.
129    pub filter_clean_required: bool,
130    pub ident: bool,
131    pub merge: MergeAttr,
132    pub conflict_marker_size: Option<String>,
133    /// Working tree encoding (e.g. "utf-16") — content is converted to UTF-8 on add.
134    pub working_tree_encoding: Option<String>,
135    /// Legacy `crlf` / `-crlf` / `crlf=input` from `.gitattributes`.
136    pub crlf_legacy: CrlfLegacyAttr,
137    /// `whitespace` attribute value: `None` if unset, `Some("set")` for bare `whitespace`,
138    /// `Some("unset")` for `-whitespace`, or `Some("trailing,...")` for `whitespace=...`.
139    pub whitespace: Option<String>,
140}
141
142impl Default for FileAttrs {
143    fn default() -> Self {
144        FileAttrs {
145            text: TextAttr::Unspecified,
146            eol: EolAttr::Unspecified,
147            diff_attr: DiffAttr::Unspecified,
148            export_ignore: false,
149            export_subst: false,
150            filter_clean: None,
151            filter_smudge: None,
152            filter_process: None,
153            filter_driver_name: None,
154            filter_smudge_required: false,
155            filter_clean_required: false,
156            ident: false,
157            merge: MergeAttr::Unspecified,
158            conflict_marker_size: None,
159            working_tree_encoding: None,
160            crlf_legacy: CrlfLegacyAttr::Unspecified,
161            whitespace: None,
162        }
163    }
164}
165
166/// Global conversion settings derived from config.
167#[derive(Debug, Clone)]
168pub struct ConversionConfig {
169    pub autocrlf: AutoCrlf,
170    pub eol: CoreEol,
171    pub safecrlf: SafeCrlf,
172    /// `core.checkRoundtripEncoding` — comma/space separated encodings whose UTF-8 round trip is
173    /// verified when writing to the object DB. `None` keeps Git's default (`SHIFT-JIS`).
174    pub check_roundtrip_encoding: Option<String>,
175}
176
177impl ConversionConfig {
178    /// Load conversion settings from a ConfigSet.
179    pub fn from_config(config: &ConfigSet) -> Self {
180        let autocrlf = match config.get("core.autocrlf") {
181            Some(v) => match v.to_lowercase().as_str() {
182                "true" | "yes" | "on" | "1" => AutoCrlf::True,
183                "input" => AutoCrlf::Input,
184                _ => AutoCrlf::False,
185            },
186            None => AutoCrlf::False,
187        };
188
189        let eol = match config.get("core.eol") {
190            Some(v) => match v.to_lowercase().as_str() {
191                "crlf" => CoreEol::Crlf,
192                "lf" => CoreEol::Lf,
193                "native" => CoreEol::Native,
194                _ => CoreEol::Native,
195            },
196            None => CoreEol::Native,
197        };
198
199        let safecrlf = match config.get("core.safecrlf") {
200            Some(v) => match v.to_lowercase().as_str() {
201                "true" | "yes" | "on" | "1" => SafeCrlf::True,
202                "warn" => SafeCrlf::Warn,
203                _ => SafeCrlf::False,
204            },
205            // Git warns on round-trip EOL issues by default when unset.
206            None => SafeCrlf::Warn,
207        };
208
209        let check_roundtrip_encoding = config
210            .get("core.checkRoundtripEncoding")
211            .filter(|s| !s.is_empty());
212
213        ConversionConfig {
214            autocrlf,
215            eol,
216            safecrlf,
217            check_roundtrip_encoding,
218        }
219    }
220}
221
/// A parsed `.gitattributes` rule: one pattern plus the attributes it assigns.
#[derive(Debug, Clone)]
pub struct AttrRule {
    /// Glob text used for matching (trailing directory `/` stripped; see [`AttrRule::must_be_dir`]).
    pattern: String,
    /// When true, the source pattern ended with `/` and matches only directories (Git `PATTERN_FLAG_MUSTBEDIR`).
    must_be_dir: bool,
    /// When true, match only the path's final component (Git `PATTERN_FLAG_NODIR` / no `/` in the pattern body).
    basename_only: bool,
    /// `(name, value)` pairs in source order; `value` is `"set"`, `"unset"`, or the explicit
    /// text after `=`.
    attrs: Vec<(String, String)>,
}
233
234impl AttrRule {
235    /// Diff driver names assigned by this rule (`diff=<driver>`), excluding `set`/`unset`.
236    pub fn diff_drivers(&self) -> impl Iterator<Item = &str> + '_ {
237        self.attrs.iter().filter_map(|(name, value)| {
238            if name == "diff" && !value.is_empty() && value != "unset" && value != "set" {
239                Some(value.as_str())
240            } else {
241                None
242            }
243        })
244    }
245}
246
247/// Load .gitattributes from the worktree root.
248pub fn load_gitattributes(work_tree: &Path) -> Vec<AttrRule> {
249    let mut rules = Vec::new();
250
251    let root_attrs = work_tree.join(".gitattributes");
252    if let Ok(content) = std::fs::read_to_string(&root_attrs) {
253        parse_gitattributes(&content, &mut rules);
254    }
255
256    let info_attrs = work_tree.join(".git/info/attributes");
257    if let Ok(content) = std::fs::read_to_string(&info_attrs) {
258        parse_gitattributes(&content, &mut rules);
259    }
260
261    rules
262}
263
264/// Parse gitattributes content into attribute rules.
265///
266/// This is useful when attributes are sourced from non-worktree inputs
267/// (for example, tree objects selected by `--attr-source`).
268#[must_use]
269pub fn parse_gitattributes_content(content: &str) -> Vec<AttrRule> {
270    let mut rules = Vec::new();
271    parse_gitattributes(content, &mut rules);
272    rules
273}
274
275/// Load .gitattributes from the index (for use during checkout when
276/// the worktree file may not yet exist).
277pub fn load_gitattributes_from_index(
278    index: &crate::index::Index,
279    odb: &crate::odb::Odb,
280) -> Vec<AttrRule> {
281    let mut rules = Vec::new();
282
283    // Look for .gitattributes in the index (stage 0)
284    if let Some(entry) = index.get(b".gitattributes", 0) {
285        if let Ok(obj) = odb.read(&entry.oid) {
286            if let Ok(content) = String::from_utf8(obj.data) {
287                parse_gitattributes(&content, &mut rules);
288            }
289        }
290    }
291
292    rules
293}
294
295/// Load `.gitattributes` rules that apply to `rel_path`, including root and
296/// nested `dir/.gitattributes` along parent directories (Git-consistent order:
297/// root first, then each ancestor directory; later rules win in [`get_file_attrs`]).
298///
299/// Reads from the working tree when present, otherwise from a stage-0 index entry.
300pub fn load_gitattributes_for_checkout(
301    work_tree: &Path,
302    rel_path: &str,
303    index: &crate::index::Index,
304    odb: &crate::odb::Odb,
305) -> Vec<AttrRule> {
306    let mut rules = load_gitattributes(work_tree);
307
308    // Root `.gitattributes` may exist only in the index while the worktree file
309    // is missing (e.g. t0020 in-tree attributes after `rm -rf .gitattributes`).
310    if !work_tree.join(".gitattributes").exists() {
311        if let Some(entry) = index.get(b".gitattributes", 0) {
312            if let Ok(obj) = odb.read(&entry.oid) {
313                if let Ok(content) = String::from_utf8(obj.data) {
314                    parse_gitattributes(&content, &mut rules);
315                }
316            }
317        }
318    }
319
320    let path = Path::new(rel_path);
321    if let Some(parent) = path.parent() {
322        let mut accum = PathBuf::new();
323        for comp in parent.components() {
324            accum.push(comp);
325            let ga_rel = accum.join(".gitattributes");
326            let wt_ga = work_tree.join(&ga_rel);
327            if let Ok(content) = std::fs::read_to_string(&wt_ga) {
328                parse_gitattributes(&content, &mut rules);
329            } else {
330                let key = path_to_index_bytes(&ga_rel);
331                if let Some(entry) = index.get(&key, 0) {
332                    if let Ok(obj) = odb.read(&entry.oid) {
333                        if let Ok(content) = String::from_utf8(obj.data) {
334                            parse_gitattributes(&content, &mut rules);
335                        }
336                    }
337                }
338            }
339        }
340    }
341
342    rules
343}
344
345/// Load `.gitattributes` rules from `tree_oid` that can apply to `rel_path`.
346///
347/// `odb` supplies tree and blob objects, `tree_oid` is the root tree to read, and `rel_path` is the
348/// repository-relative path being matched.
349///
350/// Returns rules in root-to-leaf order. Missing, non-blob, or invalid UTF-8 `.gitattributes` entries
351/// are ignored, matching the best-effort behavior of the worktree loader.
352pub fn load_gitattributes_for_tree_path(
353    odb: &Odb,
354    tree_oid: &ObjectId,
355    rel_path: &str,
356) -> Vec<AttrRule> {
357    let mut rules = Vec::new();
358    load_gitattributes_blob_from_tree(odb, tree_oid, ".gitattributes", &mut rules);
359
360    let path = Path::new(rel_path);
361    if let Some(parent) = path.parent() {
362        let mut accum = PathBuf::new();
363        for comp in parent.components() {
364            accum.push(comp);
365            let ga_rel = accum.join(".gitattributes");
366            let ga_rel = ga_rel.to_string_lossy().replace('\\', "/");
367            load_gitattributes_blob_from_tree(odb, tree_oid, &ga_rel, &mut rules);
368        }
369    }
370
371    rules
372}
373
374fn load_gitattributes_blob_from_tree(
375    odb: &Odb,
376    tree_oid: &ObjectId,
377    ga_path: &str,
378    rules: &mut Vec<AttrRule>,
379) {
380    let Some(oid) = lookup_tree_path(odb, tree_oid, ga_path) else {
381        return;
382    };
383    let Ok(obj) = odb.read(&oid) else {
384        return;
385    };
386    if obj.kind != ObjectKind::Blob {
387        return;
388    }
389    if let Ok(content) = String::from_utf8(obj.data) {
390        parse_gitattributes(&content, rules);
391    }
392}
393
394fn lookup_tree_path(odb: &Odb, tree_oid: &ObjectId, rel_path: &str) -> Option<ObjectId> {
395    let mut current = *tree_oid;
396    let mut parts = rel_path.split('/').peekable();
397    while let Some(part) = parts.next() {
398        let obj = odb.read(&current).ok()?;
399        if obj.kind != ObjectKind::Tree {
400            return None;
401        }
402        let entries = parse_tree(&obj.data).ok()?;
403        let entry = entries
404            .iter()
405            .find(|entry| String::from_utf8_lossy(&entry.name) == part)?;
406        if parts.peek().is_none() {
407            return Some(entry.oid);
408        }
409        if entry.mode != 0o040000 {
410            return None;
411        }
412        current = entry.oid;
413    }
414    None
415}
416
/// Convert a repository-relative path into the byte key used for index lookups.
///
/// On Unix the path's raw bytes are returned unchanged. On other platforms the path is
/// converted lossily to UTF-8 and `\` separators are rewritten to `/`, matching the
/// normalization done in `load_gitattributes_for_tree_path`.
fn path_to_index_bytes(path: &Path) -> Vec<u8> {
    #[cfg(unix)]
    {
        use std::os::unix::ffi::OsStrExt;
        path.as_os_str().as_bytes().to_vec()
    }
    #[cfg(not(unix))]
    {
        // `PathBuf::join` produces `\` here; a key with `\` would never match.
        path.to_string_lossy().replace('\\', "/").into_bytes()
    }
}
428
429fn parse_gitattributes(content: &str, rules: &mut Vec<AttrRule>) {
430    for line in content.lines() {
431        let line = line.trim();
432        if line.is_empty() || line.starts_with('#') {
433            continue;
434        }
435
436        let mut parts = line.split_whitespace();
437        let raw_pattern = match parts.next() {
438            Some(p) => p,
439            None => continue,
440        };
441
442        let mut pat = raw_pattern.to_owned();
443        let mut must_be_dir = false;
444        if pat.ends_with('/') && pat.len() > 1 {
445            pat.pop();
446            must_be_dir = true;
447        }
448        let basename_only = !pat.contains('/');
449
450        let mut attrs = Vec::new();
451        for part in parts {
452            if part == "binary" {
453                attrs.push(("text".to_owned(), "unset".to_owned()));
454                attrs.push(("diff".to_owned(), "unset".to_owned()));
455            } else if let Some(rest) = part.strip_prefix('-') {
456                attrs.push((rest.to_owned(), "unset".to_owned()));
457            } else if let Some((key, val)) = part.split_once('=') {
458                attrs.push((key.to_owned(), val.to_owned()));
459            } else {
460                attrs.push((part.to_owned(), "set".to_owned()));
461            }
462        }
463
464        if !attrs.is_empty() {
465            rules.push(AttrRule {
466                pattern: pat,
467                must_be_dir,
468                basename_only,
469                attrs,
470            });
471        }
472    }
473}
474
/// Returns true when `value` is one of the truthy spellings `true`, `yes`, `on` or `1`.
///
/// Surrounding whitespace is ignored and the comparison is ASCII case-insensitive.
fn config_bool_truthy(value: &str) -> bool {
    let trimmed = value.trim();
    ["true", "yes", "on", "1"]
        .iter()
        .any(|truthy| trimmed.eq_ignore_ascii_case(truthy))
}
481
482/// Get file attributes for a given path from .gitattributes rules and config.
483///
484/// `is_dir` should be true when `rel_path` names a directory (Git passes a trailing `/` for
485/// directory paths in some call sites; we accept either trailing `/` or this flag from tree walks).
486pub fn get_file_attrs(
487    rules: &[AttrRule],
488    rel_path: &str,
489    is_dir: bool,
490    config: &ConfigSet,
491) -> FileAttrs {
492    let mut fa = FileAttrs::default();
493
494    // Walk rules; last match wins for each attribute.
495    for rule in rules {
496        if attr_rule_matches(rule, rel_path, is_dir) {
497            for (name, value) in &rule.attrs {
498                match name.as_str() {
499                    "text" => {
500                        fa.text = match value.as_str() {
501                            "set" => TextAttr::Set,
502                            "unset" => TextAttr::Unset,
503                            "auto" => TextAttr::Auto,
504                            _ => TextAttr::Unspecified,
505                        };
506                    }
507                    "eol" => {
508                        fa.eol = match value.as_str() {
509                            "lf" => EolAttr::Lf,
510                            "crlf" => EolAttr::Crlf,
511                            _ => EolAttr::Unspecified,
512                        };
513                    }
514                    "filter" => {
515                        if value == "unset" {
516                            fa.filter_clean = None;
517                            fa.filter_smudge = None;
518                            fa.filter_process = None;
519                            fa.filter_driver_name = None;
520                            fa.filter_smudge_required = false;
521                            fa.filter_clean_required = false;
522                        } else {
523                            let clean_key = format!("filter.{value}.clean");
524                            let smudge_key = format!("filter.{value}.smudge");
525                            let process_key = format!("filter.{value}.process");
526                            let req_key = format!("filter.{value}.required");
527                            fa.filter_driver_name = Some(value.clone());
528                            fa.filter_process = config.get(&process_key).filter(|s| !s.is_empty());
529                            if fa.filter_process.is_some() {
530                                fa.filter_clean = None;
531                                fa.filter_smudge = None;
532                            } else {
533                                fa.filter_clean = config.get(&clean_key);
534                                fa.filter_smudge = config.get(&smudge_key);
535                            }
536                            let required =
537                                config.get(&req_key).is_some_and(|v| config_bool_truthy(&v));
538                            fa.filter_smudge_required = required;
539                            fa.filter_clean_required = required;
540                        }
541                    }
542                    "diff" => {
543                        if value == "unset" {
544                            fa.diff_attr = DiffAttr::Unset;
545                        } else if value == "set" {
546                            fa.diff_attr = DiffAttr::Set;
547                        } else if !value.is_empty() {
548                            fa.diff_attr = DiffAttr::Driver(value.clone());
549                        }
550                    }
551                    "ident" => {
552                        fa.ident = value == "set";
553                    }
554                    "export-ignore" => {
555                        fa.export_ignore = value != "unset";
556                    }
557                    "export-subst" => {
558                        fa.export_subst = value != "unset";
559                    }
560                    "merge" => {
561                        fa.merge = match value.as_str() {
562                            "unset" => MergeAttr::Unset,
563                            "set" => MergeAttr::Unspecified,
564                            other => MergeAttr::Driver(other.to_string()),
565                        };
566                    }
567                    "conflict-marker-size" => {
568                        if value == "unset" {
569                            fa.conflict_marker_size = None;
570                        } else {
571                            fa.conflict_marker_size = Some(value.clone());
572                        }
573                    }
574                    "working-tree-encoding" => {
575                        if value != "unset" && !value.is_empty() {
576                            fa.working_tree_encoding = Some(value.clone());
577                        }
578                    }
579                    "crlf" => {
580                        fa.crlf_legacy = match value.as_str() {
581                            "unset" => CrlfLegacyAttr::Unset,
582                            "input" => CrlfLegacyAttr::Input,
583                            "set" => CrlfLegacyAttr::Crlf,
584                            _ => CrlfLegacyAttr::Unspecified,
585                        };
586                    }
587                    "whitespace" => {
588                        if value == "unset" {
589                            fa.whitespace = Some("unset".to_owned());
590                        } else if !value.is_empty() {
591                            fa.whitespace = Some(value.clone());
592                        }
593                    }
594                    _ => {}
595                }
596            }
597        }
598    }
599
600    fa
601}
602
603/// Returns whether gitattribute `attr_name` is set (last matching rule wins), for arbitrary
604/// attribute names used by pathspec `:(attr:...)`.
605///
606/// `is_dir` is whether `path` refers to a directory (see [`get_file_attrs`]).
607#[must_use]
608pub fn path_has_gitattribute(
609    rules: &[AttrRule],
610    path: &str,
611    is_dir: bool,
612    attr_name: &str,
613) -> bool {
614    matches!(
615        path_gitattribute_value(rules, path, is_dir, attr_name).as_deref(),
616        Some(value) if value != "unset"
617    )
618}
619
620/// Return the final value assigned to `attr_name` for `path`.
621///
622/// `rules` is the ordered set of parsed attribute rules, `path` is repository-relative, `is_dir`
623/// selects directory-only pattern handling, and `attr_name` is the attribute to query.
624///
625/// Returns `"set"`, `"unset"`, or an explicit string value. `None` means the attribute is
626/// unspecified after all matching rules are applied.
627#[must_use]
628pub fn path_gitattribute_value(
629    rules: &[AttrRule],
630    path: &str,
631    is_dir: bool,
632    attr_name: &str,
633) -> Option<String> {
634    let mut last: Option<&str> = None;
635    for rule in rules {
636        if attr_rule_matches(rule, path, is_dir) {
637            for (name, value) in &rule.attrs {
638                if name == attr_name {
639                    last = Some(value.as_str());
640                }
641            }
642        }
643    }
644    last.map(str::to_string)
645}
646
647/// Whether `rule` matches `rel_path` given directory vs file context (Git `path_matches`).
648#[must_use]
649pub fn attr_rule_matches(rule: &AttrRule, rel_path: &str, is_dir: bool) -> bool {
650    let path_is_dir = is_dir || rel_path.ends_with('/');
651    if rule.must_be_dir && !path_is_dir {
652        return false;
653    }
654    let path_for_glob = rel_path.trim_end_matches('/');
655    if rule.basename_only {
656        let basename = path_for_glob.rsplit('/').next().unwrap_or(path_for_glob);
657        glob_matches(rule.pattern.as_str(), basename)
658    } else {
659        glob_matches(rule.pattern.as_str(), path_for_glob)
660    }
661}
662
663fn glob_matches(pattern: &str, text: &str) -> bool {
664    glob_match_bytes(pattern.as_bytes(), text.as_bytes())
665}
666
/// Minimal glob matcher over bytes.
///
/// Supported syntax: `*` matches any run of bytes (including none, and including `/`),
/// `?` matches exactly one byte, every other byte matches itself. The whole of `text`
/// must be consumed for a match.
///
/// Implemented iteratively, backtracking only to the most recent `*`, so matching costs
/// at most `pat.len() * text.len()` steps instead of growing exponentially with the
/// number of `*` in the pattern.
fn glob_match_bytes(pat: &[u8], text: &[u8]) -> bool {
    let mut p = 0usize;
    let mut t = 0usize;
    // (pattern index just after the last `*` run, text index that `*` currently ends at).
    let mut last_star: Option<(usize, usize)> = None;

    while t < text.len() {
        match pat.get(p) {
            Some(&b'*') => {
                // Consecutive stars are equivalent to a single one.
                while pat.get(p) == Some(&b'*') {
                    p += 1;
                }
                if p == pat.len() {
                    return true;
                }
                last_star = Some((p, t));
            }
            Some(&c) if c == b'?' || c == text[t] => {
                p += 1;
                t += 1;
            }
            _ => match last_star {
                // Let the last `*` swallow one more byte and retry from there.
                Some((star_p, star_t)) => {
                    p = star_p;
                    t = star_t + 1;
                    last_star = Some((star_p, star_t + 1));
                }
                None => return false,
            },
        }
    }

    // Text exhausted: only trailing stars may remain in the pattern.
    pat[p..].iter().all(|&b| b == b'*')
}
690
/// Returns true if the data looks binary (contains NUL bytes in the first 8000 bytes).
///
/// Empty input is never binary; bytes beyond the first 8000 are not inspected.
pub fn is_binary(data: &[u8]) -> bool {
    data.iter().take(8000).any(|&byte| byte == 0)
}
696
// Git `convert.c` `CONVERT_STAT_BITS_*` / `gather_convert_stats_ascii` (for `ls-files --eol`).
/// Content contains at least one lone LF.
const CONVERT_STAT_BITS_TXT_LF: u32 = 0x1;
/// Content contains at least one CRLF pair.
const CONVERT_STAT_BITS_TXT_CRLF: u32 = 0x2;
/// Content is classified as binary.
const CONVERT_STAT_BITS_BIN: u32 = 0x4;
701
/// Byte-class counters gathered over a buffer (see `gather_text_stat`).
#[derive(Debug, Default, Clone)]
struct TextStat {
    /// Number of NUL bytes.
    nul: u32,
    /// Number of CR bytes not followed by LF.
    lonecr: u32,
    /// Number of LF bytes not preceded by CR.
    lonelf: u32,
    /// Number of CR LF pairs.
    crlf: u32,
    /// Number of bytes counted as printable.
    printable: u32,
    /// Number of bytes counted as non-printable (NUL included).
    nonprintable: u32,
}
711
712fn gather_text_stat(data: &[u8]) -> TextStat {
713    let mut s = TextStat::default();
714    let mut i = 0usize;
715    while i < data.len() {
716        let c = data[i];
717        if c == b'\r' {
718            if i + 1 < data.len() && data[i + 1] == b'\n' {
719                s.crlf += 1;
720                i += 2;
721            } else {
722                s.lonecr += 1;
723                i += 1;
724            }
725            continue;
726        }
727        if c == b'\n' {
728            s.lonelf += 1;
729            i += 1;
730            continue;
731        }
732        if c == 127 {
733            s.nonprintable += 1;
734        } else if c < 32 {
735            match c {
736                b'\t' | b'\x08' | b'\x1b' | b'\x0c' => s.printable += 1,
737                0 => {
738                    s.nul += 1;
739                    s.nonprintable += 1;
740                }
741                _ => s.nonprintable += 1,
742            }
743        } else {
744            s.printable += 1;
745        }
746        i += 1;
747    }
748    s
749}
750
751fn convert_is_binary(stats: &TextStat) -> bool {
752    stats.lonecr > 0 || stats.nul > 0 || (stats.printable >> 7) < stats.nonprintable
753}
754
755fn git_text_stat(data: &[u8]) -> TextStat {
756    let mut stats = gather_text_stat(data);
757    if !data.is_empty() && data[data.len() - 1] == 0x1a {
758        stats.nonprintable = stats.nonprintable.saturating_sub(1);
759    }
760    stats
761}
762
763/// Git `will_convert_lf_to_crlf` using [`TextStat`] (same rules as [`should_convert_to_crlf`] on bytes).
764fn will_convert_lf_to_crlf_from_stats(
765    stats: &TextStat,
766    conv: &ConversionConfig,
767    attrs: &FileAttrs,
768) -> bool {
769    let has_lone_lf = stats.lonelf > 0;
770    let is_bin = convert_is_binary(stats);
771
772    match attrs.crlf_legacy {
773        CrlfLegacyAttr::Unset | CrlfLegacyAttr::Input => return false,
774        CrlfLegacyAttr::Crlf => {
775            if attrs.text == TextAttr::Unset {
776                return false;
777            }
778            return has_lone_lf;
779        }
780        CrlfLegacyAttr::Unspecified => {}
781    }
782
783    if attrs.text == TextAttr::Unset {
784        return false;
785    }
786
787    if attrs.eol != EolAttr::Unspecified {
788        if attrs.text == TextAttr::Auto && is_bin {
789            return false;
790        }
791        if attrs.eol != EolAttr::Crlf {
792            return false;
793        }
794        if attrs.text == TextAttr::Auto {
795            return auto_crlf_should_smudge_lf_to_crlf_from_stats(stats);
796        }
797        return has_lone_lf;
798    }
799
800    if attrs.text == TextAttr::Set {
801        if !output_eol_is_crlf(conv) {
802            return false;
803        }
804        return has_lone_lf;
805    }
806
807    if attrs.text == TextAttr::Auto {
808        if is_bin || !output_eol_is_crlf(conv) {
809            return false;
810        }
811        return auto_crlf_should_smudge_lf_to_crlf_from_stats(stats);
812    }
813
814    match conv.autocrlf {
815        AutoCrlf::True => {
816            if is_bin {
817                return false;
818            }
819            auto_crlf_should_smudge_lf_to_crlf_from_stats(stats)
820        }
821        AutoCrlf::Input | AutoCrlf::False => false,
822    }
823}
824
825fn auto_crlf_should_smudge_lf_to_crlf_from_stats(stats: &TextStat) -> bool {
826    if stats.lonelf == 0 {
827        return false;
828    }
829    if stats.lonecr > 0 || stats.crlf > 0 {
830        return false;
831    }
832    !convert_is_binary(stats)
833}
834
835fn gather_convert_stats(data: &[u8]) -> u32 {
836    if data.is_empty() {
837        return 0;
838    }
839    let mut stats = gather_text_stat(data);
840    if !data.is_empty() && data[data.len() - 1] == 0x1a {
841        stats.nonprintable = stats.nonprintable.saturating_sub(1);
842    }
843    let mut ret = 0u32;
844    if convert_is_binary(&stats) {
845        ret |= CONVERT_STAT_BITS_BIN;
846    }
847    if stats.crlf > 0 {
848        ret |= CONVERT_STAT_BITS_TXT_CRLF;
849    }
850    if stats.lonelf > 0 {
851        ret |= CONVERT_STAT_BITS_TXT_LF;
852    }
853    ret
854}
855
856/// Git `convert.c` `gather_convert_stats_ascii` — worktree/index blob EOL stats for `ls-files --eol`.
857#[must_use]
858pub fn gather_convert_stats_ascii(data: &[u8]) -> &'static str {
859    let convert_stats = gather_convert_stats(data);
860    if convert_stats & CONVERT_STAT_BITS_BIN != 0 {
861        return "-text";
862    }
863    match convert_stats {
864        CONVERT_STAT_BITS_TXT_LF => "lf",
865        CONVERT_STAT_BITS_TXT_CRLF => "crlf",
866        x if x == (CONVERT_STAT_BITS_TXT_LF | CONVERT_STAT_BITS_TXT_CRLF) => "mixed",
867        _ => "none",
868    }
869}
870
871/// Git `convert.c` `get_convert_attr_ascii` — ASCII summary of EOL-related attributes for
872/// `git ls-files --eol` (matches `attr_action` after attribute merge, before clean/smudge).
873#[must_use]
874pub fn convert_attr_ascii_for_ls_files(
875    rules: &[AttrRule],
876    rel_path: &str,
877    config: &ConfigSet,
878) -> String {
879    let fa = get_file_attrs(rules, rel_path, false, config);
880    // Mirror `git_path_check_crlf` for `text` then legacy `crlf` (Git checks `text` first).
881    let mut action = match fa.text {
882        TextAttr::Set => 1,   // CRLF_TEXT
883        TextAttr::Unset => 2, // CRLF_BINARY
884        TextAttr::Auto => 5,  // CRLF_AUTO
885        TextAttr::Unspecified => 0,
886    };
887    if action == 0 {
888        action = match fa.crlf_legacy {
889            CrlfLegacyAttr::Crlf => 1,
890            CrlfLegacyAttr::Unset => 2,
891            CrlfLegacyAttr::Input => 3, // CRLF_TEXT_INPUT
892            CrlfLegacyAttr::Unspecified => 0,
893        };
894    }
895    if action == 2 {
896        return "-text".to_string();
897    }
898    // Bare `eol=lf` / `eol=crlf` without `text` still implies text mode (`convert_attrs`).
899    if action == 0 {
900        if fa.eol == EolAttr::Unspecified {
901            return String::new();
902        }
903        action = 1; // CRLF_TEXT
904    }
905
906    // Merge `eol=` like `convert_attrs` (only when not already binary).
907    if fa.eol == EolAttr::Lf {
908        if action == 5 {
909            action = 7; // CRLF_AUTO_INPUT
910        } else {
911            action = 3; // CRLF_TEXT_INPUT
912        }
913    } else if fa.eol == EolAttr::Crlf {
914        if action == 5 {
915            action = 6; // CRLF_AUTO_CRLF
916        } else {
917            action = 4; // CRLF_TEXT_CRLF
918        }
919    }
920
921    // `attr_action` snapshot (Git assigns before splitting bare `text` / applying autocrlf).
922    let attr_action = action;
923
924    match attr_action {
925        1 => "text".to_string(),
926        3 => "text eol=lf".to_string(),
927        4 => "text eol=crlf".to_string(),
928        5 => "text=auto".to_string(),
929        6 => "text=auto eol=crlf".to_string(),
930        7 => "text=auto eol=lf".to_string(),
931        _ => String::new(),
932    }
933}
934
/// Reports whether `data` contains at least one CR immediately followed by LF.
pub fn has_crlf(data: &[u8]) -> bool {
    data.iter()
        .zip(data.iter().skip(1))
        .any(|(&first, &second)| first == b'\r' && second == b'\n')
}
939
/// Reports whether `data` contains an LF that is not directly preceded by a CR.
pub fn has_lone_lf(data: &[u8]) -> bool {
    let mut previous: Option<u8> = None;
    for &byte in data {
        if byte == b'\n' && previous != Some(b'\r') {
            return true;
        }
        previous = Some(byte);
    }
    false
}
949
/// Reports whether `data` contains a CR that is not directly followed by an LF
/// (Git `text_stat.lonecr`). A CR as the very last byte counts as lone.
fn has_lone_cr(data: &[u8]) -> bool {
    data.iter()
        .enumerate()
        .any(|(pos, &byte)| byte == b'\r' && data.get(pos + 1) != Some(&b'\n'))
}
959
960/// Git `convert.c` `will_convert_lf_to_crlf` for `CRLF_AUTO` / `CRLF_AUTO_INPUT` / `CRLF_AUTO_CRLF`:
961/// if the blob already has CRLF pairs or lone CRs, do not convert lone LFs to CRLF on checkout.
962fn auto_crlf_should_smudge_lf_to_crlf(data: &[u8]) -> bool {
963    if !has_lone_lf(data) {
964        return false;
965    }
966    if has_lone_cr(data) || has_crlf(data) {
967        return false;
968    }
969    if is_binary(data) {
970        return false;
971    }
972    true
973}
974
975/// Returns true if ALL line endings are CRLF (no lone LF).
976pub fn is_all_crlf(data: &[u8]) -> bool {
977    has_crlf(data) && !has_lone_lf(data)
978}
979
980/// Returns true if ALL line endings are LF (no CRLF).
981pub fn is_all_lf(data: &[u8]) -> bool {
982    has_lone_lf(data) && !has_crlf(data)
983}
984
985/// Git `convert.c` `has_crlf_in_index`: index blob already contains CRLF pairs (non-binary).
986#[must_use]
987pub fn has_crlf_in_index_blob(data: &[u8]) -> bool {
988    if !data.contains(&b'\r') {
989        return false;
990    }
991    let st = gather_convert_stats(data);
992    st & CONVERT_STAT_BITS_BIN == 0 && (st & CONVERT_STAT_BITS_TXT_CRLF) != 0
993}
994
995/// Whether clean conversion uses Git's `has_crlf_in_index` guard (`convert.c` only for
996/// `CRLF_AUTO`, `CRLF_AUTO_INPUT`, `CRLF_AUTO_CRLF`). Bare `eol=` without `text=auto` becomes
997/// `CRLF_TEXT_*` and must not use this guard.
998#[must_use]
999pub fn clean_uses_autocrlf_index_guard(attrs: &FileAttrs, conv: &ConversionConfig) -> bool {
1000    if attrs.text == TextAttr::Unset || attrs.crlf_legacy == CrlfLegacyAttr::Unset {
1001        return false;
1002    }
1003    if attrs.eol != EolAttr::Unspecified && attrs.text != TextAttr::Auto {
1004        return false;
1005    }
1006    attrs.text == TextAttr::Auto
1007        || (attrs.text == TextAttr::Unspecified
1008            && matches!(conv.autocrlf, AutoCrlf::True | AutoCrlf::Input))
1009}
1010
/// Optional inputs for [`convert_to_git_with_opts`] (Git `CONV_EOL_RENORMALIZE` / index blob).
///
/// The [`Default`] value carries no index blob, does not renormalize, and keeps the
/// `core.safecrlf` check enabled.
#[derive(Debug, Clone, Copy)]
pub struct ConvertToGitOpts<'a> {
    /// Stage-0 blob bytes for this path before the current add (for safer-autocrlf).
    pub index_blob: Option<&'a [u8]>,
    /// When true, always apply CRLF→LF when configured (merge/cherry-pick renormalize).
    pub renormalize: bool,
    /// When false, skip `core.safecrlf` simulation (used for internal diff/hashing — must not spam stderr).
    ///
    /// [`convert_to_git_with_opts`] also reuses this flag as its "writing to the object DB"
    /// switch for `working-tree-encoding` validation and the round-trip trace.
    pub check_safecrlf: bool,
}
1021
impl Default for ConvertToGitOpts<'_> {
    /// Plain `git add` behaviour: no index blob to compare against, no renormalization, and
    /// the `core.safecrlf` check left on.
    fn default() -> Self {
        Self {
            index_blob: None,
            renormalize: false,
            check_safecrlf: true,
        }
    }
}
1031
1032// ---------------------------------------------------------------------------
1033// working-tree-encoding (Git `convert.c` `encode_to_git` / `encode_to_worktree`)
1034// ---------------------------------------------------------------------------
1035
// BOM byte sequences (Git `utf8.c`).
// Each constant is the encoded form of U+FEFF in the named encoding and byte order.
/// UTF-16 big-endian byte order mark.
const UTF16_BE_BOM: &[u8] = &[0xFE, 0xFF];
/// UTF-16 little-endian byte order mark.
const UTF16_LE_BOM: &[u8] = &[0xFF, 0xFE];
/// UTF-32 big-endian byte order mark.
const UTF32_BE_BOM: &[u8] = &[0x00, 0x00, 0xFE, 0xFF];
/// UTF-32 little-endian byte order mark (its first two bytes equal [`UTF16_LE_BOM`]).
const UTF32_LE_BOM: &[u8] = &[0xFF, 0xFE, 0x00, 0x00];
1041
/// Normalize a `working-tree-encoding` value to its canonical lowercase `utf-…` spelling.
///
/// Mirrors Git's `same_utf_encoding`: surrounding whitespace is ignored, matching is
/// case-insensitive, and one `-` after the `utf` prefix is optional, so `utf16`, `UTF-16` and
/// `Utf16Le-Bom` all normalize. Returns `None` for anything that is not one of the UTF-8 /
/// UTF-16 / UTF-32 variants handled specially.
fn canonical_utf_label(label: &str) -> Option<String> {
    const KNOWN_SUFFIXES: [&str; 9] = [
        "8", "16", "16be", "16le", "16be-bom", "16le-bom", "32", "32be", "32le",
    ];

    let folded = label.trim().to_ascii_lowercase();
    let after_utf = folded.strip_prefix("utf")?;
    let suffix = after_utf.strip_prefix('-').unwrap_or(after_utf);

    KNOWN_SUFFIXES
        .iter()
        .find(|known| **known == suffix)
        .map(|known| format!("utf-{known}"))
}
1064
/// Reports whether `data` begins with the byte sequence `bom`.
fn has_bom_prefix(data: &[u8], bom: &[u8]) -> bool {
    data.starts_with(bom)
}
1068
1069/// Git `has_prohibited_utf_bom`: UTF-16BE/LE and UTF-32BE/LE must not begin with a BOM.
1070fn has_prohibited_utf_bom(canon: &str, data: &[u8]) -> bool {
1071    match canon {
1072        "utf-16be" | "utf-16le" => {
1073            has_bom_prefix(data, UTF16_BE_BOM) || has_bom_prefix(data, UTF16_LE_BOM)
1074        }
1075        "utf-32be" | "utf-32le" => {
1076            has_bom_prefix(data, UTF32_BE_BOM) || has_bom_prefix(data, UTF32_LE_BOM)
1077        }
1078        _ => false,
1079    }
1080}
1081
1082/// Git `is_missing_required_utf_bom`: bare UTF-16 / UTF-32 must begin with a BOM.
1083fn is_missing_required_utf_bom(canon: &str, data: &[u8]) -> bool {
1084    match canon {
1085        "utf-16" => !(has_bom_prefix(data, UTF16_BE_BOM) || has_bom_prefix(data, UTF16_LE_BOM)),
1086        "utf-32" => !(has_bom_prefix(data, UTF32_BE_BOM) || has_bom_prefix(data, UTF32_LE_BOM)),
1087        _ => false,
1088    }
1089}
1090
1091/// Git `validate_encoding`: emit the advice line to stderr and return an error body when the BOM
1092/// presence is wrong for a UTF-16/UTF-32 encoding.
1093///
1094/// `label` is the original attribute spelling (preserved in messages, like Git). When
1095/// `die_on_error` is true (`CONV_WRITE_OBJECT`) the body is prefixed `fatal:` so the top-level
1096/// printer surfaces it verbatim; otherwise the `error:` line is printed here (Git `error()` returns
1097/// "content unmodified") and the same body is returned for the caller to swallow.
1098fn validate_utf_bom(
1099    canon: &str,
1100    label: &str,
1101    rel_path: &str,
1102    data: &[u8],
1103    die_on_error: bool,
1104) -> Result<(), String> {
1105    if has_prohibited_utf_bom(canon, data) {
1106        // Advice cuts the trailing "be"/"le" so the user sees the BOM-capable name (UTF-16/UTF-32).
1107        let stripped = label
1108            .strip_prefix("utf")
1109            .or_else(|| label.strip_prefix("UTF"));
1110        let utf_num = stripped
1111            .map(|s| s.trim_start_matches('-'))
1112            .and_then(|s| s.get(..s.len().saturating_sub(2)))
1113            .unwrap_or("");
1114        eprintln!(
1115            "The file '{rel_path}' contains a byte order mark (BOM). Please use UTF-{utf_num} as working-tree-encoding."
1116        );
1117        let body = format!("BOM is prohibited in '{rel_path}' if encoded as {label}");
1118        if die_on_error {
1119            return Err(format!("fatal: {body}"));
1120        }
1121        eprintln!("error: {body}");
1122        return Err(body);
1123    }
1124    if is_missing_required_utf_bom(canon, data) {
1125        let utf_num = label
1126            .strip_prefix("utf")
1127            .or_else(|| label.strip_prefix("UTF"))
1128            .map(|s| s.trim_start_matches('-'))
1129            .unwrap_or("");
1130        eprintln!(
1131            "The file '{rel_path}' is missing a byte order mark (BOM). Please use UTF-{utf_num}BE or UTF-{utf_num}LE (depending on the byte order) as working-tree-encoding."
1132        );
1133        let body = format!("BOM is required in '{rel_path}' if encoded as {label}");
1134        if die_on_error {
1135            return Err(format!("fatal: {body}"));
1136        }
1137        eprintln!("error: {body}");
1138        return Err(body);
1139    }
1140    Ok(())
1141}
1142
1143/// Git `convert.c` `check_roundtrip`: whether `enc_name` appears as a whole, comma/space-delimited
1144/// token in `core.checkRoundtripEncoding` (default `SHIFT-JIS`), case-insensitively.
1145fn encoding_needs_roundtrip_check(enc_name: &str, conv: &ConversionConfig) -> bool {
1146    let list = conv
1147        .check_roundtrip_encoding
1148        .as_deref()
1149        .unwrap_or("SHIFT-JIS");
1150    let target = enc_name.to_ascii_lowercase();
1151    list.split([',', ' ', '\t'])
1152        .map(str::trim)
1153        .filter(|tok| !tok.is_empty())
1154        .any(|tok| tok.eq_ignore_ascii_case(&target))
1155}
1156
/// Git `trace_printf("Checking roundtrip encoding for %s...\n", enc)`.
///
/// Does nothing unless `GIT_TRACE` is set to something other than empty, `0` or `false`.
/// The values `1`, `true` and `2` write the line to stderr. Any other value is treated as a
/// file path and the line is appended to it. Write failures are ignored.
fn trace_roundtrip_encoding(enc_name: &str) {
    use std::io::Write;

    let destination = match std::env::var("GIT_TRACE") {
        Ok(value) => value,
        Err(_) => return,
    };
    let tracing_off = destination.is_empty()
        || destination == "0"
        || destination.eq_ignore_ascii_case("false");
    if tracing_off {
        return;
    }

    let message = format!("Checking roundtrip encoding for {enc_name}...\n");
    if matches!(destination.as_str(), "1" | "true" | "2") {
        let _ = std::io::stderr().write_all(message.as_bytes());
        return;
    }

    let opened = std::fs::OpenOptions::new()
        .create(true)
        .append(true)
        .open(&destination);
    if let Ok(mut trace_file) = opened {
        let _ = trace_file.write_all(message.as_bytes());
    }
}
1182
/// Re-encode `data` from `from` to `to` via the system `iconv`, matching Git's `reencode_string_len`
/// (which is libiconv). Returns `None` if `iconv` is unavailable or reports a conversion error, so
/// callers can fall back to `encoding_rs`.
///
/// The input is fed to the child from a scoped helper thread while this thread drains stdout.
/// Writing everything first and reading afterwards deadlocks once the data exceeds the pipe
/// buffer: `iconv` blocks on a full stdout while we block on a full stdin.
fn reencode_via_iconv(data: &[u8], from: &str, to: &str) -> Option<Vec<u8>> {
    use std::io::Write;
    let mut child = Command::new("iconv")
        .arg("-f")
        .arg(from)
        .arg("-t")
        .arg(to)
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .stderr(Stdio::null())
        .spawn()
        .ok()?;

    let stdin = child.stdin.take();
    let output = std::thread::scope(|scope| {
        if let Some(mut pipe) = stdin {
            scope.spawn(move || {
                // Best effort, as before: a failed write surfaces through the exit status.
                let _ = pipe.write_all(data);
                // `pipe` is dropped here, closing stdin so `iconv` sees end of input.
            });
        }
        child.wait_with_output()
    })
    .ok()?;

    if !output.status.success() {
        return None;
    }
    Some(output.stdout)
}
1207
1208/// Decode raw working-tree bytes (`enc_label`) into UTF-8 for the object DB (Git `encode_to_git`).
1209///
1210/// When `validate` is true (writing to the object DB), enforce Git's UTF BOM rules and surface the
1211/// matching fatal message + advice (`die_on_error`). For internal diff/status reads it is false.
1212fn decode_working_tree_bytes_to_utf8(
1213    src: &[u8],
1214    rel_path: &str,
1215    enc_label: &str,
1216    validate: bool,
1217) -> Result<Vec<u8>, String> {
1218    let label = enc_label.trim();
1219    if label.is_empty() {
1220        return Ok(src.to_vec());
1221    }
1222
1223    let canon = canonical_utf_label(label);
1224
1225    // BOM validation (only the UTF-16/UTF-32 family). Git validates on every `encode_to_git`; when
1226    // writing to the object DB it dies, otherwise (diff/status reads) it prints `error:` and treats
1227    // the content as unmodified — `validate` here is Git's `die_on_error` (`CONV_WRITE_OBJECT`).
1228    if let Some(ref c) = canon {
1229        validate_utf_bom(c, label, rel_path, src, validate)?;
1230    }
1231
1232    // UTF-8 is the default encoding: no conversion (Git `git_path_check_encoding`).
1233    if canon.as_deref() == Some("utf-8") {
1234        return Ok(src.to_vec());
1235    }
1236
1237    // The `*-BOM` aliases decode like the matching raw encoding once the BOM is stripped.
1238    let (iconv_from, body): (&str, &[u8]) = match canon.as_deref() {
1239        Some("utf-16le-bom") => {
1240            let body = if has_bom_prefix(src, UTF16_LE_BOM) {
1241                &src[2..]
1242            } else {
1243                src
1244            };
1245            ("UTF-16LE", body)
1246        }
1247        Some("utf-16be-bom") => {
1248            let body = if has_bom_prefix(src, UTF16_BE_BOM) {
1249                &src[2..]
1250            } else {
1251                src
1252            };
1253            ("UTF-16BE", body)
1254        }
1255        // Bare UTF-16/UTF-32 keep their BOM; iconv consumes it to pick the byte order.
1256        Some(c) => (utf_canon_to_iconv_name(c), src),
1257        None => {
1258            // Non-UTF label: try iconv, then encoding_rs as a fallback.
1259            if let Some(out) = reencode_via_iconv(src, label, "UTF-8") {
1260                return Ok(out);
1261            }
1262            // Unknown / unsupported label (Git `reencode_string_len` returns NULL →
1263            // `failed to encode '%s' from %s to %s`).
1264            let Some(enc) = crate::commit_encoding::resolve(label) else {
1265                return Err(format!(
1266                    "failed to encode '{rel_path}' from {label} to UTF-8"
1267                ));
1268            };
1269            if enc == UTF_8 {
1270                return Ok(src.to_vec());
1271            }
1272            let (cow, _, had_errors) = enc.decode(src);
1273            if had_errors {
1274                return Err(format!(
1275                    "failed to encode '{rel_path}' from {label} to UTF-8"
1276                ));
1277            }
1278            return Ok(cow.into_owned().into_bytes());
1279        }
1280    };
1281
1282    if let Some(out) = reencode_via_iconv(body, iconv_from, "UTF-8") {
1283        return Ok(out);
1284    }
1285
1286    // Fallback: encoding_rs for UTF-16 families (UTF-32 has no encoding_rs codec).
1287    decode_utf_bytes_with_encoding_rs(body, rel_path, label, iconv_from)
1288}
1289
1290/// `encoding_rs` fallback for UTF-16/UTF-32 decode when `iconv` is unavailable.
1291fn decode_utf_bytes_with_encoding_rs(
1292    body: &[u8],
1293    rel_path: &str,
1294    label: &str,
1295    iconv_from: &str,
1296) -> Result<Vec<u8>, String> {
1297    let fail = || format!("failed to encode '{rel_path}' from {label} to UTF-8");
1298    match iconv_from {
1299        "UTF-16BE" => {
1300            let (cow, _, had_errors) = encoding_rs::UTF_16BE.decode(body);
1301            if had_errors {
1302                return Err(fail());
1303            }
1304            Ok(cow.into_owned().into_bytes())
1305        }
1306        "UTF-16LE" => {
1307            let (cow, _, had_errors) = encoding_rs::UTF_16LE.decode(body);
1308            if had_errors {
1309                return Err(fail());
1310            }
1311            Ok(cow.into_owned().into_bytes())
1312        }
1313        "UTF-16" => {
1314            if has_bom_prefix(body, UTF16_BE_BOM) {
1315                decode_utf_bytes_with_encoding_rs(&body[2..], rel_path, label, "UTF-16BE")
1316            } else if has_bom_prefix(body, UTF16_LE_BOM) {
1317                decode_utf_bytes_with_encoding_rs(&body[2..], rel_path, label, "UTF-16LE")
1318            } else {
1319                Err(fail())
1320            }
1321        }
1322        "UTF-32" => {
1323            if has_bom_prefix(body, UTF32_BE_BOM) {
1324                decode_utf32_body_to_utf8_bytes(&body[4..], rel_path, true)
1325            } else if has_bom_prefix(body, UTF32_LE_BOM) {
1326                decode_utf32_body_to_utf8_bytes(&body[4..], rel_path, false)
1327            } else {
1328                Err(fail())
1329            }
1330        }
1331        "UTF-32BE" => decode_utf32_body_to_utf8_bytes(body, rel_path, true),
1332        "UTF-32LE" => decode_utf32_body_to_utf8_bytes(body, rel_path, false),
1333        _ => Err(fail()),
1334    }
1335}
1336
/// Decode BOM-less UTF-32 code units into UTF-8 bytes.
///
/// `big_endian` selects the byte order of each 4-byte unit. Fails with the
/// `failed to encode '<path>' from UTF-32 to UTF-8` message when the length is not a multiple
/// of four or a unit is not a valid Unicode scalar value.
fn decode_utf32_body_to_utf8_bytes(
    body: &[u8],
    rel_path: &str,
    big_endian: bool,
) -> Result<Vec<u8>, String> {
    let failure = || format!("failed to encode '{rel_path}' from UTF-32 to UTF-8");
    if body.len() % 4 != 0 {
        return Err(failure());
    }

    // Collecting into `Option<String>` stops at the first invalid scalar value.
    let decoded: Option<String> = body
        .chunks_exact(4)
        .map(|quad| {
            let raw = [quad[0], quad[1], quad[2], quad[3]];
            let scalar = if big_endian {
                u32::from_be_bytes(raw)
            } else {
                u32::from_le_bytes(raw)
            };
            char::from_u32(scalar)
        })
        .collect();

    decoded.map(String::into_bytes).ok_or_else(failure)
}
1360
/// Map a canonical UTF label to the encoding name passed to `iconv`.
///
/// Only the raw encodings are listed (the `*-bom` aliases are handled by the callers); any
/// label not in the table falls back to `UTF-8`.
fn utf_canon_to_iconv_name(canon: &str) -> &'static str {
    const ICONV_NAMES: [(&str, &str); 6] = [
        ("utf-16", "UTF-16"),
        ("utf-16be", "UTF-16BE"),
        ("utf-16le", "UTF-16LE"),
        ("utf-32", "UTF-32"),
        ("utf-32be", "UTF-32BE"),
        ("utf-32le", "UTF-32LE"),
    ];
    ICONV_NAMES
        .iter()
        .find(|(label, _)| *label == canon)
        .map_or("UTF-8", |(_, iconv_name)| *iconv_name)
}
1373
1374/// Encode a UTF-8 blob into raw working-tree bytes for `enc_label` (Git `encode_to_worktree`).
1375///
1376/// Bare `UTF-16`/`UTF-32` and the `*-BOM` aliases get a BOM (Git relies on libiconv / explicit
1377/// BOM handling); the raw `UTF-16BE`/`UTF-16LE`/`UTF-32BE`/`UTF-32LE` encodings produce no BOM.
1378fn encode_utf8_blob_to_working_tree_bytes(
1379    src: &[u8],
1380    rel_path: &str,
1381    enc_label: &str,
1382) -> Result<Vec<u8>, String> {
1383    let label = enc_label.trim();
1384    if label.is_empty() {
1385        return Ok(src.to_vec());
1386    }
1387
1388    let canon = canonical_utf_label(label);
1389    if canon.as_deref() == Some("utf-8") {
1390        return Ok(src.to_vec());
1391    }
1392
1393    let fail = || format!("failed to encode '{rel_path}' from UTF-8 to {label}");
1394
1395    // The `*-BOM` aliases: encode to the raw form, then prepend the requested BOM.
1396    match canon.as_deref() {
1397        Some("utf-16le-bom") => {
1398            let body = reencode_via_iconv(src, "UTF-8", "UTF-16LE")
1399                .or_else(|| encode_utf_with_encoding_rs(src, "UTF-16LE"))
1400                .ok_or_else(fail)?;
1401            let mut out = UTF16_LE_BOM.to_vec();
1402            out.extend(body);
1403            return Ok(out);
1404        }
1405        Some("utf-16be-bom") => {
1406            let body = reencode_via_iconv(src, "UTF-8", "UTF-16BE")
1407                .or_else(|| encode_utf_with_encoding_rs(src, "UTF-16BE"))
1408                .ok_or_else(fail)?;
1409            let mut out = UTF16_BE_BOM.to_vec();
1410            out.extend(body);
1411            return Ok(out);
1412        }
1413        Some(c) => {
1414            let iconv_name = utf_canon_to_iconv_name(c);
1415            if let Some(out) = reencode_via_iconv(src, "UTF-8", iconv_name) {
1416                return Ok(out);
1417            }
1418            return encode_utf_with_encoding_rs(src, c).ok_or_else(fail);
1419        }
1420        None => {}
1421    }
1422
1423    // Non-UTF label: iconv, then encoding_rs.
1424    if let Some(out) = reencode_via_iconv(src, "UTF-8", label) {
1425        return Ok(out);
1426    }
1427    let s = std::str::from_utf8(src).map_err(|_| fail())?;
1428    let Some(enc) = crate::commit_encoding::resolve(label) else {
1429        return Err(format!(
1430            "unknown working-tree-encoding '{label}' for '{rel_path}'"
1431        ));
1432    };
1433    if enc == UTF_8 {
1434        return Ok(src.to_vec());
1435    }
1436    let (cow, _, had_errors) = enc.encode(s);
1437    if had_errors {
1438        return Err(fail());
1439    }
1440    Ok(cow.into_owned())
1441}
1442
/// Manual fallback for encoding UTF-8 into UTF-16/UTF-32 when `iconv` is unavailable.
///
/// `target` is matched case-insensitively against `utf-16[be|le]` / `utf-32[be|le]`; the bare
/// names are written big-endian. The output never carries a BOM. Returns `None` when `src` is
/// not valid UTF-8 or `target` is not one of those names.
fn encode_utf_with_encoding_rs(src: &[u8], target: &str) -> Option<Vec<u8>> {
    let text = std::str::from_utf8(src).ok()?;
    let encoded: Vec<u8> = match target.to_ascii_lowercase().as_str() {
        "utf-16" | "utf-16be" => text.encode_utf16().flat_map(u16::to_be_bytes).collect(),
        "utf-16le" => text.encode_utf16().flat_map(u16::to_le_bytes).collect(),
        "utf-32" | "utf-32be" => text
            .chars()
            .flat_map(|ch| u32::from(ch).to_be_bytes())
            .collect(),
        "utf-32le" => text
            .chars()
            .flat_map(|ch| u32::from(ch).to_le_bytes())
            .collect(),
        _ => return None,
    };
    Some(encoded)
}
1474
1475// ---------------------------------------------------------------------------
1476// Input (add / clean) direction
1477// ---------------------------------------------------------------------------
1478
/// Convert data for storage in the index/object database (the "clean" direction).
///
/// Delegates to [`convert_to_git_with_opts`] with [`ConvertToGitOpts::default`] (no index blob,
/// no renormalization, safecrlf checking on), which handles:
/// 1. Clean filter execution
/// 2. `working-tree-encoding` decoding (working tree bytes → UTF-8)
/// 3. CRLF → LF conversion based on config + attributes
/// 4. safecrlf checking
///
/// Returns `Ok(data)` on success, or an error if a clean filter fails, the
/// `working-tree-encoding` is invalid or cannot be decoded, or safecrlf rejects the content.
pub fn convert_to_git(
    data: &[u8],
    rel_path: &str,
    conv: &ConversionConfig,
    file_attrs: &FileAttrs,
) -> Result<Vec<u8>, String> {
    convert_to_git_with_opts(
        data,
        rel_path,
        conv,
        file_attrs,
        ConvertToGitOpts::default(),
    )
}
1501
/// Like [`convert_to_git`] with Git-compatible safer-autocrlf index handling.
///
/// Steps, in order:
/// 1. Clean filter (a long-running `process` filter takes precedence over a `clean` command).
/// 2. `working-tree-encoding` decode to UTF-8.
/// 3. Decide whether CRLF → LF applies, honouring the index guard unless `opts.renormalize`.
/// 4. `core.safecrlf` check (skipped when `opts.check_safecrlf` is false).
/// 5. The actual CRLF → LF rewrite.
///
/// `opts.check_safecrlf` doubles as the "writing to the object DB" switch for step 2: it makes
/// BOM validation fatal and enables the round-trip trace line.
///
/// # Errors
///
/// Returns `Err` when a required clean filter fails or is missing, when a one-shot clean
/// command fails, when the `working-tree-encoding` is `set`/`true`/`false` or cannot be
/// decoded, or when safecrlf rejects the content.
pub fn convert_to_git_with_opts(
    data: &[u8],
    rel_path: &str,
    conv: &ConversionConfig,
    file_attrs: &FileAttrs,
    opts: ConvertToGitOpts<'_>,
) -> Result<Vec<u8>, String> {
    let mut buf = data.to_vec();

    // 1. Run clean filter if configured (long-running `process` overrides clean command)
    if let Some(ref proc_cmd) = file_attrs.filter_process {
        let name = file_attrs.filter_driver_name.as_deref().unwrap_or_default();
        match apply_process_clean(proc_cmd, rel_path, &buf) {
            Ok(filtered) => buf = filtered,
            Err(e) => {
                if file_attrs.filter_clean_required {
                    // A handshake failure is passed through verbatim instead of the generic text.
                    if e.contains("expected git-filter-server") {
                        return Err(e);
                    }
                    return Err(format!("fatal: {rel_path}: clean filter '{name}' failed"));
                }
                if e.starts_with("filter status: abort") {
                    crate::filter_process::disable_process_filter(proc_cmd);
                }
                // Non-required filter: report and carry on with the unfiltered bytes.
                eprintln!("error: external filter '{name}' failed");
            }
        }
    } else {
        match file_attrs.filter_clean.as_ref() {
            Some(clean_cmd) => {
                buf = run_filter(clean_cmd, &buf, rel_path).map_err(|e| {
                    let name = file_attrs.filter_driver_name.as_deref().unwrap_or_default();
                    if file_attrs.filter_clean_required {
                        format!("fatal: {rel_path}: clean filter '{name}' failed")
                    } else {
                        format!("clean filter failed: {e}")
                    }
                })?;
            }
            None => {
                // A required filter with no clean command configured counts as a failure.
                if file_attrs.filter_clean_required {
                    let name = file_attrs.filter_driver_name.as_deref().unwrap_or_default();
                    return Err(format!("fatal: {rel_path}: clean filter '{name}' failed"));
                }
            }
        }
    }

    // 2. working-tree-encoding: working tree bytes → UTF-8 for the object DB (Git `encode_to_git`).
    if let Some(ref enc) = file_attrs.working_tree_encoding {
        // Bare `working-tree-encoding` (boolean true) / `false` are rejected (Git
        // `git_path_check_encoding`).
        if enc == "set" || enc == "true" || enc == "false" {
            return Err("fatal: true/false are no valid working-tree-encodings".to_string());
        }
        // `CONV_WRITE_OBJECT` → validate BOM rules and die on error (Git `encode_to_git`).
        let writing_object = opts.check_safecrlf;
        buf = decode_working_tree_bytes_to_utf8(&buf, rel_path, enc, writing_object)?;
        // Git `encode_to_git`: when writing to the object DB, verify the round trip for encodings
        // listed in `core.checkRoundtripEncoding` (default `SHIFT-JIS`); emit the GIT_TRACE line.
        if writing_object && encoding_needs_roundtrip_check(enc, conv) {
            trace_roundtrip_encoding(enc);
        }
    }

    // 3. Determine if we should do CRLF→LF conversion
    let would_convert = would_convert_on_input(conv, file_attrs, &buf);

    let mut convert_crlf_into_lf = would_convert && has_crlf(&buf);
    // Safer autocrlf: leave CRLFs alone when the indexed blob already has them, unless
    // renormalizing.
    if convert_crlf_into_lf
        && clean_uses_autocrlf_index_guard(file_attrs, conv)
        && !opts.renormalize
        && opts.index_blob.is_some_and(has_crlf_in_index_blob)
    {
        convert_crlf_into_lf = false;
    }

    // 4. safecrlf check — Git simulates clean then smudge (`check_global_conv_flags_eol`).
    if would_convert && opts.check_safecrlf {
        check_safecrlf_roundtrip(conv, file_attrs, &buf, rel_path, convert_crlf_into_lf)?;
    }

    // 5. Actually convert CRLF → LF if the file has CRLFs
    if convert_crlf_into_lf {
        buf = crlf_to_lf(&buf);
    }

    Ok(buf)
}
1592
1593/// Decide whether CRLF/LF conversion is configured for this file on input.
1594/// Returns true if the file *would* be subject to conversion (even if no
1595/// actual bytes need changing).
1596fn would_convert_on_input(conv: &ConversionConfig, attrs: &FileAttrs, data: &[u8]) -> bool {
1597    match attrs.crlf_legacy {
1598        CrlfLegacyAttr::Unset => return false,
1599        CrlfLegacyAttr::Input => {
1600            if is_binary(data) {
1601                return false;
1602            }
1603            return true;
1604        }
1605        CrlfLegacyAttr::Crlf => {
1606            if attrs.text == TextAttr::Unset {
1607                return false;
1608            }
1609            if is_binary(data) {
1610                return false;
1611            }
1612            return true;
1613        }
1614        CrlfLegacyAttr::Unspecified => {}
1615    }
1616
1617    // If text is explicitly unset (-text or binary), never convert
1618    if attrs.text == TextAttr::Unset {
1619        return false;
1620    }
1621
1622    // If eol attr is set, this implies text mode
1623    if attrs.eol != EolAttr::Unspecified {
1624        if attrs.text == TextAttr::Auto && is_binary(data) {
1625            return false;
1626        }
1627        return true;
1628    }
1629
1630    // If text is explicitly set, always convert
1631    if attrs.text == TextAttr::Set {
1632        return true;
1633    }
1634
1635    if attrs.text == TextAttr::Auto {
1636        if is_binary(data) {
1637            return false;
1638        }
1639        return true;
1640    }
1641
1642    // No text attribute: fall back to core.autocrlf
1643    match conv.autocrlf {
1644        AutoCrlf::True | AutoCrlf::Input => {
1645            if is_binary(data) {
1646                return false;
1647            }
1648            true
1649        }
1650        AutoCrlf::False => false,
1651    }
1652}
1653
/// Print Git's `core.safecrlf=warn` message for a file whose CRLFs will become LFs.
fn eprint_safecrlf_warn_crlf_to_lf(rel_path: &str) {
    let warning = format!(
        "warning: in the working copy of '{rel_path}', CRLF will be replaced by LF the next time Git touches it"
    );
    eprintln!("{warning}");
}
1660
/// Print Git's `core.safecrlf=warn` message for a file whose LFs will become CRLFs.
fn eprint_safecrlf_warn_lf_to_crlf(rel_path: &str) {
    let warning = format!(
        "warning: in the working copy of '{rel_path}', LF will be replaced by CRLF the next time Git touches it"
    );
    eprintln!("{warning}");
}
1667
1668/// Git `convert.c` `check_global_conv_flags_eol` after simulating clean + smudge.
1669fn check_safecrlf_roundtrip(
1670    conv: &ConversionConfig,
1671    file_attrs: &FileAttrs,
1672    data: &[u8],
1673    rel_path: &str,
1674    convert_crlf_into_lf: bool,
1675) -> Result<(), String> {
1676    if conv.safecrlf == SafeCrlf::False {
1677        return Ok(());
1678    }
1679
1680    let old_stats = git_text_stat(data);
1681
1682    let mut new_stats = old_stats.clone();
1683    if convert_crlf_into_lf && new_stats.crlf > 0 {
1684        new_stats.lonelf += new_stats.crlf;
1685        new_stats.crlf = 0;
1686    }
1687    if will_convert_lf_to_crlf_from_stats(&new_stats, conv, file_attrs) {
1688        new_stats.crlf += new_stats.lonelf;
1689        new_stats.lonelf = 0;
1690    }
1691
1692    if old_stats.crlf > 0 && new_stats.crlf == 0 {
1693        let msg = format!("fatal: CRLF would be replaced by LF in {rel_path}");
1694        if conv.safecrlf == SafeCrlf::True {
1695            return Err(msg);
1696        }
1697        eprint_safecrlf_warn_crlf_to_lf(rel_path);
1698    } else if old_stats.lonelf > 0 && new_stats.lonelf == 0 {
1699        let msg = format!("fatal: LF would be replaced by CRLF in {rel_path}");
1700        if conv.safecrlf == SafeCrlf::True {
1701            return Err(msg);
1702        }
1703        eprint_safecrlf_warn_lf_to_crlf(rel_path);
1704    }
1705
1706    Ok(())
1707}
1708
/// Replace every CRLF pair with a single LF.
///
/// A CR that is not immediately followed by LF is kept as-is.
pub fn crlf_to_lf(data: &[u8]) -> Vec<u8> {
    let mut converted = Vec::with_capacity(data.len());
    let mut bytes = data.iter().copied().peekable();
    while let Some(byte) = bytes.next() {
        // Drop the CR of a CRLF pair; the LF is emitted on the next iteration.
        if byte == b'\r' && bytes.peek() == Some(&b'\n') {
            continue;
        }
        converted.push(byte);
    }
    converted
}
1724
/// Replace every lone LF with CRLF.
///
/// An LF that already follows a CR is left untouched, so existing CRLF pairs are not doubled.
pub fn lf_to_crlf(data: &[u8]) -> Vec<u8> {
    let mut converted = Vec::with_capacity(data.len() + data.len() / 10);
    let mut previous: Option<u8> = None;
    for &byte in data {
        if byte == b'\n' && previous != Some(b'\r') {
            converted.push(b'\r');
        }
        converted.push(byte);
        previous = Some(byte);
    }
    converted
}
1740
1741// ---------------------------------------------------------------------------
1742// Output (checkout / smudge) direction
1743// ---------------------------------------------------------------------------
1744
1745/// Convert data from the object database for writing to the working tree
1746/// (the "smudge" direction).
1747///
1748/// This handles (Git `convert_to_working_tree_ca_internal` order):
1749/// 1. Ident keyword expansion
1750/// 2. LF → CRLF conversion based on config + attributes
1751/// 3. `working-tree-encoding` (UTF-8 blob → working tree bytes)
1752/// 4. Smudge filter execution
1753///
1754/// Returns `Ok(None)` when the process filter returned `status=delayed` and `delayed_checkout` was
1755/// provided (Git `delayed_checkout`); the path is queued for [`crate::filter_process::DelayedProcessCheckout::finish`].
1756pub fn convert_to_worktree(
1757    data: &[u8],
1758    rel_path: &str,
1759    conv: &ConversionConfig,
1760    file_attrs: &FileAttrs,
1761    oid_hex: Option<&str>,
1762    smudge_meta: Option<&FilterSmudgeMeta>,
1763    delayed_checkout: Option<&mut crate::filter_process::DelayedProcessCheckout>,
1764) -> Result<Option<Vec<u8>>, String> {
1765    let mut buf = data.to_vec();
1766
1767    // 1. Ident expansion
1768    if file_attrs.ident {
1769        if let Some(oid) = oid_hex {
1770            buf = expand_ident(&buf, oid);
1771        }
1772    }
1773
1774    let can_delay_smudge = delayed_checkout.is_some()
1775        && file_attrs.working_tree_encoding.is_none()
1776        && !file_attrs.ident
1777        && file_attrs
1778            .filter_process
1779            .as_deref()
1780            .is_some_and(|c| !c.is_empty())
1781        && !should_convert_to_crlf(conv, file_attrs, &buf)
1782        && file_attrs
1783            .filter_process
1784            .as_deref()
1785            .is_some_and(crate::filter_process::process_filter_supports_delay);
1786
1787    // 2. LF→CRLF for working tree
1788    let should_convert = should_convert_to_crlf(conv, file_attrs, &buf);
1789    if should_convert {
1790        buf = lf_to_crlf(&buf);
1791    }
1792
1793    // 3. working-tree-encoding (Git `encode_to_worktree`)
1794    if let Some(ref enc) = file_attrs.working_tree_encoding {
1795        buf = encode_utf8_blob_to_working_tree_bytes(&buf, rel_path, enc)?;
1796    }
1797
1798    // 4. Smudge filter — process driver overrides shell smudge
1799    let driver = file_attrs.filter_driver_name.as_deref().unwrap_or("");
1800    if let Some(ref proc_cmd) = file_attrs.filter_process {
1801        let smudge_out =
1802            match apply_process_smudge(proc_cmd, rel_path, &buf, smudge_meta, can_delay_smudge) {
1803                Ok(out) => out,
1804                Err(e) => {
1805                    if file_attrs.filter_smudge_required {
1806                        return Err(format!("fatal: {rel_path}: smudge filter {driver} failed"));
1807                    }
1808                    if e.starts_with("filter status: abort") {
1809                        crate::filter_process::disable_process_filter(proc_cmd);
1810                    }
1811                    eprintln!("error: external filter '{driver}' failed");
1812                    return Ok(Some(buf));
1813                }
1814            };
1815        let Some(out) = smudge_out else {
1816            let Some(q) = delayed_checkout else {
1817                return Err(format!(
1818                    "internal error: delayed smudge without checkout queue for {rel_path}"
1819                ));
1820            };
1821            q.push_delayed(
1822                proc_cmd.clone(),
1823                rel_path.to_string(),
1824                smudge_meta.cloned().unwrap_or_default(),
1825            );
1826            return Ok(None);
1827        };
1828        buf = out;
1829    } else {
1830        match file_attrs.filter_smudge.as_ref() {
1831            Some(smudge_cmd) => match run_filter(smudge_cmd, &buf, rel_path) {
1832                Ok(filtered) => buf = filtered,
1833                Err(_e) => {
1834                    if file_attrs.filter_smudge_required {
1835                        return Err(format!("fatal: {rel_path}: smudge filter {driver} failed"));
1836                    }
1837                }
1838            },
1839            None => {
1840                if file_attrs.filter_smudge_required {
1841                    return Err(format!("fatal: {rel_path}: smudge filter {driver} failed"));
1842                }
1843            }
1844        }
1845    }
1846
1847    Ok(Some(buf))
1848}
1849
1850/// Like [`convert_to_worktree`] without delayed-checkout queueing (always materializes or errors).
1851#[must_use]
1852pub fn convert_to_worktree_eager(
1853    data: &[u8],
1854    rel_path: &str,
1855    conv: &ConversionConfig,
1856    file_attrs: &FileAttrs,
1857    oid_hex: Option<&str>,
1858    smudge_meta: Option<&FilterSmudgeMeta>,
1859) -> Result<Vec<u8>, String> {
1860    match convert_to_worktree(data, rel_path, conv, file_attrs, oid_hex, smudge_meta, None)? {
1861        Some(v) => Ok(v),
1862        None => Err(format!(
1863            "internal error: unexpected delayed smudge for {rel_path}"
1864        )),
1865    }
1866}
1867
1868/// Decide whether to convert LF→CRLF on output (working tree / smudge direction).
1869#[must_use]
1870pub fn should_convert_to_crlf(conv: &ConversionConfig, attrs: &FileAttrs, data: &[u8]) -> bool {
1871    match attrs.crlf_legacy {
1872        CrlfLegacyAttr::Unset | CrlfLegacyAttr::Input => return false,
1873        CrlfLegacyAttr::Crlf => {
1874            if attrs.text == TextAttr::Unset {
1875                return false;
1876            }
1877            // Legacy `crlf` (set) forces CRLF on checkout (even for paths Git
1878            // would otherwise treat as binary; see t0020 "t* crlf" + `three`).
1879            return true;
1880        }
1881        CrlfLegacyAttr::Unspecified => {}
1882    }
1883
1884    // If text is explicitly unset, never convert
1885    if attrs.text == TextAttr::Unset {
1886        return false;
1887    }
1888
1889    // If there's an explicit eol attribute
1890    if attrs.eol != EolAttr::Unspecified {
1891        if attrs.text == TextAttr::Auto && is_binary(data) {
1892            return false;
1893        }
1894        if attrs.eol != EolAttr::Crlf {
1895            return false;
1896        }
1897        // `text=auto` + `eol=crlf` → Git `CRLF_AUTO_CRLF` (safe mixed handling).
1898        if attrs.text == TextAttr::Auto {
1899            return auto_crlf_should_smudge_lf_to_crlf(data);
1900        }
1901        // Explicit `eol=crlf` with `text` set, etc. → `CRLF_TEXT_CRLF` (always normalize).
1902        return true;
1903    }
1904
1905    // If text is explicitly set, use eol config
1906    if attrs.text == TextAttr::Set {
1907        return output_eol_is_crlf(conv);
1908    }
1909
1910    if attrs.text == TextAttr::Auto {
1911        if is_binary(data) {
1912            return false;
1913        }
1914        if !output_eol_is_crlf(conv) {
1915            return false;
1916        }
1917        return auto_crlf_should_smudge_lf_to_crlf(data);
1918    }
1919
1920    // No text attribute: fall back to core.autocrlf
1921    match conv.autocrlf {
1922        AutoCrlf::True => {
1923            if is_binary(data) {
1924                return false;
1925            }
1926            auto_crlf_should_smudge_lf_to_crlf(data)
1927        }
1928        AutoCrlf::Input | AutoCrlf::False => false,
1929    }
1930}
1931
1932/// Whether the output EOL should be CRLF based on config.
1933fn output_eol_is_crlf(conv: &ConversionConfig) -> bool {
1934    // Git `text_eol_is_crlf`: autocrlf=input forces LF output before `core.eol` is consulted.
1935    if conv.autocrlf == AutoCrlf::Input {
1936        return false;
1937    }
1938    if conv.autocrlf == AutoCrlf::True {
1939        return true;
1940    }
1941    match conv.eol {
1942        CoreEol::Crlf => true,
1943        CoreEol::Lf => false,
1944        CoreEol::Native => {
1945            // On Unix, native is LF
1946            cfg!(windows)
1947        }
1948    }
1949}
1950
1951/// Expand `$Id$` → `$Id: <oid>$` in data.
1952///
1953/// Matches Git's `ident_to_worktree` in `convert.c`: same-line `$` terminator, and foreign
1954/// idents (internal spaces before the closing `$`) are left unchanged.
1955fn expand_ident(data: &[u8], oid: &str) -> Vec<u8> {
1956    if !count_ident_regions(data) {
1957        return data.to_vec();
1958    }
1959    let replacement = format!("$Id: {oid} $");
1960    let mut out = Vec::with_capacity(data.len() + 60);
1961    let mut i = 0;
1962    while i < data.len() {
1963        if data[i] != b'$' {
1964            out.push(data[i]);
1965            i += 1;
1966            continue;
1967        }
1968        if i + 3 > data.len() || data[i + 1] != b'I' || data[i + 2] != b'd' {
1969            out.push(data[i]);
1970            i += 1;
1971            continue;
1972        }
1973        let after_id = i + 3;
1974        let ch = data.get(after_id).copied();
1975        match ch {
1976            Some(b'$') => {
1977                out.extend_from_slice(replacement.as_bytes());
1978                i = after_id + 1;
1979            }
1980            Some(b':') => {
1981                let rest = &data[after_id + 1..];
1982                let line_end = rest
1983                    .iter()
1984                    .position(|&b| b == b'\n' || b == b'\r')
1985                    .unwrap_or(rest.len());
1986                let line = &rest[..line_end];
1987                let Some(dollar_rel) = line.iter().position(|&b| b == b'$') else {
1988                    out.push(data[i]);
1989                    i += 1;
1990                    continue;
1991                };
1992                if line[..dollar_rel].contains(&b'\n') {
1993                    out.push(data[i]);
1994                    i += 1;
1995                    continue;
1996                }
1997                // Foreign ident (Git `ident_to_worktree`): first space in the payload after the
1998                // byte following `:` must not be the last character before `$`.
1999                let payload = &line[..dollar_rel];
2000                let foreign = payload.len() > 1
2001                    && payload[1..]
2002                        .iter()
2003                        .position(|&b| b == b' ')
2004                        .is_some_and(|rel| {
2005                            let pos = 1 + rel;
2006                            pos < payload.len().saturating_sub(1)
2007                        });
2008                if foreign {
2009                    out.push(data[i]);
2010                    i += 1;
2011                    continue;
2012                }
2013                out.extend_from_slice(replacement.as_bytes());
2014                i = after_id + 1 + dollar_rel + 1;
2015            }
2016            _ => {
2017                out.push(data[i]);
2018                i += 1;
2019            }
2020        }
2021    }
2022    out
2023}
2024
/// Whether the buffer contains any `$Id$` / `$Id: ... $` regions Git would rewrite (`count_ident`).
///
/// An expanded region only counts when its closing `$` appears before the next `\n` or `\r`.
fn count_ident_regions(data: &[u8]) -> bool {
    data.iter()
        .enumerate()
        .filter(|&(_, &byte)| byte == b'$')
        .any(|(start, _)| {
            let tail = &data[start..];
            if tail.starts_with(b"$Id$") {
                return true;
            }
            match tail.strip_prefix(b"$Id:") {
                // The first of `$`, LF or CR after the colon decides: only `$` closes the keyword.
                Some(rest) => {
                    rest.iter()
                        .find(|&&byte| matches!(byte, b'$' | b'\n' | b'\r'))
                        == Some(&b'$')
                }
                None => false,
            }
        })
}
2063
/// Collapse `$Id: ... $` back to `$Id$`.
///
/// A keyword is only collapsed when its closing `$` appears before the next `\n` or `\r`;
/// everything else is copied through unchanged.
pub fn collapse_ident(data: &[u8]) -> Vec<u8> {
    let mut collapsed = Vec::with_capacity(data.len());
    let mut remaining = data;
    while let Some((&first, tail)) = remaining.split_first() {
        if let Some(after_colon) = remaining.strip_prefix(b"$Id:") {
            // The first of `$`, LF or CR decides whether the keyword is closed on this line.
            let closing = after_colon
                .iter()
                .position(|&byte| matches!(byte, b'$' | b'\n' | b'\r'))
                .filter(|&idx| after_colon[idx] == b'$');
            if let Some(idx) = closing {
                collapsed.extend_from_slice(b"$Id$");
                remaining = &after_colon[idx + 1..];
                continue;
            }
        }
        collapsed.push(first);
        remaining = tail;
    }
    collapsed
}
2087
/// Shell-quote `s` with single quotes, matching Git's `sq_quote_buf`.
///
/// The whole string is wrapped in a single-quote pair. Each `'` becomes `'\''` and each `!`
/// becomes `'\!'`: the quote is closed, the character is emitted backslash-escaped, and the
/// quote is reopened.
fn sq_quote_buf(s: &str) -> String {
    let mut out = String::with_capacity(s.len() + 2);
    out.push('\'');
    for ch in s.chars() {
        if matches!(ch, '\'' | '!') {
            out.push_str("'\\");
            out.push(ch);
            out.push('\'');
        } else {
            out.push(ch);
        }
    }
    out.push('\'');
    out
}
2102
2103/// Expand Git filter command placeholders: `%%` → `%`, `%f` → quoted repository-relative path.
2104fn expand_filter_command(cmd: &str, rel_path: &str) -> String {
2105    let mut out = String::with_capacity(cmd.len() + rel_path.len() + 8);
2106    let mut chars = cmd.chars().peekable();
2107    while let Some(c) = chars.next() {
2108        if c == '%' {
2109            match chars.peek() {
2110                Some('%') => {
2111                    chars.next();
2112                    out.push('%');
2113                }
2114                Some('f') => {
2115                    chars.next();
2116                    out.push_str(&sq_quote_buf(rel_path));
2117                }
2118                _ => out.push('%'),
2119            }
2120        } else {
2121            out.push(c);
2122        }
2123    }
2124    out
2125}
2126
2127/// Run a filter command, piping data through stdin→stdout.
2128fn run_filter(cmd: &str, data: &[u8], rel_path: &str) -> Result<Vec<u8>, std::io::Error> {
2129    let expanded = expand_filter_command(cmd, rel_path);
2130    let mut child = Command::new("sh")
2131        .arg("-c")
2132        .arg(&expanded)
2133        .stdin(Stdio::piped())
2134        .stdout(Stdio::piped())
2135        .stderr(Stdio::inherit())
2136        .spawn()?;
2137
2138    use std::io::{ErrorKind, Write};
2139    if let Some(ref mut stdin) = child.stdin {
2140        if let Err(e) = stdin.write_all(data) {
2141            // Match Git: if the filter exits without reading stdin, ignore EPIPE.
2142            if e.kind() != ErrorKind::BrokenPipe {
2143                return Err(e);
2144            }
2145        }
2146    }
2147    drop(child.stdin.take());
2148
2149    let output = child.wait_with_output()?;
2150    if !output.status.success() {
2151        return Err(std::io::Error::other(format!(
2152            "filter command exited with status {}",
2153            output.status
2154        )));
2155    }
2156
2157    Ok(output.stdout)
2158}
2159
// The `AttrRule` type itself is treated as internal; the rule list is exposed through
// `load_gitattributes`. The public API works with the `Vec` returned by `load_gitattributes`
// together with `get_file_attrs`.
2162
/// Type alias for the loaded gitattributes rules (a `Vec` of [`AttrRule`]).
///
/// Intended to be treated as opaque by callers: obtain it from `load_gitattributes` and hand it
/// to `get_file_attrs` rather than inspecting individual rules.
pub type GitAttributes = Vec<AttrRule>;
2165
#[cfg(test)]
mod tests {
    use super::*;

    /// Conversion settings shared by the smudge tests: `core.autocrlf=true`, `core.eol=lf`,
    /// `core.safecrlf=false`.
    fn autocrlf_true_config() -> ConversionConfig {
        ConversionConfig {
            autocrlf: AutoCrlf::True,
            eol: CoreEol::Lf,
            safecrlf: SafeCrlf::False,
            check_roundtrip_encoding: None,
        }
    }

    #[test]
    fn test_crlf_to_lf() {
        let cases: [(&[u8], &[u8]); 3] = [
            (b"hello\r\nworld\r\n", b"hello\nworld\n"),
            (b"hello\nworld\n", b"hello\nworld\n"),
            (b"hello\r\n", b"hello\n"),
        ];
        for (input, expected) in cases {
            assert_eq!(crlf_to_lf(input), expected);
        }
    }

    #[test]
    fn test_lf_to_crlf() {
        let cases: [(&[u8], &[u8]); 2] = [
            (b"hello\nworld\n", b"hello\r\nworld\r\n"),
            (b"hello\r\nworld\r\n", b"hello\r\nworld\r\n"),
        ];
        for (input, expected) in cases {
            assert_eq!(lf_to_crlf(input), expected);
        }
    }

    #[test]
    fn test_has_crlf() {
        assert!(has_crlf(b"hello\r\nworld"));
        assert!(!has_crlf(b"hello\nworld"));
    }

    #[test]
    fn smudge_mixed_line_endings_unchanged_with_autocrlf_true() {
        // One CRLF line among LF lines: the blob must come out of the smudge untouched.
        let blob = b"Oh\nhere\nis\nCRLF\r\nin\ntext\n".to_vec();
        let conv = autocrlf_true_config();
        let attrs = FileAttrs::default();
        let smudged =
            convert_to_worktree_eager(&blob, "mixed", &conv, &attrs, None, None).unwrap();
        assert_eq!(smudged, blob);
    }

    #[test]
    fn smudge_lf_only_gets_crlf_with_autocrlf_true() {
        let conv = autocrlf_true_config();
        let attrs = FileAttrs::default();
        let smudged =
            convert_to_worktree_eager(b"a\nb\n", "x", &conv, &attrs, None, None).unwrap();
        assert_eq!(smudged, b"a\r\nb\r\n");
    }

    #[test]
    fn test_is_binary() {
        assert!(is_binary(b"hello\0world"));
        assert!(!is_binary(b"hello world"));
    }

    #[test]
    fn attr_dir_only_pattern_does_not_match_same_named_file() {
        let rules = parse_gitattributes_content("ignored-only-if-dir/ export-ignore\n");
        let dir_rule = &rules[0];
        assert!(dir_rule.must_be_dir);
        assert!(dir_rule.basename_only);
        // A regular file with the same basename must not match the directory-only pattern.
        assert!(!attr_rule_matches(
            dir_rule,
            "not-ignored-dir/ignored-only-if-dir",
            false
        ));
        assert!(attr_rule_matches(dir_rule, "ignored-only-if-dir", true));
    }

    #[test]
    fn test_expand_collapse_ident() {
        let expanded = expand_ident(b"$Id$", "abc123");
        assert_eq!(expanded, b"$Id: abc123 $");
        assert_eq!(collapse_ident(&expanded), b"$Id$");
    }

    #[test]
    fn expand_ident_does_not_span_lines_for_partial_keyword() {
        let input = b"$Id: NoTerminatingSymbol\n$Id: deadbeef $\n";
        assert_eq!(
            expand_ident(input, "newoid"),
            b"$Id: NoTerminatingSymbol\n$Id: newoid $\n"
        );
    }

    #[test]
    fn expand_ident_preserves_foreign_id_with_internal_spaces() {
        let input = b"$Id: Foreign Commit With Spaces $\n";
        assert_eq!(expand_ident(input, "abc"), input);
    }

    #[test]
    fn expand_filter_command_percent_f_quotes_path() {
        let quoted = expand_filter_command("sh ./x.sh %f --extra", "name  with 'sq'");
        assert_eq!(quoted, "sh ./x.sh 'name  with '\\''sq'\\''' --extra");
        let literal_percent = expand_filter_command("a %% b", "p");
        assert_eq!(literal_percent, "a % b");
    }
}
grit_lib/crlf.rs

grit_lib/
crlf.rs