Skip to main content

grit_lib/
crlf.rs

1//! CRLF / EOL conversion and clean/smudge filter support.
2//!
3//! This module handles line-ending conversion when staging files (`git add`)
4//! and checking out files (`git checkout`, `read-tree -u`, `checkout-index`).
5//!
6//! Config knobs:
7//!   - `core.autocrlf` (true / input / false)
8//!   - `core.eol` (lf / crlf / native)
9//!   - `core.safecrlf` (true / warn / false)
10//!
11//! Gitattributes:
12//!   - `text` / `text=auto` / `-text` / `binary`
13//!   - `eol=lf` / `eol=crlf`
14//!   - `filter=<name>` (with `filter.<name>.clean` / `filter.<name>.smudge`)
15//!   - `ident` keyword expansion
16
17use std::path::{Path, PathBuf};
18use std::process::{Command, Stdio};
19
20use crate::config::ConfigSet;
21
/// Parsed value of the `core.autocrlf` configuration key.
///
/// Built by [`ConversionConfig::from_config`], which maps `true`/`yes`/`on`/`1`
/// to [`AutoCrlf::True`], `input` to [`AutoCrlf::Input`] and anything else
/// (including an absent key) to [`AutoCrlf::False`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AutoCrlf {
    /// CRLF → LF on input and LF → CRLF on checkout for non-binary content.
    True,
    /// CRLF → LF on input only; checkout never adds CRs because of this setting.
    Input,
    /// No conversion is triggered by this setting.
    False,
}
29
/// Parsed value of the `core.eol` configuration key.
///
/// Only consulted by `output_eol_is_crlf`, and only when `core.autocrlf` is
/// not `true`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CoreEol {
    /// Check out text files with LF line endings.
    Lf,
    /// Check out text files with CRLF line endings.
    Crlf,
    /// Platform default: CRLF when compiled for Windows, LF otherwise.
    /// Also the fallback for an absent or unrecognized value.
    Native,
}
37
/// Parsed value of the `core.safecrlf` configuration key.
///
/// Controls what `check_safecrlf_input` does when cleaning a file would not
/// round-trip its line endings.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SafeCrlf {
    /// Refuse the conversion: the clean step returns an error.
    True,
    /// Allow the conversion but print a warning to stderr.
    Warn,
    /// Perform no round-trip check (also used when the key is absent or unrecognized).
    False,
}
45
/// State of the `text` attribute for one path, as resolved by
/// [`get_file_attrs`] from `.gitattributes` rules.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TextAttr {
    /// `text` — always treat as text.
    Set,
    /// `text=auto` — treat as text unless the content looks binary.
    Auto,
    /// `-text` or `binary` — never convert.
    Unset,
    /// No text attribute specified (or a value other than set/unset/`auto`).
    Unspecified,
}
58
/// State of the `eol` attribute for one path, as resolved by
/// [`get_file_attrs`] from `.gitattributes` rules.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EolAttr {
    /// `eol=lf` — normalize on input, never add CRs on checkout.
    Lf,
    /// `eol=crlf` — normalize on input, write CRLF on checkout.
    Crlf,
    /// No `eol` attribute, or a value other than `lf` / `crlf`.
    Unspecified,
}
66
/// Legacy `crlf` gitattribute (deprecated in Git; still honored for EOL conversion).
///
/// When this is anything other than [`CrlfLegacyAttr::Unspecified`] it is
/// evaluated before the `text` / `eol` attributes in both conversion directions.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum CrlfLegacyAttr {
    /// Attribute absent (or an unrecognized value); the modern attributes decide.
    #[default]
    Unspecified,
    /// `-crlf` — disable CRLF conversion.
    Unset,
    /// `crlf=input` — normalize to LF in the object database; no CRLF on checkout.
    Input,
    /// Bare `crlf` (set) — force CRLF on checkout for text files.
    Crlf,
}
79
/// State of the `merge` attribute for one path, as resolved by
/// [`get_file_attrs`] from `.gitattributes` rules.
///
/// Not `Copy` because the driver variant owns its name.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum MergeAttr {
    /// No merge attribute specified (a bare `merge` also resolves to this).
    Unspecified,
    /// `-merge` — treat as binary/non-text merge.
    Unset,
    /// `merge=<driver>` — use named merge driver.
    Driver(String),
}
90
/// Resolved per-path attributes that drive conversion, diff and merge.
///
/// Produced by [`get_file_attrs`]; the `Default` value represents a path that
/// no `.gitattributes` rule matched.
#[derive(Debug, Clone)]
pub struct FileAttrs {
    /// State of the `text` attribute.
    pub text: TextAttr,
    /// State of the `eol` attribute.
    pub eol: EolAttr,
    /// Diff driver name from `diff=<driver>` attribute.
    pub diff_driver: Option<String>,
    /// Shell command from `filter.<name>.clean`, if the path has a filter and
    /// the config defines one.
    pub filter_clean: Option<String>,
    /// Shell command from `filter.<name>.smudge`, if the path has a filter and
    /// the config defines one.
    pub filter_smudge: Option<String>,
    /// Whether `filter.<name>.required` is set for the active smudge filter.
    pub filter_smudge_required: bool,
    /// Whether the `ident` attribute is set (enables `$Id$` expansion on checkout).
    pub ident: bool,
    /// State of the `merge` attribute.
    pub merge: MergeAttr,
    /// Raw value of the `conflict-marker-size` attribute; stored unparsed.
    pub conflict_marker_size: Option<String>,
    /// Working tree encoding (e.g. "utf-16") — content is converted to UTF-8 on add.
    pub working_tree_encoding: Option<String>,
    /// Legacy `crlf` / `-crlf` / `crlf=input` from `.gitattributes`.
    pub crlf_legacy: CrlfLegacyAttr,
}
110
111impl Default for FileAttrs {
112    fn default() -> Self {
113        FileAttrs {
114            text: TextAttr::Unspecified,
115            eol: EolAttr::Unspecified,
116            diff_driver: None,
117            filter_clean: None,
118            filter_smudge: None,
119            filter_smudge_required: false,
120            ident: false,
121            merge: MergeAttr::Unspecified,
122            conflict_marker_size: None,
123            working_tree_encoding: None,
124            crlf_legacy: CrlfLegacyAttr::Unspecified,
125        }
126    }
127}
128
/// Repository-wide line-ending settings, read once from configuration and
/// passed to every conversion call.
#[derive(Debug, Clone)]
pub struct ConversionConfig {
    /// Value of `core.autocrlf`.
    pub autocrlf: AutoCrlf,
    /// Value of `core.eol`.
    pub eol: CoreEol,
    /// Value of `core.safecrlf`.
    pub safecrlf: SafeCrlf,
}
136
137impl ConversionConfig {
138    /// Load conversion settings from a ConfigSet.
139    pub fn from_config(config: &ConfigSet) -> Self {
140        let autocrlf = match config.get("core.autocrlf") {
141            Some(v) => match v.to_lowercase().as_str() {
142                "true" | "yes" | "on" | "1" => AutoCrlf::True,
143                "input" => AutoCrlf::Input,
144                _ => AutoCrlf::False,
145            },
146            None => AutoCrlf::False,
147        };
148
149        let eol = match config.get("core.eol") {
150            Some(v) => match v.to_lowercase().as_str() {
151                "crlf" => CoreEol::Crlf,
152                "lf" => CoreEol::Lf,
153                "native" => CoreEol::Native,
154                _ => CoreEol::Native,
155            },
156            None => CoreEol::Native,
157        };
158
159        let safecrlf = match config.get("core.safecrlf") {
160            Some(v) => match v.to_lowercase().as_str() {
161                "true" | "yes" | "on" | "1" => SafeCrlf::True,
162                "warn" => SafeCrlf::Warn,
163                _ => SafeCrlf::False,
164            },
165            None => SafeCrlf::False,
166        };
167
168        ConversionConfig {
169            autocrlf,
170            eol,
171            safecrlf,
172        }
173    }
174}
175
/// One parsed `.gitattributes` line: a path pattern plus the attribute
/// assignments that apply to paths matching it.
#[derive(Debug, Clone)]
pub struct AttrRule {
    /// Glob pattern exactly as written in the file (first whitespace-separated token).
    pattern: String,
    /// Attribute assignments in file order.
    attrs: Vec<(String, String)>, // (name, value) where value is "set"/"unset"/specific value
}
182
183/// Load .gitattributes from the worktree root.
184pub fn load_gitattributes(work_tree: &Path) -> Vec<AttrRule> {
185    let mut rules = Vec::new();
186
187    let root_attrs = work_tree.join(".gitattributes");
188    if let Ok(content) = std::fs::read_to_string(&root_attrs) {
189        parse_gitattributes(&content, &mut rules);
190    }
191
192    let info_attrs = work_tree.join(".git/info/attributes");
193    if let Ok(content) = std::fs::read_to_string(&info_attrs) {
194        parse_gitattributes(&content, &mut rules);
195    }
196
197    rules
198}
199
200/// Parse gitattributes content into attribute rules.
201///
202/// This is useful when attributes are sourced from non-worktree inputs
203/// (for example, tree objects selected by `--attr-source`).
204#[must_use]
205pub fn parse_gitattributes_content(content: &str) -> Vec<AttrRule> {
206    let mut rules = Vec::new();
207    parse_gitattributes(content, &mut rules);
208    rules
209}
210
211/// Load .gitattributes from the index (for use during checkout when
212/// the worktree file may not yet exist).
213pub fn load_gitattributes_from_index(
214    index: &crate::index::Index,
215    odb: &crate::odb::Odb,
216) -> Vec<AttrRule> {
217    let mut rules = Vec::new();
218
219    // Look for .gitattributes in the index (stage 0)
220    if let Some(entry) = index.get(b".gitattributes", 0) {
221        if let Ok(obj) = odb.read(&entry.oid) {
222            if let Ok(content) = String::from_utf8(obj.data) {
223                parse_gitattributes(&content, &mut rules);
224            }
225        }
226    }
227
228    rules
229}
230
231/// Load `.gitattributes` rules that apply to `rel_path`, including root and
232/// nested `dir/.gitattributes` along parent directories (Git-consistent order:
233/// root first, then each ancestor directory; later rules win in [`get_file_attrs`]).
234///
235/// Reads from the working tree when present, otherwise from a stage-0 index entry.
236pub fn load_gitattributes_for_checkout(
237    work_tree: &Path,
238    rel_path: &str,
239    index: &crate::index::Index,
240    odb: &crate::odb::Odb,
241) -> Vec<AttrRule> {
242    let mut rules = load_gitattributes(work_tree);
243
244    // Root `.gitattributes` may exist only in the index while the worktree file
245    // is missing (e.g. t0020 in-tree attributes after `rm -rf .gitattributes`).
246    if !work_tree.join(".gitattributes").exists() {
247        if let Some(entry) = index.get(b".gitattributes", 0) {
248            if let Ok(obj) = odb.read(&entry.oid) {
249                if let Ok(content) = String::from_utf8(obj.data) {
250                    parse_gitattributes(&content, &mut rules);
251                }
252            }
253        }
254    }
255
256    let path = Path::new(rel_path);
257    if let Some(parent) = path.parent() {
258        let mut accum = PathBuf::new();
259        for comp in parent.components() {
260            accum.push(comp);
261            let ga_rel = accum.join(".gitattributes");
262            let wt_ga = work_tree.join(&ga_rel);
263            if let Ok(content) = std::fs::read_to_string(&wt_ga) {
264                parse_gitattributes(&content, &mut rules);
265            } else {
266                let key = path_to_index_bytes(&ga_rel);
267                if let Some(entry) = index.get(&key, 0) {
268                    if let Ok(obj) = odb.read(&entry.oid) {
269                        if let Ok(content) = String::from_utf8(obj.data) {
270                            parse_gitattributes(&content, &mut rules);
271                        }
272                    }
273                }
274            }
275        }
276    }
277
278    rules
279}
280
/// Convert a relative path into the byte string used as an index key.
///
/// On Unix the raw OS bytes are used unchanged. On other targets the path is
/// rendered lossily as UTF-8 with `\` separators replaced by `/`, so the
/// function builds everywhere instead of depending on `std::os::unix`.
fn path_to_index_bytes(path: &Path) -> Vec<u8> {
    #[cfg(unix)]
    {
        use std::os::unix::ffi::OsStrExt;
        path.as_os_str().as_bytes().to_vec()
    }
    #[cfg(not(unix))]
    {
        path.to_string_lossy().replace('\\', "/").into_bytes()
    }
}
285
286fn parse_gitattributes(content: &str, rules: &mut Vec<AttrRule>) {
287    for line in content.lines() {
288        let line = line.trim();
289        if line.is_empty() || line.starts_with('#') {
290            continue;
291        }
292
293        let mut parts = line.split_whitespace();
294        let pattern = match parts.next() {
295            Some(p) => p.to_owned(),
296            None => continue,
297        };
298
299        let mut attrs = Vec::new();
300        for part in parts {
301            if part == "binary" {
302                attrs.push(("text".to_owned(), "unset".to_owned()));
303                attrs.push(("diff".to_owned(), "unset".to_owned()));
304            } else if let Some(rest) = part.strip_prefix('-') {
305                attrs.push((rest.to_owned(), "unset".to_owned()));
306            } else if let Some((key, val)) = part.split_once('=') {
307                attrs.push((key.to_owned(), val.to_owned()));
308            } else {
309                attrs.push((part.to_owned(), "set".to_owned()));
310            }
311        }
312
313        if !attrs.is_empty() {
314            rules.push(AttrRule { pattern, attrs });
315        }
316    }
317}
318
/// Whether a configuration value spells boolean "true".
///
/// Surrounding whitespace is ignored and the comparison is ASCII
/// case-insensitive; the accepted spellings are `true`, `yes`, `on` and `1`.
fn config_bool_truthy(value: &str) -> bool {
    const TRUTHY: [&str; 4] = ["true", "yes", "on", "1"];
    let normalized = value.trim().to_ascii_lowercase();
    TRUTHY.contains(&normalized.as_str())
}
325
326/// Get file attributes for a given path from .gitattributes rules and config.
327pub fn get_file_attrs(rules: &[AttrRule], rel_path: &str, config: &ConfigSet) -> FileAttrs {
328    let mut fa = FileAttrs::default();
329
330    // Walk rules; last match wins for each attribute.
331    for rule in rules {
332        if pattern_matches(&rule.pattern, rel_path) {
333            for (name, value) in &rule.attrs {
334                match name.as_str() {
335                    "text" => {
336                        fa.text = match value.as_str() {
337                            "set" => TextAttr::Set,
338                            "unset" => TextAttr::Unset,
339                            "auto" => TextAttr::Auto,
340                            _ => TextAttr::Unspecified,
341                        };
342                    }
343                    "eol" => {
344                        fa.eol = match value.as_str() {
345                            "lf" => EolAttr::Lf,
346                            "crlf" => EolAttr::Crlf,
347                            _ => EolAttr::Unspecified,
348                        };
349                    }
350                    "filter" => {
351                        if value == "unset" {
352                            fa.filter_clean = None;
353                            fa.filter_smudge = None;
354                            fa.filter_smudge_required = false;
355                        } else {
356                            let clean_key = format!("filter.{value}.clean");
357                            let smudge_key = format!("filter.{value}.smudge");
358                            let req_key = format!("filter.{value}.required");
359                            fa.filter_clean = config.get(&clean_key);
360                            fa.filter_smudge = config.get(&smudge_key);
361                            fa.filter_smudge_required =
362                                config.get(&req_key).is_some_and(|v| config_bool_truthy(&v));
363                        }
364                    }
365                    "diff" => {
366                        if value == "unset" {
367                            fa.diff_driver = None;
368                        } else if !value.is_empty() && value != "set" {
369                            fa.diff_driver = Some(value.clone());
370                        }
371                    }
372                    "ident" => {
373                        fa.ident = value == "set";
374                    }
375                    "merge" => {
376                        fa.merge = match value.as_str() {
377                            "unset" => MergeAttr::Unset,
378                            "set" => MergeAttr::Unspecified,
379                            other => MergeAttr::Driver(other.to_string()),
380                        };
381                    }
382                    "conflict-marker-size" => {
383                        if value == "unset" {
384                            fa.conflict_marker_size = None;
385                        } else {
386                            fa.conflict_marker_size = Some(value.clone());
387                        }
388                    }
389                    "working-tree-encoding" => {
390                        if value != "unset" && !value.is_empty() {
391                            fa.working_tree_encoding = Some(value.clone());
392                        }
393                    }
394                    "crlf" => {
395                        fa.crlf_legacy = match value.as_str() {
396                            "unset" => CrlfLegacyAttr::Unset,
397                            "input" => CrlfLegacyAttr::Input,
398                            "set" => CrlfLegacyAttr::Crlf,
399                            _ => CrlfLegacyAttr::Unspecified,
400                        };
401                    }
402                    _ => {}
403                }
404            }
405        }
406    }
407
408    fa
409}
410
/// Decide whether a gitattributes `pattern` applies to `path`.
///
/// `path` is a `/`-separated path without a leading slash.
///
/// * A pattern starting with `/` is anchored: the slash is dropped and the
///   remainder must match the whole path. (Previously such patterns could
///   never match, because paths carry no leading slash.)
/// * A pattern containing `/` elsewhere is matched against the whole path.
/// * Any other pattern is matched against the final path component only.
///
/// NOTE(review): this is a simplified matcher — `*` and `?` also match `/`,
/// and `**`, character classes and escapes get no special treatment.
fn pattern_matches(pattern: &str, path: &str) -> bool {
    if let Some(anchored) = pattern.strip_prefix('/') {
        return glob_matches(anchored, path);
    }

    if pattern.contains('/') {
        glob_matches(pattern, path)
    } else {
        // Match against basename only
        let basename = path.rsplit('/').next().unwrap_or(path);
        glob_matches(pattern, basename)
    }
}

/// String front-end for [`glob_match_bytes`]; the whole of `text` must match.
fn glob_matches(pattern: &str, text: &str) -> bool {
    glob_match_bytes(pattern.as_bytes(), text.as_bytes())
}

/// Match `text` against a glob in which `*` matches any (possibly empty) run
/// of bytes, `?` matches exactly one byte, and every other byte matches itself.
///
/// Implemented as an iterative scan that remembers only the most recent `*`
/// and retries from there on a mismatch, so the running time is bounded by
/// `pat.len() * text.len()` instead of growing with the number of `*`s.
fn glob_match_bytes(pat: &[u8], text: &[u8]) -> bool {
    let mut p = 0;
    let mut t = 0;
    // (pattern index just after the last `*`, text index that `*` extends to).
    let mut resume: Option<(usize, usize)> = None;

    while t < text.len() {
        match pat.get(p) {
            Some(&b'*') => {
                p += 1;
                // Start by letting the star match nothing.
                resume = Some((p, t));
            }
            Some(&c) if c == b'?' || c == text[t] => {
                p += 1;
                t += 1;
            }
            _ => {
                // Mismatch: let the last star swallow one more byte and retry.
                let Some((star_p, star_t)) = resume else {
                    return false;
                };
                p = star_p;
                t = star_t + 1;
                resume = Some((star_p, star_t + 1));
            }
        }
    }

    // Text exhausted: only trailing stars may remain in the pattern.
    pat[p..].iter().all(|&b| b == b'*')
}
449
/// Heuristic binary check: `true` when a NUL byte occurs within the first
/// 8000 bytes of `data`. Bytes beyond that window are not inspected.
pub fn is_binary(data: &[u8]) -> bool {
    const SNIFF_WINDOW: usize = 8000;
    data.iter().take(SNIFF_WINDOW).any(|&byte| byte == 0)
}
455
/// Returns true if `data` contains at least one CR immediately followed by LF.
pub fn has_crlf(data: &[u8]) -> bool {
    data.iter()
        .zip(data.iter().skip(1))
        .any(|(&first, &second)| first == b'\r' && second == b'\n')
}
460
/// Returns true if `data` contains an LF that is not immediately preceded by CR.
pub fn has_lone_lf(data: &[u8]) -> bool {
    // An LF in the very first position has no predecessor, so it is lone.
    let leading_lf = data.first() == Some(&b'\n');
    leading_lf
        || data
            .windows(2)
            .any(|pair| pair[1] == b'\n' && pair[0] != b'\r')
}
470
/// Returns true if `data` contains a CR that is not immediately followed by
/// LF (Git `text_stat.lonecr`). A CR as the final byte counts as lone.
fn has_lone_cr(data: &[u8]) -> bool {
    let trailing_cr = data.last() == Some(&b'\r');
    trailing_cr
        || data
            .windows(2)
            .any(|pair| pair[0] == b'\r' && pair[1] != b'\n')
}
480
481/// Git `convert.c` `will_convert_lf_to_crlf` for `CRLF_AUTO` / `CRLF_AUTO_INPUT` / `CRLF_AUTO_CRLF`:
482/// if the blob already has CRLF pairs or lone CRs, do not convert lone LFs to CRLF on checkout.
483fn auto_crlf_should_smudge_lf_to_crlf(data: &[u8]) -> bool {
484    if !has_lone_lf(data) {
485        return false;
486    }
487    if has_lone_cr(data) || has_crlf(data) {
488        return false;
489    }
490    if is_binary(data) {
491        return false;
492    }
493    true
494}
495
/// Returns true if `data` has at least one line ending and every LF in it is
/// part of a CRLF pair (no lone LF).
pub fn is_all_crlf(data: &[u8]) -> bool {
    let mut saw_crlf = false;
    let mut previous = None;
    for &byte in data {
        if byte == b'\n' {
            if previous != Some(b'\r') {
                // A single lone LF disqualifies the whole buffer.
                return false;
            }
            saw_crlf = true;
        }
        previous = Some(byte);
    }
    saw_crlf
}
500
/// Returns true if `data` has at least one line ending and none of its LFs is
/// preceded by CR (no CRLF).
pub fn is_all_lf(data: &[u8]) -> bool {
    let mut saw_lone_lf = false;
    let mut previous = None;
    for &byte in data {
        if byte == b'\n' {
            if previous == Some(b'\r') {
                // A single CRLF pair disqualifies the whole buffer.
                return false;
            }
            saw_lone_lf = true;
        }
        previous = Some(byte);
    }
    saw_lone_lf
}
505
506// ---------------------------------------------------------------------------
507// Input (add / clean) direction
508// ---------------------------------------------------------------------------
509
510/// Convert data for storage in the index/object database (the "clean" direction).
511///
512/// This handles:
513/// 1. Clean filter execution
514/// 2. CRLF → LF conversion based on config + attributes
515/// 3. safecrlf checking
516///
517/// Returns `Ok(data)` on success, or an error if safecrlf rejects it.
518pub fn convert_to_git(
519    data: &[u8],
520    rel_path: &str,
521    conv: &ConversionConfig,
522    file_attrs: &FileAttrs,
523) -> Result<Vec<u8>, String> {
524    let mut buf = data.to_vec();
525
526    // 1. Run clean filter if configured
527    if let Some(ref clean_cmd) = file_attrs.filter_clean {
528        buf = run_filter(clean_cmd, &buf, rel_path)
529            .map_err(|e| format!("clean filter failed: {e}"))?;
530    }
531
532    // 2. Determine if we should do CRLF→LF conversion
533    let would_convert = would_convert_on_input(conv, file_attrs, &buf);
534
535    // 3. safecrlf check — always check if conversion is configured,
536    // even if no actual conversion is needed for this particular file.
537    if would_convert {
538        check_safecrlf_input(conv, &buf, rel_path)?;
539    }
540
541    // 4. Actually convert CRLF → LF if the file has CRLFs
542    if would_convert && has_crlf(&buf) {
543        buf = crlf_to_lf(&buf);
544    }
545
546    Ok(buf)
547}
548
549/// Decide whether CRLF/LF conversion is configured for this file on input.
550/// Returns true if the file *would* be subject to conversion (even if no
551/// actual bytes need changing).
552fn would_convert_on_input(conv: &ConversionConfig, attrs: &FileAttrs, data: &[u8]) -> bool {
553    match attrs.crlf_legacy {
554        CrlfLegacyAttr::Unset => return false,
555        CrlfLegacyAttr::Input => {
556            if is_binary(data) {
557                return false;
558            }
559            return true;
560        }
561        CrlfLegacyAttr::Crlf => {
562            if attrs.text == TextAttr::Unset {
563                return false;
564            }
565            if is_binary(data) {
566                return false;
567            }
568            return true;
569        }
570        CrlfLegacyAttr::Unspecified => {}
571    }
572
573    // If text is explicitly unset (-text or binary), never convert
574    if attrs.text == TextAttr::Unset {
575        return false;
576    }
577
578    // If eol attr is set, this implies text mode
579    if attrs.eol != EolAttr::Unspecified {
580        if attrs.text == TextAttr::Auto && is_binary(data) {
581            return false;
582        }
583        return true;
584    }
585
586    // If text is explicitly set, always convert
587    if attrs.text == TextAttr::Set {
588        return true;
589    }
590
591    if attrs.text == TextAttr::Auto {
592        if is_binary(data) {
593            return false;
594        }
595        return true;
596    }
597
598    // No text attribute: fall back to core.autocrlf
599    match conv.autocrlf {
600        AutoCrlf::True | AutoCrlf::Input => {
601            if is_binary(data) {
602                return false;
603            }
604            true
605        }
606        AutoCrlf::False => false,
607    }
608}
609
/// Print the Git-compatible `core.safecrlf=warn` message for the CRLF → LF
/// case to stderr.
fn eprint_safecrlf_warn_crlf_to_lf(rel_path: &str) {
    let message = format!(
        "warning: in the working copy of '{rel_path}', CRLF will be replaced by LF the next time Git touches it"
    );
    eprintln!("{message}");
}
616
/// Print the Git-compatible `core.safecrlf=warn` message for the LF → CRLF
/// case to stderr.
fn eprint_safecrlf_warn_lf_to_crlf(rel_path: &str) {
    let message = format!(
        "warning: in the working copy of '{rel_path}', LF will be replaced by CRLF the next time Git touches it"
    );
    eprintln!("{message}");
}
623
624/// Check safecrlf constraints on input.
625fn check_safecrlf_input(
626    conv: &ConversionConfig,
627    data: &[u8],
628    rel_path: &str,
629) -> Result<(), String> {
630    if conv.safecrlf == SafeCrlf::False {
631        return Ok(());
632    }
633
634    if is_binary(data) {
635        return Ok(());
636    }
637
638    let mixed = has_crlf(data) && has_lone_lf(data);
639
640    // Mixed line endings: clean would change some lines; unsafe for both autocrlf modes.
641    if mixed {
642        if conv.autocrlf == AutoCrlf::Input {
643            let msg = format!("fatal: CRLF would be replaced by LF in {rel_path}");
644            if conv.safecrlf == SafeCrlf::True {
645                return Err(msg);
646            }
647            eprint_safecrlf_warn_crlf_to_lf(rel_path);
648            return Ok(());
649        }
650        if conv.autocrlf == AutoCrlf::True {
651            let msg = format!("fatal: LF would be replaced by CRLF in {rel_path}");
652            if conv.safecrlf == SafeCrlf::True {
653                return Err(msg);
654            }
655            eprint_safecrlf_warn_lf_to_crlf(rel_path);
656            return Ok(());
657        }
658    }
659
660    // safecrlf with autocrlf=input: reject if file is all CRLF
661    // (the conversion would be irreversible — CRLF→LF, but checkout won't
662    // add CR back because autocrlf=input only strips on input)
663    if conv.autocrlf == AutoCrlf::Input && is_all_crlf(data) {
664        let msg = format!("fatal: CRLF would be replaced by LF in {rel_path}");
665        if conv.safecrlf == SafeCrlf::True {
666            return Err(msg);
667        }
668        eprint_safecrlf_warn_crlf_to_lf(rel_path);
669        return Ok(());
670    }
671
672    // safecrlf with autocrlf=true: reject if file is all LF
673    // (LF→LF on input, then LF→CRLF on checkout changes the file)
674    if conv.autocrlf == AutoCrlf::True && is_all_lf(data) {
675        let msg = format!("fatal: LF would be replaced by CRLF in {rel_path}");
676        if conv.safecrlf == SafeCrlf::True {
677            return Err(msg);
678        }
679        eprint_safecrlf_warn_lf_to_crlf(rel_path);
680        return Ok(());
681    }
682
683    Ok(())
684}
685
/// Return a copy of `data` in which every CRLF pair is replaced by a single LF.
///
/// CRs that are not immediately followed by LF are kept.
pub fn crlf_to_lf(data: &[u8]) -> Vec<u8> {
    let mut converted = Vec::with_capacity(data.len());
    let mut bytes = data.iter().copied().peekable();
    while let Some(byte) = bytes.next() {
        // Drop the CR of a CRLF pair; the LF is emitted on the next iteration.
        if byte == b'\r' && bytes.peek() == Some(&b'\n') {
            continue;
        }
        converted.push(byte);
    }
    converted
}
701
/// Return a copy of `data` in which every lone LF becomes CRLF.
///
/// LFs that already follow a CR are left as they are, so existing CRLF pairs
/// are not doubled.
pub fn lf_to_crlf(data: &[u8]) -> Vec<u8> {
    // Reserve roughly 10% extra for the inserted CRs.
    let mut converted = Vec::with_capacity(data.len() + data.len() / 10);
    let mut previous = None;
    for &byte in data {
        if byte == b'\n' && previous != Some(b'\r') {
            converted.push(b'\r');
        }
        converted.push(byte);
        previous = Some(byte);
    }
    converted
}
717
718// ---------------------------------------------------------------------------
719// Output (checkout / smudge) direction
720// ---------------------------------------------------------------------------
721
722/// Convert data from the object database for writing to the working tree
723/// (the "smudge" direction).
724///
725/// This handles:
726/// 1. LF → CRLF conversion based on config + attributes
727/// 2. Smudge filter execution
728/// 3. Ident keyword expansion
729pub fn convert_to_worktree(
730    data: &[u8],
731    rel_path: &str,
732    conv: &ConversionConfig,
733    file_attrs: &FileAttrs,
734    oid_hex: Option<&str>,
735) -> std::io::Result<Vec<u8>> {
736    let mut buf = data.to_vec();
737
738    // 1. Ident expansion
739    if file_attrs.ident {
740        if let Some(oid) = oid_hex {
741            buf = expand_ident(&buf, oid);
742        }
743    }
744
745    // 2. Smudge filter (before EOL conversion) — matches Git's checkout pipeline
746    if let Some(ref smudge_cmd) = file_attrs.filter_smudge {
747        match run_filter(smudge_cmd, &buf, rel_path) {
748            Ok(filtered) => buf = filtered,
749            Err(e) => {
750                if file_attrs.filter_smudge_required {
751                    return Err(e);
752                }
753            }
754        }
755    }
756
757    // 3. LF→CRLF for working tree
758    let should_convert = should_convert_to_crlf(conv, file_attrs, &buf);
759    if should_convert {
760        buf = lf_to_crlf(&buf);
761    }
762
763    Ok(buf)
764}
765
766/// Decide whether to convert LF→CRLF on output.
767fn should_convert_to_crlf(conv: &ConversionConfig, attrs: &FileAttrs, data: &[u8]) -> bool {
768    match attrs.crlf_legacy {
769        CrlfLegacyAttr::Unset | CrlfLegacyAttr::Input => return false,
770        CrlfLegacyAttr::Crlf => {
771            if attrs.text == TextAttr::Unset {
772                return false;
773            }
774            // Legacy `crlf` (set) forces CRLF on checkout (even for paths Git
775            // would otherwise treat as binary; see t0020 "t* crlf" + `three`).
776            return true;
777        }
778        CrlfLegacyAttr::Unspecified => {}
779    }
780
781    // If text is explicitly unset, never convert
782    if attrs.text == TextAttr::Unset {
783        return false;
784    }
785
786    // If there's an explicit eol attribute
787    if attrs.eol != EolAttr::Unspecified {
788        if attrs.text == TextAttr::Auto && is_binary(data) {
789            return false;
790        }
791        if attrs.eol != EolAttr::Crlf {
792            return false;
793        }
794        // `text=auto` + `eol=crlf` → Git `CRLF_AUTO_CRLF` (safe mixed handling).
795        if attrs.text == TextAttr::Auto {
796            return auto_crlf_should_smudge_lf_to_crlf(data);
797        }
798        // Explicit `eol=crlf` with `text` set, etc. → `CRLF_TEXT_CRLF` (always normalize).
799        return true;
800    }
801
802    // If text is explicitly set, use eol config
803    if attrs.text == TextAttr::Set {
804        return output_eol_is_crlf(conv);
805    }
806
807    if attrs.text == TextAttr::Auto {
808        if is_binary(data) {
809            return false;
810        }
811        if !output_eol_is_crlf(conv) {
812            return false;
813        }
814        return auto_crlf_should_smudge_lf_to_crlf(data);
815    }
816
817    // No text attribute: fall back to core.autocrlf
818    match conv.autocrlf {
819        AutoCrlf::True => {
820            if is_binary(data) {
821                return false;
822            }
823            auto_crlf_should_smudge_lf_to_crlf(data)
824        }
825        AutoCrlf::Input | AutoCrlf::False => false,
826    }
827}
828
829/// Whether the output EOL should be CRLF based on config.
830fn output_eol_is_crlf(conv: &ConversionConfig) -> bool {
831    // autocrlf=true overrides core.eol
832    if conv.autocrlf == AutoCrlf::True {
833        return true;
834    }
835    match conv.eol {
836        CoreEol::Crlf => true,
837        CoreEol::Lf => false,
838        CoreEol::Native => {
839            // On Unix, native is LF
840            cfg!(windows)
841        }
842    }
843}
844
/// Expand ident keywords: every `$Id$` and every already-expanded
/// `$Id: ...$` in `data` becomes `$Id: <oid> $`.
///
/// An expanded keyword is recognized only when its closing `$` is on the same
/// line as the opening `$Id:`. This is the rule Git's `convert.c` applies, and
/// it keeps an unterminated `$Id:` from swallowing everything up to an
/// unrelated `$` on a later line. Anything else is copied through unchanged.
fn expand_ident(data: &[u8], oid: &str) -> Vec<u8> {
    const COLLAPSED: &[u8] = b"$Id$";
    const EXPANDED_PREFIX: &[u8] = b"$Id:";

    let replacement = format!("$Id: {oid} $");
    let mut out = Vec::with_capacity(data.len() + 60);
    let mut i = 0;
    while i < data.len() {
        let rest = &data[i..];

        if rest.starts_with(COLLAPSED) {
            out.extend_from_slice(replacement.as_bytes());
            i += COLLAPSED.len();
            continue;
        }

        if rest.starts_with(EXPANDED_PREFIX) {
            // Already expanded — replace the existing expansion, provided the
            // keyword is closed before the end of the line.
            let tail = &rest[EXPANDED_PREFIX.len()..];
            let stop = tail.iter().position(|&b| b == b'$' || b == b'\n');
            if let Some(end) = stop.filter(|&idx| tail[idx] == b'$') {
                out.extend_from_slice(replacement.as_bytes());
                i += EXPANDED_PREFIX.len() + end + 1;
                continue;
            }
        }

        out.push(data[i]);
        i += 1;
    }
    out
}
871
/// Collapse every expanded `$Id: ... $` keyword in `data` back to `$Id$`.
///
/// A keyword is recognized only when its closing `$` is on the same line as
/// the opening `$Id:` (the rule Git's `convert.c` applies). An unterminated
/// `$Id:` is therefore copied through unchanged instead of deleting text up to
/// the next `$` on a later line.
pub fn collapse_ident(data: &[u8]) -> Vec<u8> {
    const EXPANDED_PREFIX: &[u8] = b"$Id:";

    let mut out = Vec::with_capacity(data.len());
    let mut i = 0;
    while i < data.len() {
        let rest = &data[i..];
        if rest.starts_with(EXPANDED_PREFIX) {
            let tail = &rest[EXPANDED_PREFIX.len()..];
            // First `$` or line break, whichever comes first.
            let stop = tail.iter().position(|&b| b == b'$' || b == b'\n');
            if let Some(end) = stop.filter(|&idx| tail[idx] == b'$') {
                out.extend_from_slice(b"$Id$");
                i += EXPANDED_PREFIX.len() + end + 1;
                continue;
            }
        }
        out.push(data[i]);
        i += 1;
    }
    out
}
889
/// Run a filter command through `sh -c`, feeding `data` to its stdin and
/// returning everything it writes to stdout. The filter's stderr is inherited.
///
/// The input is written from a helper thread while this thread drains stdout.
/// Writing everything first and reading afterwards deadlocks as soon as the
/// filter produces more output than the stdout pipe can buffer while input is
/// still pending.
///
/// A filter that exits or closes stdin without reading all of its input
/// (`BrokenPipe` on write) is not treated as a failure, as in Git; only its
/// exit status decides.
///
/// `_rel_path` is currently unused.
///
/// # Errors
///
/// Returns an error if the process cannot be spawned, if writing its input
/// fails for a reason other than a broken pipe, if waiting for it fails, or if
/// it exits with a non-zero status.
fn run_filter(cmd: &str, data: &[u8], _rel_path: &str) -> Result<Vec<u8>, std::io::Error> {
    use std::io::Write;

    let mut child = Command::new("sh")
        .arg("-c")
        .arg(cmd)
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .stderr(Stdio::inherit())
        .spawn()?;

    let stdin = child.stdin.take();

    let (write_result, output) = std::thread::scope(|scope| {
        let writer = scope.spawn(move || -> std::io::Result<()> {
            match stdin {
                Some(mut pipe) => match pipe.write_all(data) {
                    Err(e) if e.kind() != std::io::ErrorKind::BrokenPipe => Err(e),
                    _ => Ok(()),
                },
                None => Ok(()),
            }
            // The pipe is dropped when the closure returns, which signals EOF.
        });

        // Drain stdout (and reap the child) concurrently with the writer.
        let output = child.wait_with_output();
        let write_result = writer.join().unwrap_or_else(|_| {
            Err(std::io::Error::other("filter stdin writer thread panicked"))
        });
        (write_result, output)
    });

    let output = output?;
    write_result?;

    if !output.status.success() {
        return Err(std::io::Error::other(format!(
            "filter command exited with status {}",
            output.status
        )));
    }

    Ok(output.stdout)
}
916
// `AttrRule` keeps its fields private; callers obtain rules from the loader
// functions in this module and pass them on to `get_file_attrs`.

/// Convenience alias for a list of loaded gitattributes rules.
///
/// This is a plain type alias, so it is interchangeable with `Vec<AttrRule>`.
pub type GitAttributes = Vec<AttrRule>;
922
// Unit tests for the byte-level helpers and the smudge-direction entry point.
#[cfg(test)]
mod tests {
    use super::*;

    // CRLF pairs collapse to LF; LF-only input is returned unchanged.
    #[test]
    fn test_crlf_to_lf() {
        assert_eq!(crlf_to_lf(b"hello\r\nworld\r\n"), b"hello\nworld\n");
        assert_eq!(crlf_to_lf(b"hello\nworld\n"), b"hello\nworld\n");
        assert_eq!(crlf_to_lf(b"hello\r\n"), b"hello\n");
    }

    // Lone LFs gain a CR; existing CRLF pairs are not doubled.
    #[test]
    fn test_lf_to_crlf() {
        assert_eq!(lf_to_crlf(b"hello\nworld\n"), b"hello\r\nworld\r\n");
        assert_eq!(lf_to_crlf(b"hello\r\nworld\r\n"), b"hello\r\nworld\r\n");
    }

    #[test]
    fn test_has_crlf() {
        assert!(has_crlf(b"hello\r\nworld"));
        assert!(!has_crlf(b"hello\nworld"));
    }

    // With autocrlf=true and no attributes, a blob that already contains a
    // CRLF pair must be checked out byte-for-byte.
    #[test]
    fn smudge_mixed_line_endings_unchanged_with_autocrlf_true() {
        let mut blob = Vec::new();
        for part in [
            b"Oh\n".as_slice(),
            b"here\n",
            b"is\n",
            b"CRLF\r\n",
            b"in\n",
            b"text\n",
        ] {
            blob.extend_from_slice(part);
        }
        let conv = ConversionConfig {
            autocrlf: AutoCrlf::True,
            eol: CoreEol::Lf,
            safecrlf: SafeCrlf::False,
        };
        let attrs = FileAttrs::default();
        let out = convert_to_worktree(&blob, "mixed", &conv, &attrs, None).unwrap();
        assert_eq!(out, blob);
    }

    // With autocrlf=true and no attributes, an LF-only blob is checked out
    // with CRLF even though core.eol is lf.
    #[test]
    fn smudge_lf_only_gets_crlf_with_autocrlf_true() {
        let blob = b"a\nb\n";
        let conv = ConversionConfig {
            autocrlf: AutoCrlf::True,
            eol: CoreEol::Lf,
            safecrlf: SafeCrlf::False,
        };
        let attrs = FileAttrs::default();
        let out = convert_to_worktree(blob, "x", &conv, &attrs, None).unwrap();
        assert_eq!(out, b"a\r\nb\r\n");
    }

    #[test]
    fn test_is_binary() {
        assert!(is_binary(b"hello\0world"));
        assert!(!is_binary(b"hello world"));
    }

    // Expanding and then collapsing `$Id$` round-trips to the original bytes.
    #[test]
    fn test_expand_collapse_ident() {
        let data = b"$Id$";
        let expanded = expand_ident(data, "abc123");
        assert_eq!(expanded, b"$Id: abc123 $");
        let collapsed = collapse_ident(&expanded);
        assert_eq!(collapsed, b"$Id$");
    }
}