Skip to main content

sley_worktree/
filter.rs

1//! Content filtering on the blob<->worktree boundary: CRLF/encoding/ident/process filters, clean/smudge, and `ls-files --eol` info.
2//!
3//! Split out of `lib.rs` in the wave-47 mechanical refactor: a pure code move
4//! (no function body changed); all items are re-exported from `lib.rs`.
5use super::*;
6use crate::attributes::*;
7use crate::ignore::*;
8use crate::index::*;
9use crate::index_io::*;
10use crate::types_admin::*;
11
12// ---------------------------------------------------------------------------
13// Content filtering on the blob <-> worktree boundary
14//
15// Git runs two kinds of conversion when content crosses between the worktree
16// and the object database:
17//
18//   * the line-ending / `core.autocrlf` conversion (driven by the `text`,
19//     `eol` attributes and the `core.autocrlf` / `core.eol` config), and
//   * the driver filters: the per-file `filter.<name>.clean` / `.smudge`
//     commands and the long-running `filter.<name>.process` driver (selected
//     by the `filter=<name>` attribute and configured commands).
22//
23// "clean" runs on the way *into* the object store (worktree -> blob), e.g. on
24// `git add` / `git hash-object -w`. "smudge" runs on the way *out* (blob ->
25// worktree), e.g. on checkout / restore. The driver filter, when present,
26// wraps the EOL conversion: on clean git first runs the configured `clean`
27// command and then applies CRLF->LF normalization; on smudge git first applies
28// LF->CRLF and then runs the `smudge` command.
29// ---------------------------------------------------------------------------
30
/// Direction of the line-ending conversion chosen for a path once its
/// attributes and the repository config have been combined.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum EolConversion {
    /// Leave line endings alone (binary content, or text for which neither
    /// the attributes nor `core.autocrlf` / `core.eol` requested anything).
    None,
    /// Clean collapses CRLF to LF; smudge writes LF as-is (`eol=lf`,
    /// `core.autocrlf=input`).
    Lf,
    /// Clean collapses CRLF to LF; smudge expands LF to CRLF (`eol=crlf`,
    /// `core.autocrlf=true`).
    Crlf,
}
45
/// The verdict on whether a path counts as text when deciding if EOL
/// conversion may run.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum TextDecision {
    /// `-text` / `binary`: conversion never runs.
    Binary,
    /// `text` set explicitly: the content is always treated as text.
    Text,
    /// `text=auto` (or the `core.autocrlf` equivalent): text unless the
    /// content looks binary.
    Auto,
    /// Neither attributes nor config expressed an opinion; content is left
    /// untouched.
    Unspecified,
}
60
61/// The fully resolved set of conversions that apply to a single path.
62#[derive(Debug, Clone, PartialEq, Eq)]
63pub(crate) struct ContentFilterPlan {
64    pub(crate) text: TextDecision,
65    /// The conversion to apply when `text` resolves to "this is text".
66    pub(crate) eol: EolConversion,
67    /// Whether `$Id$` keyword collapse/expansion applies to this path.
68    pub(crate) ident: bool,
69    /// `filter.<name>` driver, if assigned via attributes and configured.
70    pub(crate) driver: Option<FilterDriver>,
71    /// `working-tree-encoding` attribute: the worktree charset to decode from
72    /// on checkin / encode to on checkout.
73    pub(crate) encoding: WtEncoding,
74}
75
/// A `filter.<name>` driver as configured: the attribute-selected name plus
/// the `process` / `clean` / `smudge` commands and the `required` flag.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) struct FilterDriver {
    /// Driver name taken from the `filter=<name>` attribute value.
    name: Vec<u8>,
    /// `filter.<name>.process` command, if configured.
    process: Option<String>,
    /// `filter.<name>.clean` command, if configured.
    clean: Option<String>,
    /// `filter.<name>.smudge` command, if configured.
    smudge: Option<String>,
    /// `filter.<name>.required` flag.
    required: bool,
}
84
/// Outcome of resolving the `working-tree-encoding` attribute (convert.c
/// `git_path_check_encoding`): either nothing to reencode, a rejected
/// boolean form, or the name of the charset the worktree file is stored in.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) enum WtEncoding {
    /// No reencoding is needed (attribute absent, empty, or naming UTF-8).
    None,
    /// The attribute was given as a boolean, which is not a valid encoding.
    Invalid,
    /// A named encoding; the attribute value is kept verbatim for messages.
    Named(Vec<u8>),
}
98
99impl WtEncoding {
100    fn from_attr(state: Option<&AttributeState>) -> WtEncoding {
101        match state {
102            // Unset (`-working-tree-encoding`) or no attribute: nothing to do.
103            None | Some(AttributeState::Unset) => WtEncoding::None,
104            // `working-tree-encoding` with no value is the boolean true.
105            Some(AttributeState::Set) => WtEncoding::Invalid,
106            Some(AttributeState::Value(value)) => {
107                // An empty value (`working-tree-encoding=`) or UTF-8 (the
108                // in-repo default) needs no conversion.
109                if value.is_empty() || encoding_name_is_utf8(value) {
110                    WtEncoding::None
111                } else {
112                    WtEncoding::Named(value.clone())
113                }
114            }
115        }
116    }
117}
118
119/// Whether `name` denotes UTF-8 (`same_encoding(name, "UTF-8")`): the leading
120/// `UTF` prefix and an optional `-` are skipped, then the remainder compared
121/// case-insensitively against `8`.
122pub(crate) fn encoding_name_is_utf8(name: &[u8]) -> bool {
123    utf_suffix(name).is_some_and(|suffix| suffix == "8")
124}
125
/// Strip a leading case-insensitive `UTF` and one optional `-`, returning the
/// ASCII-uppercased remainder (e.g. `utf-16le` → `16LE`, `UTF-16LE-BOM` →
/// `16LE-BOM`, `utf8` → `8`).
///
/// Returns `None` when `name` does not start with `UTF` (in any letter case)
/// or when the bytes of `name` are not valid UTF-8 text. Note that the
/// encoding name `UTF-8` itself is accepted and yields `Some("8")`.
pub(crate) fn utf_suffix(name: &[u8]) -> Option<String> {
    let text = std::str::from_utf8(name).ok()?;
    // `get` is `None` when the name is shorter than three bytes or byte 3 is
    // not a character boundary; neither can be an ASCII `UTF` prefix.
    let family = text.get(..3)?;
    if !family.eq_ignore_ascii_case("UTF") {
        return None;
    }
    let tail = &text[3..];
    let tail = tail.strip_prefix('-').unwrap_or(tail);
    // Only the remainder is upper-cased, so a single allocation suffices.
    Some(tail.to_ascii_uppercase())
}
134
/// The two ways a byte order mark can disagree with the declared encoding.
#[derive(Copy, Clone)]
pub(crate) enum BomProblem {
    /// A BOM is present although the encoding fixes the byte order.
    Prohibited,
    /// No BOM is present although the encoding leaves the byte order open.
    Required,
}
140
141/// The byte order mark validation from convert.c `validate_encoding`: an
142/// explicit-endianness UTF encoding must not start with a BOM, while a
143/// byte-order-agnostic one (`UTF-16` / `UTF-32`) must.
144pub(crate) fn utf_bom_problem(suffix: &str, data: &[u8]) -> Option<BomProblem> {
145    let has16 = data.starts_with(&[0xFF, 0xFE]) || data.starts_with(&[0xFE, 0xFF]);
146    let has32 = data.starts_with(&[0xFF, 0xFE, 0, 0]) || data.starts_with(&[0, 0, 0xFE, 0xFF]);
147    match suffix {
148        "16LE" | "16BE" => has16.then_some(BomProblem::Prohibited),
149        "32LE" | "32BE" => has32.then_some(BomProblem::Prohibited),
150        "16" => (!has16).then_some(BomProblem::Required),
151        "32" => (!has32).then_some(BomProblem::Required),
152        _ => None,
153    }
154}
155
/// Whether the build target is little-endian. Used as the byte order for the
/// byte-order-agnostic `UTF-16` / `UTF-32` forms, matching glibc iconv's
/// native-order output (LE BOM + LE bytes on x86).
#[cfg(target_endian = "little")]
pub(crate) const HOST_LE: bool = true;
/// Whether the build target is little-endian. Used as the byte order for the
/// byte-order-agnostic `UTF-16` / `UTF-32` forms, matching glibc iconv's
/// native-order output.
#[cfg(not(target_endian = "little"))]
pub(crate) const HOST_LE: bool = false;
159
160/// Decode worktree-encoded bytes to UTF-8 (encode_to_git's reencode), or `None`
161/// when the encoding is unsupported or the bytes are not valid in it.
162pub(crate) fn decode_to_utf8(suffix: &str, data: &[u8]) -> Option<Vec<u8>> {
163    match suffix {
164        "16LE" => decode_utf16(data, true),
165        "16BE" => decode_utf16(data, false),
166        "16" | "16LE-BOM" | "16BE-BOM" => {
167            let (le, body) = strip_utf16_bom(data);
168            decode_utf16(body, le)
169        }
170        "32LE" => decode_utf32(data, true),
171        "32BE" => decode_utf32(data, false),
172        "32" | "32LE-BOM" | "32BE-BOM" => {
173            let (le, body) = strip_utf32_bom(data);
174            decode_utf32(body, le)
175        }
176        _ => None,
177    }
178}
179
180/// Encode UTF-8 bytes to the worktree encoding (encode_to_worktree's reencode),
181/// or `None` when the encoding is unsupported or the input is not valid UTF-8.
182pub(crate) fn encode_from_utf8(suffix: &str, utf8: &[u8]) -> Option<Vec<u8>> {
183    match suffix {
184        "16LE" => encode_utf16(utf8, true, false),
185        "16BE" => encode_utf16(utf8, false, false),
186        "16LE-BOM" => encode_utf16(utf8, true, true),
187        "16BE-BOM" => encode_utf16(utf8, false, true),
188        "16" => encode_utf16(utf8, HOST_LE, true),
189        "32LE" => encode_utf32(utf8, true, false),
190        "32BE" => encode_utf32(utf8, false, false),
191        "32LE-BOM" => encode_utf32(utf8, true, true),
192        "32BE-BOM" => encode_utf32(utf8, false, true),
193        "32" => encode_utf32(utf8, HOST_LE, true),
194        _ => None,
195    }
196}
197
198pub(crate) fn strip_utf16_bom(data: &[u8]) -> (bool, &[u8]) {
199    if data.starts_with(&[0xFF, 0xFE]) {
200        (true, &data[2..])
201    } else if data.starts_with(&[0xFE, 0xFF]) {
202        (false, &data[2..])
203    } else {
204        (HOST_LE, data)
205    }
206}
207
208pub(crate) fn strip_utf32_bom(data: &[u8]) -> (bool, &[u8]) {
209    if data.starts_with(&[0xFF, 0xFE, 0, 0]) {
210        (true, &data[4..])
211    } else if data.starts_with(&[0, 0, 0xFE, 0xFF]) {
212        (false, &data[4..])
213    } else {
214        (HOST_LE, data)
215    }
216}
217
/// Decode BOM-less UTF-16 bytes into UTF-8.
///
/// `le` selects little-endian (`true`) or big-endian (`false`) code units.
/// Returns `None` when the length is odd or the code units contain an
/// unpaired surrogate.
pub(crate) fn decode_utf16(data: &[u8], le: bool) -> Option<Vec<u8>> {
    let pairs = data.chunks_exact(2);
    // A dangling byte means the input is not a whole number of code units.
    if !pairs.remainder().is_empty() {
        return None;
    }
    let read_unit: fn([u8; 2]) -> u16 = if le {
        u16::from_le_bytes
    } else {
        u16::from_be_bytes
    };
    let units = pairs.map(|pair| read_unit([pair[0], pair[1]]));
    let text = char::decode_utf16(units)
        .collect::<Result<String, _>>()
        .ok()?;
    Some(text.into_bytes())
}
236
/// Decode BOM-less UTF-32 bytes into UTF-8.
///
/// `le` selects little-endian (`true`) or big-endian (`false`) code units.
/// Returns `None` when the length is not a multiple of four or a code unit is
/// not a valid Unicode scalar value.
pub(crate) fn decode_utf32(data: &[u8], le: bool) -> Option<Vec<u8>> {
    let quads = data.chunks_exact(4);
    // Leftover bytes mean the input is not a whole number of code units.
    if !quads.remainder().is_empty() {
        return None;
    }
    let read_unit: fn([u8; 4]) -> u32 = if le {
        u32::from_le_bytes
    } else {
        u32::from_be_bytes
    };
    let text = quads
        .map(|quad| char::from_u32(read_unit([quad[0], quad[1], quad[2], quad[3]])))
        .collect::<Option<String>>()?;
    Some(text.into_bytes())
}
253
/// Encode UTF-8 text as UTF-16.
///
/// `le` picks the byte order of every code unit and `bom` asks for a byte
/// order mark in that same order to be written first. Returns `None` when
/// `utf8` is not valid UTF-8.
pub(crate) fn encode_utf16(utf8: &[u8], le: bool, bom: bool) -> Option<Vec<u8>> {
    let text = std::str::from_utf8(utf8).ok()?;
    let write_unit: fn(u16) -> [u8; 2] = if le {
        u16::to_le_bytes
    } else {
        u16::to_be_bytes
    };
    let mut encoded = Vec::with_capacity(utf8.len() * 2 + 2);
    if bom {
        // U+FEFF serialised in the chosen order is exactly the BOM.
        encoded.extend_from_slice(&write_unit(0xFEFF));
    }
    encoded.extend(text.encode_utf16().flat_map(write_unit));
    Some(encoded)
}
269
/// Encode UTF-8 text as UTF-32.
///
/// `le` picks the byte order of every code unit and `bom` asks for a byte
/// order mark in that same order to be written first. Returns `None` when
/// `utf8` is not valid UTF-8.
pub(crate) fn encode_utf32(utf8: &[u8], le: bool, bom: bool) -> Option<Vec<u8>> {
    let text = std::str::from_utf8(utf8).ok()?;
    let write_unit: fn(u32) -> [u8; 4] = if le {
        u32::to_le_bytes
    } else {
        u32::to_be_bytes
    };
    let mut encoded = Vec::with_capacity(utf8.len() * 4 + 4);
    if bom {
        // U+FEFF serialised in the chosen order is exactly the BOM.
        encoded.extend_from_slice(&write_unit(0xFEFF));
    }
    encoded.extend(text.chars().flat_map(|ch| write_unit(u32::from(ch))));
    Some(encoded)
}
290
291/// Reject a `working-tree-encoding` boolean (`true`/`false`) before any
292/// conversion runs — `git_path_check_encoding` dies on it regardless of
293/// direction.
294pub(crate) fn check_wt_encoding_valid(encoding: &WtEncoding) -> Result<()> {
295    if matches!(encoding, WtEncoding::Invalid) {
296        eprintln!("fatal: true/false are no valid working-tree-encodings");
297        return Err(GitError::Exit(128));
298    }
299    Ok(())
300}
301
302/// encode_to_git: decode worktree-encoded `data` to UTF-8 for storage in the
303/// object database. Runs the BOM validation first (fatal when writing an
304/// object). Returns the borrowed input unchanged when there is no encoding.
305pub(crate) fn encode_to_git<'a>(
306    encoding: &WtEncoding,
307    path: &[u8],
308    data: Cow<'a, [u8]>,
309    write_object: bool,
310) -> Result<Cow<'a, [u8]>> {
311    let name = match encoding {
312        WtEncoding::None => return Ok(data),
313        WtEncoding::Invalid => return check_wt_encoding_valid(encoding).map(|()| data),
314        WtEncoding::Named(name) => name,
315    };
316    if data.is_empty() {
317        return Ok(data);
318    }
319    let display = String::from_utf8_lossy(path);
320    let enc = String::from_utf8_lossy(name);
321    if let Some(suffix) = utf_suffix(name)
322        && let Some(problem) = utf_bom_problem(&suffix, &data)
323    {
324        let number = &suffix[..2.min(suffix.len())];
325        match problem {
326            BomProblem::Prohibited => {
327                eprintln!(
328                    "hint: The file '{display}' contains a byte order mark (BOM). \
329Please use UTF-{number} as working-tree-encoding."
330                );
331                report_encode_failure(
332                    write_object,
333                    &format!("BOM is prohibited in '{display}' if encoded as {enc}"),
334                )?;
335                return Ok(data);
336            }
337            BomProblem::Required => {
338                eprintln!(
339                    "hint: The file '{display}' is missing a byte order mark (BOM). \
340Please use UTF-{number}BE or UTF-{number}LE (depending on the byte order) as \
341working-tree-encoding."
342                );
343                report_encode_failure(
344                    write_object,
345                    &format!("BOM is required in '{display}' if encoded as {enc}"),
346                )?;
347                return Ok(data);
348            }
349        }
350    }
351    match utf_suffix(name).and_then(|suffix| decode_to_utf8(&suffix, &data)) {
352        Some(utf8) => Ok(Cow::Owned(utf8)),
353        None => {
354            report_encode_failure(
355                write_object,
356                &format!("failed to encode '{display}' from {enc} to UTF-8"),
357            )?;
358            Ok(data)
359        }
360    }
361}
362
363/// encode_to_worktree: reencode UTF-8 `data` to the worktree encoding on
364/// checkout. A failure is reported (never fatal) and the content left as-is,
365/// matching convert.c `encode_to_worktree`.
366pub(crate) fn encode_to_worktree<'a>(
367    encoding: &WtEncoding,
368    path: &[u8],
369    data: Cow<'a, [u8]>,
370) -> Result<Cow<'a, [u8]>> {
371    let name = match encoding {
372        WtEncoding::None => return Ok(data),
373        WtEncoding::Invalid => return check_wt_encoding_valid(encoding).map(|()| data),
374        WtEncoding::Named(name) => name,
375    };
376    if data.is_empty() {
377        return Ok(data);
378    }
379    match utf_suffix(name).and_then(|suffix| encode_from_utf8(&suffix, &data)) {
380        Some(encoded) => Ok(Cow::Owned(encoded)),
381        None => {
382            let display = String::from_utf8_lossy(path);
383            let enc = String::from_utf8_lossy(name);
384            eprintln!("error: failed to encode '{display}' from UTF-8 to {enc}");
385            Ok(data)
386        }
387    }
388}
389
390/// Emit a clean-side encoding failure: fatal (`die`) when writing an object,
391/// otherwise an `error:` diagnostic that lets the caller keep the content as-is.
392pub(crate) fn report_encode_failure(write_object: bool, message: &str) -> Result<()> {
393    if write_object {
394        eprintln!("fatal: {message}");
395        Err(GitError::Exit(128))
396    } else {
397        eprintln!("error: {message}");
398        Ok(())
399    }
400}
401
402/// Decode one crlf-family attribute (`text` or its legacy alias `crlf`) into a
403/// text decision, plus whether the value form forced an EOL direction.
404///
405/// Mirrors git's `git_path_check_crlf` (convert.c): a *set* attribute is text,
406/// an *unset* one is binary, `=auto` is auto, `=input` forces LF while still
407/// counting as text, and any other value is "undefined" — i.e. no opinion, so
408/// the caller falls through to the next source (the `crlf` alias, then config).
409pub(crate) fn decode_crlf_family_attribute(
410    state: Option<&AttributeState>,
411) -> (TextDecision, EolConversion) {
412    match state {
413        Some(AttributeState::Set) => (TextDecision::Text, EolConversion::None),
414        Some(AttributeState::Unset) => (TextDecision::Binary, EolConversion::None),
415        Some(AttributeState::Value(value)) if value == b"auto" => {
416            (TextDecision::Auto, EolConversion::None)
417        }
418        // `crlf=input` / `text=input`: text content normalized to LF (no CR on
419        // smudge), exactly like `core.autocrlf=input`.
420        Some(AttributeState::Value(value)) if value == b"input" => {
421            (TextDecision::Text, EolConversion::Lf)
422        }
423        // `=<other>` is CRLF_UNDEFINED in git for the `crlf` alias: no opinion.
424        _ => (TextDecision::Unspecified, EolConversion::None),
425    }
426}
427
428impl ContentFilterPlan {
429    /// Build the plan for `path` from the parsed attributes and repo config.
430    fn resolve(config: &GitConfig, checks: &[AttributeCheck]) -> Self {
431        let text_attr = checks.iter().find(|check| check.attribute == b"text");
432        let crlf_attr = checks.iter().find(|check| check.attribute == b"crlf");
433        let ident_attr = checks.iter().find(|check| check.attribute == b"ident");
434        let eol_attr = checks.iter().find(|check| check.attribute == b"eol");
435        let filter_attr = checks.iter().find(|check| check.attribute == b"filter");
436        let encoding_attr = checks
437            .iter()
438            .find(|check| check.attribute == b"working-tree-encoding");
439        let encoding = WtEncoding::from_attr(encoding_attr.and_then(|check| check.state.as_ref()));
440
441        // Resolve the eol attribute first; `eol=crlf|lf` also forces text.
442        let eol_value = eol_attr.and_then(|check| match &check.state {
443            Some(AttributeState::Value(value)) => Some(value.clone()),
444            _ => None,
445        });
446
447        // The `text` attribute decides first; only when it is unspecified does
448        // git consult the legacy `crlf` alias (convert.c `convert_attrs`).
449        let mut forced_eol = EolConversion::None;
450        let mut text = match text_attr.map(|check| &check.state) {
451            Some(Some(AttributeState::Set)) => TextDecision::Text,
452            Some(Some(AttributeState::Unset)) => TextDecision::Binary,
453            Some(Some(AttributeState::Value(value))) if value == b"auto" => TextDecision::Auto,
454            Some(Some(AttributeState::Value(value))) if value == b"input" => {
455                forced_eol = EolConversion::Lf;
456                TextDecision::Text
457            }
458            // `text=<other>` is treated by git as a set text attribute.
459            Some(Some(AttributeState::Value(_))) => TextDecision::Text,
460            // `!text` (unspecified) or no text attribute: fall through to `crlf`.
461            _ => {
462                let (decision, eol) =
463                    decode_crlf_family_attribute(crlf_attr.and_then(|check| check.state.as_ref()));
464                forced_eol = eol;
465                decision
466            }
467        };
468
469        // A concrete `eol` attribute implies the path is text even when `text`
470        // was left unspecified (git: `eol` without `text` is treated as
471        // `text=auto`-ish; upstream forces conversion). We honour eol only when
472        // text is not explicitly binary.
473        let eol = match (&text, eol_value.as_deref()) {
474            (TextDecision::Binary, _) => EolConversion::None,
475            (_, Some(b"crlf")) => {
476                if text == TextDecision::Unspecified {
477                    text = TextDecision::Text;
478                }
479                EolConversion::Crlf
480            }
481            (_, Some(b"lf")) => {
482                if text == TextDecision::Unspecified {
483                    text = TextDecision::Text;
484                }
485                EolConversion::Lf
486            }
487            // No explicit `eol` attribute, but `text=input`/`crlf=input` already
488            // forced the LF direction (git's CRLF_TEXT_INPUT). Honour it over the
489            // config-derived default.
490            _ if forced_eol == EolConversion::Lf => EolConversion::Lf,
491            // No eol attribute: derive direction from config.
492            _ => eol_from_config(config),
493        };
494
495        // When the path is text but neither `eol` nor `core.autocrlf`/`core.eol`
496        // asked for carriage returns, we still normalize to LF on clean. That is
497        // modelled by `EolConversion::Lf` (clean strips CR, smudge adds none).
498        let eol = match (&text, eol) {
499            (TextDecision::Text | TextDecision::Auto, EolConversion::None) => EolConversion::Lf,
500            (_, eol) => eol,
501        };
502
503        // If config does not enable autocrlf and there is no eol/text opinion,
504        // there is genuinely nothing to do.
505        let text = match (text, eol_attr.is_some()) {
506            (TextDecision::Unspecified, _) => {
507                // Without any text/eol attribute, only `core.autocrlf` can make a
508                // path eligible, and then it behaves like `text=auto`.
509                if autocrlf_enabled(config) {
510                    TextDecision::Auto
511                } else {
512                    TextDecision::Unspecified
513                }
514            }
515            (text, _) => text,
516        };
517
518        let driver = resolve_filter_driver(config, filter_attr);
519        let ident = matches!(
520            ident_attr.and_then(|check| check.state.as_ref()),
521            Some(AttributeState::Set)
522        );
523
524        ContentFilterPlan {
525            text,
526            eol,
527            ident,
528            driver,
529            encoding,
530        }
531    }
532
533    /// Whether EOL conversion should run for the given content.
534    fn convert_eol(&self, content: &[u8]) -> bool {
535        match self.text {
536            TextDecision::Binary | TextDecision::Unspecified => false,
537            TextDecision::Text => self.eol != EolConversion::None,
538            // `text=auto`: only when the blob does not look binary.
539            TextDecision::Auto => self.eol != EolConversion::None && !looks_binary(content),
540        }
541    }
542
543    /// The smudge-side LF->CRLF safety check, mirroring convert.c
544    /// `will_convert_lf_to_crlf`. Returns false (no conversion) when:
545    ///   * there is no naked LF to convert, or
546    ///   * the action is `text=auto`-derived (the "new safer autocrlf") AND the
547    ///     content already contains a lone CR or a CRLF pair, or looks binary.
548    ///
549    /// An explicit `text`/`eol=crlf` (non-auto) path always converts naked LFs.
550    pub(crate) fn will_convert_lf_to_crlf(&self, content: &[u8]) -> bool {
551        self.will_convert_lf_to_crlf_stats(&gather_convert_stats(content))
552    }
553
554    /// Stats-based variant of [`will_convert_lf_to_crlf`], mirroring convert.c
555    /// `will_convert_lf_to_crlf(struct text_stat *, ...)`. Used by the safecrlf
556    /// round-trip simulation, which mutates a copy of the stats rather than
557    /// re-scanning the buffer.
558    fn will_convert_lf_to_crlf_stats(&self, stats: &ConvertStats) -> bool {
559        // `output_eol(crlf_action) != EOL_CRLF` short-circuits in git.
560        if self.eol != EolConversion::Crlf {
561            return false;
562        }
563        // No naked LF? Nothing to convert.
564        if stats.lonelf == 0 {
565            return false;
566        }
567        if self.text == TextDecision::Auto {
568            // Any CR or CRLF already present: leave it untouched (irreversible).
569            if stats.lonecr > 0 || stats.crlf > 0 {
570                return false;
571            }
572            if convert_is_binary(stats) {
573                return false;
574            }
575        }
576        true
577    }
578
579    /// Whether this path is a candidate for the `core.safecrlf` round-trip check
580    /// at all: git only warns for non-`CRLF_BINARY` actions. `Binary` and
581    /// `Unspecified` (with autocrlf off) correspond to git's `CRLF_BINARY`.
582    fn safecrlf_applies(&self) -> bool {
583        matches!(self.text, TextDecision::Text | TextDecision::Auto)
584    }
585
586    /// Emit git's `core.safecrlf` round-trip warning for `path`, mirroring the
587    /// stderr side-effect of convert.c `crlf_to_git` (the `CONV_EOL_RNDTRP_*`
588    /// branch). `old_stats` are the stats of the *pre-conversion* worktree
589    /// content (already gathered by the caller so the buffer is scanned once);
590    /// `index_has_crlf` is whether the path's current index blob already has a
591    /// CRLF (git's `has_crlf_in_index`, used only for the auto-crlf decision).
592    ///
593    /// This never inspects or alters the bytes written to the object store; it is
594    /// purely the additive warning git prints alongside `git add`/`commit`.
595    /// Returns `Err` only under `core.safecrlf=true` when the round-trip is
596    /// irreversible (git `die`s).
597    fn check_safe_crlf_stats(
598        &self,
599        old_stats: &ConvertStats,
600        index_has_crlf: bool,
601        flags: ConvFlags,
602        path: &[u8],
603    ) -> Result<()> {
604        if flags == ConvFlags::Off || !self.safecrlf_applies() {
605            return Ok(());
606        }
607
608        // Replicate `crlf_to_git`'s `convert_crlf_into_lf` decision (the clean
609        // direction). It starts as "there is a CRLF to collapse"; auto paths
610        // suppress conversion for binary content or content whose index blob
611        // already carries a CRLF (the "new safer autocrlf").
612        let mut convert_crlf_into_lf = old_stats.crlf > 0;
613        if self.text == TextDecision::Auto {
614            if convert_is_binary(old_stats) {
615                // git returns 0 here: no conversion *and* no warning.
616                return Ok(());
617            }
618            if index_has_crlf {
619                convert_crlf_into_lf = false;
620            }
621        }
622
623        // Simulate the round-trip on a copy of the stats.
624        let mut new_stats = old_stats.clone();
625        // Simulate "git add" (clean: CRLF -> LF).
626        if convert_crlf_into_lf {
627            new_stats.lonelf += new_stats.crlf;
628            new_stats.crlf = 0;
629        }
630        // Simulate "git checkout" (smudge: LF -> CRLF).
631        if self.will_convert_lf_to_crlf_stats(&new_stats) {
632            new_stats.crlf += new_stats.lonelf;
633            new_stats.lonelf = 0;
634        }
635        check_safe_crlf(old_stats, &new_stats, flags, path)
636    }
637}
638
639/// Derive the smudge-direction line ending from `core.autocrlf` / `core.eol`.
640pub(crate) fn eol_from_config(config: &GitConfig) -> EolConversion {
641    if let Some(value) = config.get("core", None, "autocrlf") {
642        match value.to_ascii_lowercase().as_str() {
643            "input" => return EolConversion::Lf,
644            "true" | "yes" | "on" | "1" => return EolConversion::Crlf,
645            _ => {}
646        }
647    }
648    if config.get_bool("core", None, "autocrlf") == Some(true) {
649        return EolConversion::Crlf;
650    }
651    match config
652        .get("core", None, "eol")
653        .map(|v| v.to_ascii_lowercase())
654    {
655        Some(ref v) if v == "crlf" => EolConversion::Crlf,
656        Some(ref v) if v == "lf" => EolConversion::Lf,
657        _ => EolConversion::None,
658    }
659}
660
661/// Whether `core.autocrlf` is set to anything that enables conversion
662/// (`true` or `input`).
663pub(crate) fn autocrlf_enabled(config: &GitConfig) -> bool {
664    if let Some(value) = config.get("core", None, "autocrlf")
665        && value.eq_ignore_ascii_case("input")
666    {
667        return true;
668    }
669    config.get_bool("core", None, "autocrlf") == Some(true)
670}
671
672/// Resolve the `filter=<name>` attribute against `filter.<name>.*` config.
673pub(crate) fn resolve_filter_driver(
674    config: &GitConfig,
675    filter_attr: Option<&AttributeCheck>,
676) -> Option<FilterDriver> {
677    let name = match filter_attr.map(|check| &check.state) {
678        Some(Some(AttributeState::Value(value))) => value.clone(),
679        // `filter` set/unset without a value selects no driver.
680        _ => return None,
681    };
682    let subsection = String::from_utf8_lossy(&name).into_owned();
683    let process = filter_config_value(config, &subsection, "process").filter(|cmd| !cmd.is_empty());
684    let clean = filter_config_value(config, &subsection, "clean").filter(|cmd| !cmd.is_empty());
685    let smudge = filter_config_value(config, &subsection, "smudge").filter(|cmd| !cmd.is_empty());
686    let required = filter_config_bool(config, &subsection, "required").unwrap_or(false);
687    // A filter with neither command and not required is a no-op.
688    if process.is_none() && clean.is_none() && smudge.is_none() && !required {
689        return None;
690    }
691    Some(FilterDriver {
692        name,
693        process,
694        clean,
695        smudge,
696        required,
697    })
698}
699
700pub(crate) fn filter_config_value(
701    config: &GitConfig,
702    subsection: &str,
703    key: &str,
704) -> Option<String> {
705    config
706        .get("filter", Some(subsection), key)
707        .map(str::to_owned)
708        .or_else(|| global_filter_config_value(subsection, key))
709}
710
711pub(crate) fn filter_config_bool(config: &GitConfig, subsection: &str, key: &str) -> Option<bool> {
712    config
713        .get_bool("filter", Some(subsection), key)
714        .or_else(|| {
715            global_filter_config_value(subsection, key)
716                .as_deref()
717                .and_then(sley_config::parse_config_bool)
718        })
719}
720
721pub(crate) fn global_filter_config_value(subsection: &str, key: &str) -> Option<String> {
722    for (path, _) in sley_config::default_config_layer_paths().into_iter().rev() {
723        let Ok(config) = GitConfig::read(path) else {
724            continue;
725        };
726        if let Some(value) = config.get("filter", Some(subsection), key) {
727            return Some(value.to_owned());
728        }
729    }
730    None
731}
732
/// Binary-content heuristic after git's `buffer_is_binary`: the content is
/// binary when any of its first 8000 bytes is NUL.
pub(crate) fn looks_binary(content: &[u8]) -> bool {
    const FIRST_FEW_BYTES: usize = 8000;
    content
        .iter()
        .take(FIRST_FEW_BYTES)
        .any(|&byte| byte == 0)
}
740
/// Collapse every CRLF pair to a single LF.
///
/// Only a CR directly followed by LF is dropped; a lone CR (old-Mac line
/// ending) survives, matching git. Input without any CRLF pair is returned
/// as-is, without allocating.
pub(crate) fn convert_crlf_to_lf_cow(content: Cow<'_, [u8]>) -> Cow<'_, [u8]> {
    let has_pair = content.windows(2).any(|pair| pair == b"\r\n");
    if !has_pair {
        return content;
    }
    let mut normalized = Vec::with_capacity(content.len());
    let mut bytes = content.iter().copied().peekable();
    while let Some(byte) = bytes.next() {
        // Skip a CR whose successor is LF; the LF itself is kept next round.
        let cr_before_lf = byte == b'\r' && bytes.peek() == Some(&b'\n');
        if !cr_before_lf {
            normalized.push(byte);
        }
    }
    Cow::Owned(normalized)
}
762
/// Expand every bare LF in `content` to CRLF.
///
/// An LF whose preceding byte is already a CR is copied as-is, so existing
/// CRLF pairs are not turned into CR CR LF.
pub(crate) fn convert_lf_to_crlf(content: &[u8]) -> Vec<u8> {
    let mut converted = Vec::with_capacity(content.len() + content.len() / 16);
    for (index, &byte) in content.iter().enumerate() {
        let follows_cr = index > 0 && content[index - 1] == b'\r';
        if byte == b'\n' && !follows_cr {
            converted.push(b'\r');
        }
        converted.push(byte);
    }
    converted
}
777
778/// Collapse git `$Id: ... $` keywords to `$Id$` on the clean path.
779pub(crate) fn ident_to_git_cow(content: Cow<'_, [u8]>) -> Cow<'_, [u8]> {
780    let input = content.as_ref();
781    if !has_git_ident(input) {
782        return content;
783    }
784    let mut out = Vec::with_capacity(input.len());
785    let mut pos = 0;
786    while let Some(relative) = input[pos..].iter().position(|byte| *byte == b'$') {
787        let dollar = pos + relative;
788        out.extend_from_slice(&input[pos..=dollar]);
789        pos = dollar + 1;
790        if input.len().saturating_sub(pos) > 3 && input[pos..].starts_with(b"Id:") {
791            let search = &input[pos + 3..];
792            let Some(end_relative) = search.iter().position(|byte| *byte == b'$') else {
793                break;
794            };
795            let end = pos + 3 + end_relative;
796            if input[pos + 3..end].contains(&b'\n') {
797                continue;
798            }
799            out.extend_from_slice(b"Id$");
800            pos = end + 1;
801        }
802    }
803    out.extend_from_slice(&input[pos..]);
804    Cow::Owned(out)
805}
806
807/// Expand `$Id$` and git-style `$Id: <hex> $` keywords using the blob id of the
808/// unexpanded content, matching convert.c's ident_to_worktree.
809pub(crate) fn ident_to_worktree_cow(
810    format: ObjectFormat,
811    content: Cow<'_, [u8]>,
812) -> Result<Cow<'_, [u8]>> {
813    let input = content.as_ref();
814    if !has_git_ident(input) {
815        return Ok(content);
816    }
817    let oid = EncodedObject::new(ObjectType::Blob, input.to_vec()).object_id(format)?;
818    let replacement = format!("Id: {} $", oid.to_hex());
819    let mut out = Vec::with_capacity(input.len() + replacement.len());
820    let mut pos = 0;
821    while let Some(relative) = input[pos..].iter().position(|byte| *byte == b'$') {
822        let dollar = pos + relative;
823        out.extend_from_slice(&input[pos..=dollar]);
824        pos = dollar + 1;
825        if input.len().saturating_sub(pos) < 3 || !input[pos..].starts_with(b"Id") {
826            continue;
827        }
828        match input.get(pos + 2) {
829            Some(b'$') => {
830                pos += 3;
831            }
832            Some(b':') => {
833                let search = &input[pos + 3..];
834                let Some(end_relative) = search.iter().position(|byte| *byte == b'$') else {
835                    break;
836                };
837                let end = pos + 3 + end_relative;
838                if input[pos + 3..end].contains(&b'\n') || is_foreign_ident(&input[pos + 3..end]) {
839                    continue;
840                }
841                pos = end + 1;
842            }
843            _ => continue,
844        }
845        out.extend_from_slice(replacement.as_bytes());
846    }
847    out.extend_from_slice(&input[pos..]);
848    Ok(Cow::Owned(out))
849}
850
/// Report whether `content` contains at least one ident keyword: a bare
/// `$Id$`, or an `$Id: ... $` whose body reaches its closing `$` without
/// crossing a newline. Mirrors the scan in git's `count_ident`.
///
/// When an `$Id:` body hits a newline first, scanning resumes right after
/// that newline, so a `$` later on can still open a keyword. Resuming past the
/// next `$` instead would miss input such as `"$Id: x\n$Id$"`.
pub(crate) fn has_git_ident(content: &[u8]) -> bool {
    let mut pos = 0;
    while let Some(relative) = content[pos..].iter().position(|&byte| byte == b'$') {
        let start = pos + relative + 1;
        let rest = &content[start..];
        if rest.len() < 3 {
            // Too short to hold even `Id$` after the dollar.
            break;
        }
        if !rest.starts_with(b"Id") {
            pos = start;
            continue;
        }
        match rest[2] {
            b'$' => return true,
            b':' => {
                let body = &rest[3..];
                // Whichever comes first decides: `$` closes the keyword, a
                // newline cancels it.
                match body
                    .iter()
                    .position(|&byte| byte == b'$' || byte == b'\n')
                {
                    Some(stop) if body[stop] == b'$' => return true,
                    Some(stop) => pos = start + 3 + stop + 1,
                    None => break,
                }
            }
            _ => pos = start,
        }
    }
    false
}
880
/// Decide whether an `$Id: ... $` body looks like it came from another tool.
///
/// `expansion` is the text between `$Id:` and the closing `$`. It is treated
/// as foreign when a space appears anywhere other than its first or last
/// byte. Bodies of zero or one byte are never foreign.
pub(crate) fn is_foreign_ident(expansion: &[u8]) -> bool {
    match expansion {
        [] | [_] => false,
        [_, interior @ .., _] => interior.contains(&b' '),
    }
}
887
888/// Run a configured `clean`/`smudge` command as a subprocess, feeding `content`
889/// on stdin and returning its stdout. Errors carry enough context for the
890/// caller to decide whether the failure is fatal (required filter) or should be
891/// silently ignored (optional filter passthrough).
892pub(crate) fn run_filter_command(command: &str, path: &[u8], content: &[u8]) -> Result<Vec<u8>> {
893    // Git expands `%f` in the filter command to the path of the file being
894    // filtered (quoted). We perform the same substitution.
895    let display_path = String::from_utf8_lossy(path);
896    let expanded = command.replace("%f", &shell_quote(&display_path));
897    // Run through the platform shell so pipelines / arguments in the configured
898    // command behave the same way git's `run_command`-with-shell does.
899    let (shell, flag) = if cfg!(windows) {
900        ("cmd", "/C")
901    } else {
902        ("/bin/sh", "-c")
903    };
904    let mut child = Command::new(shell)
905        .arg(flag)
906        .arg(&expanded)
907        .stdin(Stdio::piped())
908        .stdout(Stdio::piped())
909        .stderr(Stdio::piped())
910        .spawn()
911        .map_err(|err| GitError::Command(format!("failed to spawn filter `{command}`: {err}")))?;
912    // Write the content to the child's stdin on a separate thread so we never
913    // deadlock against a filter that streams output before consuming all input.
914    let mut stdin = child
915        .stdin
916        .take()
917        .ok_or_else(|| GitError::Command(format!("filter `{command}` stdin unavailable")))?;
918    let payload = content.to_vec();
919    let writer = std::thread::spawn(move || {
920        let _ = stdin.write_all(&payload);
921        // Dropping `stdin` here closes the pipe so the child sees EOF.
922    });
923    let output = child
924        .wait_with_output()
925        .map_err(|err| GitError::Command(format!("filter `{command}` failed: {err}")))?;
926    // Join the writer; its own errors (e.g. broken pipe) are non-fatal because
927    // the child's exit status is the authoritative signal.
928    let _ = writer.join();
929    if !output.status.success() {
930        let stderr = String::from_utf8_lossy(&output.stderr);
931        return Err(GitError::Command(format!(
932            "filter `{command}` exited with {}: {}",
933            output.status,
934            stderr.trim()
935        )));
936    }
937    Ok(output.stdout)
938}
939
/// Capability bit recorded when the filter answers `capability=clean`.
pub(crate) const PROCESS_CAP_CLEAN: u8 = 1;
/// Capability bit recorded when the filter answers `capability=smudge`.
pub(crate) const PROCESS_CAP_SMUDGE: u8 = 1 << 1;
/// Capability bit recorded when the filter answers `capability=delay`.
pub(crate) const PROCESS_CAP_DELAY: u8 = 1 << 2;
/// Largest payload, in bytes, that `write_pkt_content` places in one packet.
pub(crate) const PKT_DATA_MAX: usize = 65_516;

/// Long-running filter processes that are currently alive, keyed by the
/// command string they were started with (see `run_process_filter`).
pub(crate) static PROCESS_FILTERS: OnceLock<Mutex<HashMap<String, ProcessFilter>>> =
    OnceLock::new();
/// Ordered `(key, value)` pairs that `ProcessFilter::apply` sends as
/// `key=value` packets on smudge requests that carry a blob id.
pub(crate) type ProcessFilterMetadata = Vec<(String, String)>;
/// The metadata currently in effect, if any. Installed with
/// `set_process_filter_metadata` and read with
/// `current_process_filter_metadata`.
pub(crate) static PROCESS_FILTER_METADATA: OnceLock<Mutex<Option<ProcessFilterMetadata>>> =
    OnceLock::new();
950
951pub(crate) struct ProcessFilterMetadataGuard {
952    previous: Option<ProcessFilterMetadata>,
953}
954
955impl Drop for ProcessFilterMetadataGuard {
956    fn drop(&mut self) {
957        if let Ok(mut guard) = PROCESS_FILTER_METADATA
958            .get_or_init(|| Mutex::new(None))
959            .lock()
960        {
961            *guard = self.previous.take();
962        }
963    }
964}
965
966pub(crate) fn set_process_filter_metadata(
967    metadata: Option<ProcessFilterMetadata>,
968) -> ProcessFilterMetadataGuard {
969    let mutex = PROCESS_FILTER_METADATA.get_or_init(|| Mutex::new(None));
970    let previous = mutex
971        .lock()
972        .map(|mut guard| std::mem::replace(&mut *guard, metadata))
973        .unwrap_or(None);
974    ProcessFilterMetadataGuard { previous }
975}
976
977pub(crate) fn current_process_filter_metadata() -> Option<ProcessFilterMetadata> {
978    PROCESS_FILTER_METADATA
979        .get_or_init(|| Mutex::new(None))
980        .lock()
981        .ok()
982        .and_then(|guard| guard.clone())
983}
984
/// A running long-lived filter process together with the pipes used to talk
/// to it. Dropping the value kills and reaps the child.
pub(crate) struct ProcessFilter {
    /// Handle to the spawned shell/filter process.
    child: Child,
    /// Pipe on which requests (pkt-lines) are written.
    stdin: ChildStdin,
    /// Pipe from which responses (pkt-lines) are read.
    stdout: ChildStdout,
    /// Bitwise OR of the `PROCESS_CAP_*` flags the filter advertised.
    capabilities: u8,
}
991
/// Result of one request to a long-running filter that did not fail at the
/// protocol level.
pub(crate) enum ProcessFilterOutcome {
    /// The filter reported success; the payload is the converted content.
    Filtered(Vec<u8>),
    /// The filter did not advertise the capability for the requested
    /// direction, so no request was sent.
    Unsupported,
    /// The filter answered with a non-success status (`error`, `abort` or
    /// `delayed`), carried verbatim.
    Status(String),
}
997
/// Failure while talking to a long-running filter process.
pub(crate) struct ProcessFilterFailure {
    /// Human-readable description of what went wrong.
    message: String,
    /// `true` for protocol-level failures; `run_process_filter` evicts the
    /// cached process when it sees one.
    protocol: bool,
}

impl ProcessFilterFailure {
    /// Build a failure flagged as a protocol error.
    fn protocol(message: impl Into<String>) -> Self {
        let message = message.into();
        Self {
            message,
            protocol: true,
        }
    }
}
1011
1012pub(crate) fn run_process_filter(
1013    command: &str,
1014    direction: &str,
1015    path: &[u8],
1016    content: &[u8],
1017    blob: Option<ObjectId>,
1018) -> std::result::Result<ProcessFilterOutcome, ProcessFilterFailure> {
1019    let filters = PROCESS_FILTERS.get_or_init(|| Mutex::new(HashMap::new()));
1020    let mut filters = filters
1021        .lock()
1022        .map_err(|_| ProcessFilterFailure::protocol("process filter cache poisoned"))?;
1023    if !filters.contains_key(command) {
1024        let filter = ProcessFilter::start(command)?;
1025        filters.insert(command.to_string(), filter);
1026    }
1027    let result = filters
1028        .get_mut(command)
1029        .expect("process filter was inserted")
1030        .apply(direction, path, content, blob);
1031    if result.as_ref().is_err_and(|err| err.protocol) {
1032        filters.remove(command);
1033    }
1034    result
1035}
1036
1037impl ProcessFilter {
1038    fn start(command: &str) -> std::result::Result<Self, ProcessFilterFailure> {
1039        let (shell, flag) = if cfg!(windows) {
1040            ("cmd", "/C")
1041        } else {
1042            ("/bin/sh", "-c")
1043        };
1044        let mut child = Command::new(shell)
1045            .arg(flag)
1046            .arg(command)
1047            .stdin(Stdio::piped())
1048            .stdout(Stdio::piped())
1049            .stderr(Stdio::inherit())
1050            .spawn()
1051            .map_err(|err| {
1052                ProcessFilterFailure::protocol(format!(
1053                    "cannot fork to run subprocess '{command}': {err}"
1054                ))
1055            })?;
1056        let mut stdin = child
1057            .stdin
1058            .take()
1059            .ok_or_else(|| ProcessFilterFailure::protocol("process filter stdin unavailable"))?;
1060        let mut stdout = child
1061            .stdout
1062            .take()
1063            .ok_or_else(|| ProcessFilterFailure::protocol("process filter stdout unavailable"))?;
1064
1065        write_pkt_text(&mut stdin, "git-filter-client\n")?;
1066        write_pkt_text(&mut stdin, "version=2\n")?;
1067        write_flush(&mut stdin)?;
1068
1069        let line = read_pkt_text(&mut stdout)?.ok_or_else(|| {
1070            ProcessFilterFailure::protocol(
1071                "Unexpected line '<flush packet>', expected git-filter-server",
1072            )
1073        })?;
1074        if line != "git-filter-server" {
1075            return Err(ProcessFilterFailure::protocol(format!(
1076                "Unexpected line '{line}', expected git-filter-server"
1077            )));
1078        }
1079        let line = read_pkt_text(&mut stdout)?.ok_or_else(|| {
1080            ProcessFilterFailure::protocol("Unexpected line '<flush packet>', expected version")
1081        })?;
1082        if line != "version=2" {
1083            return Err(ProcessFilterFailure::protocol(format!(
1084                "Unexpected line '{line}', expected version"
1085            )));
1086        }
1087        if let Some(line) = read_pkt_text(&mut stdout)? {
1088            return Err(ProcessFilterFailure::protocol(format!(
1089                "Unexpected line '{line}', expected flush"
1090            )));
1091        }
1092
1093        write_pkt_text(&mut stdin, "capability=clean\n")?;
1094        write_pkt_text(&mut stdin, "capability=smudge\n")?;
1095        write_pkt_text(&mut stdin, "capability=delay\n")?;
1096        write_flush(&mut stdin)?;
1097
1098        let mut capabilities = 0;
1099        while let Some(line) = read_pkt_text(&mut stdout)? {
1100            match line.as_str() {
1101                "capability=clean" => capabilities |= PROCESS_CAP_CLEAN,
1102                "capability=smudge" => capabilities |= PROCESS_CAP_SMUDGE,
1103                "capability=delay" => capabilities |= PROCESS_CAP_DELAY,
1104                _ => {}
1105            }
1106        }
1107
1108        Ok(Self {
1109            child,
1110            stdin,
1111            stdout,
1112            capabilities,
1113        })
1114    }
1115
1116    fn apply(
1117        &mut self,
1118        direction: &str,
1119        path: &[u8],
1120        content: &[u8],
1121        blob: Option<ObjectId>,
1122    ) -> std::result::Result<ProcessFilterOutcome, ProcessFilterFailure> {
1123        let wanted = match direction {
1124            "clean" => PROCESS_CAP_CLEAN,
1125            "smudge" => PROCESS_CAP_SMUDGE,
1126            _ => 0,
1127        };
1128        if self.capabilities & wanted == 0 {
1129            return Ok(ProcessFilterOutcome::Unsupported);
1130        }
1131
1132        write_pkt_text(&mut self.stdin, &format!("command={direction}\n"))?;
1133        write_pkt_text(
1134            &mut self.stdin,
1135            &format!("pathname={}\n", String::from_utf8_lossy(path)),
1136        )?;
1137        if direction == "smudge"
1138            && let Some(blob) = blob
1139        {
1140            if let Some(metadata) = current_process_filter_metadata() {
1141                for (key, value) in metadata {
1142                    write_pkt_text(&mut self.stdin, &format!("{key}={value}\n"))?;
1143                }
1144            }
1145            write_pkt_text(&mut self.stdin, &format!("blob={}\n", blob.to_hex()))?;
1146        }
1147        write_flush(&mut self.stdin)?;
1148        write_pkt_content(&mut self.stdin, content)?;
1149        write_flush(&mut self.stdin)?;
1150
1151        let mut status = read_process_status(&mut self.stdout)?.unwrap_or_default();
1152        match status.as_str() {
1153            "success" => {}
1154            "error" | "abort" | "delayed" => return Ok(ProcessFilterOutcome::Status(status)),
1155            other => {
1156                return Err(ProcessFilterFailure::protocol(format!(
1157                    "external filter returned unsupported status '{other}'"
1158                )));
1159            }
1160        }
1161
1162        let output = read_pkt_content(&mut self.stdout)?;
1163        if let Some(next) = read_process_status(&mut self.stdout)? {
1164            status = next;
1165        }
1166        match status.as_str() {
1167            "" | "success" => Ok(ProcessFilterOutcome::Filtered(output)),
1168            "error" | "abort" | "delayed" => Ok(ProcessFilterOutcome::Status(status)),
1169            other => Err(ProcessFilterFailure::protocol(format!(
1170                "external filter returned unsupported status '{other}'"
1171            ))),
1172        }
1173    }
1174}
1175
1176impl Drop for ProcessFilter {
1177    fn drop(&mut self) {
1178        let _ = self.stdin.flush();
1179        let _ = self.child.kill();
1180        let _ = self.child.wait();
1181    }
1182}
1183
1184pub(crate) fn write_pkt_text(
1185    writer: &mut ChildStdin,
1186    text: &str,
1187) -> std::result::Result<(), ProcessFilterFailure> {
1188    write_pkt_data(writer, text.as_bytes())
1189}
1190
1191pub(crate) fn write_pkt_content(
1192    writer: &mut ChildStdin,
1193    content: &[u8],
1194) -> std::result::Result<(), ProcessFilterFailure> {
1195    for chunk in content.chunks(PKT_DATA_MAX) {
1196        write_pkt_data(writer, chunk)?;
1197    }
1198    Ok(())
1199}
1200
1201pub(crate) fn write_pkt_data(
1202    writer: &mut ChildStdin,
1203    data: &[u8],
1204) -> std::result::Result<(), ProcessFilterFailure> {
1205    let len = data.len() + 4;
1206    write!(writer, "{len:04x}")
1207        .and_then(|_| writer.write_all(data))
1208        .map_err(|err| {
1209            ProcessFilterFailure::protocol(format!("process filter write failed: {err}"))
1210        })
1211}
1212
1213pub(crate) fn write_flush(
1214    writer: &mut ChildStdin,
1215) -> std::result::Result<(), ProcessFilterFailure> {
1216    writer
1217        .write_all(b"0000")
1218        .and_then(|_| writer.flush())
1219        .map_err(|err| {
1220            ProcessFilterFailure::protocol(format!("process filter write failed: {err}"))
1221        })
1222}
1223
1224pub(crate) fn read_pkt_text(
1225    reader: &mut ChildStdout,
1226) -> std::result::Result<Option<String>, ProcessFilterFailure> {
1227    let Some(mut data) = read_pkt_data(reader)? else {
1228        return Ok(None);
1229    };
1230    if data.last() == Some(&b'\n') {
1231        data.pop();
1232    }
1233    Ok(Some(String::from_utf8_lossy(&data).into_owned()))
1234}
1235
1236pub(crate) fn read_pkt_content(
1237    reader: &mut ChildStdout,
1238) -> std::result::Result<Vec<u8>, ProcessFilterFailure> {
1239    let mut out = Vec::new();
1240    while let Some(data) = read_pkt_data(reader)? {
1241        out.extend_from_slice(&data);
1242    }
1243    Ok(out)
1244}
1245
1246pub(crate) fn read_pkt_data(
1247    reader: &mut ChildStdout,
1248) -> std::result::Result<Option<Vec<u8>>, ProcessFilterFailure> {
1249    let mut header = [0u8; 4];
1250    reader.read_exact(&mut header).map_err(|err| {
1251        ProcessFilterFailure::protocol(format!("process filter read failed: {err}"))
1252    })?;
1253    let header = std::str::from_utf8(&header)
1254        .map_err(|err| ProcessFilterFailure::protocol(format!("invalid pkt-line header: {err}")))?;
1255    let len = usize::from_str_radix(header, 16)
1256        .map_err(|err| ProcessFilterFailure::protocol(format!("invalid pkt-line length: {err}")))?;
1257    if len == 0 {
1258        return Ok(None);
1259    }
1260    if len < 4 {
1261        return Err(ProcessFilterFailure::protocol(format!(
1262            "invalid pkt-line length {len}"
1263        )));
1264    }
1265    let mut data = vec![0; len - 4];
1266    reader.read_exact(&mut data).map_err(|err| {
1267        ProcessFilterFailure::protocol(format!("process filter read failed: {err}"))
1268    })?;
1269    Ok(Some(data))
1270}
1271
1272pub(crate) fn read_process_status(
1273    reader: &mut ChildStdout,
1274) -> std::result::Result<Option<String>, ProcessFilterFailure> {
1275    let mut status = None;
1276    while let Some(line) = read_pkt_text(reader)? {
1277        if let Some(value) = line.strip_prefix("status=") {
1278            status = Some(value.to_string());
1279        }
1280    }
1281    Ok(status)
1282}
1283
/// Wrap `value` in POSIX single quotes for use in a shell command line.
///
/// Each embedded single quote is written as `'\''` (close the quoted string,
/// emit an escaped quote, reopen). Used for the `%f` path substituted into
/// driver filter commands.
pub(crate) fn shell_quote(value: &str) -> String {
    let mut quoted = String::with_capacity(value.len() + 2);
    quoted.push('\'');
    let mut pieces = value.split('\'');
    if let Some(first) = pieces.next() {
        quoted.push_str(first);
    }
    // Every further piece was preceded by a single quote in the input.
    for piece in pieces {
        quoted.push_str("'\\''");
        quoted.push_str(piece);
    }
    quoted.push('\'');
    quoted
}
1299
1300/// Apply the *clean* conversion to `content` for `path` (worktree -> blob):
1301/// first the configured `filter.<name>.clean` driver (if any), then CRLF->LF
1302/// normalization when EOL conversion applies.
1303///
1304/// `config` is the repository config (`GitConfig`) and `path` is the
1305/// repository-relative path of the file (forward-slash separated, e.g.
1306/// `src/main.rs`). When no filter or EOL conversion applies the input is
1307/// returned unchanged.
1308///
1309/// A *required* driver (`filter.<name>.required=true`) whose `clean` command is
1310/// missing or fails produces a [`GitError::Command`]; a non-required driver
1311/// failure (or absence of a `clean` command) passes the content through
1312/// unfiltered, matching git.
1313pub fn apply_clean_filter(
1314    worktree_root: impl AsRef<Path>,
1315    git_dir: impl AsRef<Path>,
1316    config: &GitConfig,
1317    path: &[u8],
1318    content: &[u8],
1319) -> Result<Vec<u8>> {
1320    // On clean the worktree file exists, so the live `.gitattributes` chain is
1321    // authoritative. `git_dir` is accepted for symmetry with the smudge entry
1322    // point (which falls back to the index) and for future use.
1323    let _ = git_dir.as_ref();
1324    let checks = filter_attribute_checks(worktree_root.as_ref(), path)?;
1325    apply_clean_filter_with_attributes(config, &checks, path, content)
1326}
1327
1328/// A reusable handle that captures the worktree's `.gitattributes` chain once so
1329/// repeated clean-filter calls (e.g. `hash-object --stdin-paths` hashing many
1330/// paths in one process) don't re-walk the worktree and re-read every
1331/// `.gitattributes`/global config per path.
1332///
1333/// Build it once with [`WorktreeAttributes::from_worktree_root`], then call
1334/// [`WorktreeAttributes::apply_clean_filter`] per path. This mirrors
1335/// [`apply_clean_filter`] exactly except the expensive attribute-source scan is
1336/// amortized across calls.
1337pub struct WorktreeAttributes {
1338    matcher: AttributeMatcher,
1339}
1340
1341impl WorktreeAttributes {
1342    /// Read the worktree's attribute sources once (global/`core.attributesFile`,
1343    /// every in-tree `.gitattributes`, and `$GIT_DIR/info/attributes`).
1344    pub fn from_worktree_root(worktree_root: impl AsRef<Path>) -> Result<Self> {
1345        Ok(Self {
1346            matcher: AttributeMatcher::from_worktree_root(worktree_root.as_ref())?,
1347        })
1348    }
1349
1350    /// Apply the clean conversion to `content` for `path`, reusing the cached
1351    /// attribute chain. Behaviourally identical to [`apply_clean_filter`].
1352    pub fn apply_clean_filter(
1353        &self,
1354        config: &GitConfig,
1355        path: &[u8],
1356        content: &[u8],
1357    ) -> Result<Vec<u8>> {
1358        let checks = self
1359            .matcher
1360            .attributes_for_path(path, &filter_attribute_names(), false);
1361        apply_clean_filter_with_attributes(config, &checks, path, content)
1362    }
1363}
1364
1365/// A reusable handle that captures a *tree's* `.gitattributes` chain once so
1366/// repeated smudge-filter calls (e.g. `git archive` streaming every blob in a
1367/// tree) resolve attributes from the tree being processed rather than the live
1368/// worktree.
1369///
1370/// This is the attribute direction `git archive` uses: upstream unpacks the
1371/// archived tree into a scratch index and sets `GIT_ATTR_INDEX`, so the
1372/// `.gitattributes` that govern conversion come from the *archived tree* (plus
1373/// the global/`core.attributesFile` chain and `$GIT_DIR/info/attributes`), not
1374/// from whatever happens to be checked out. `--worktree-attributes` callers
1375/// should use [`WorktreeAttributes`] instead.
1376///
1377/// Build it once with [`TreeAttributes::from_tree`], then call
1378/// [`TreeAttributes::apply_smudge_filter`] per blob. Behaviourally this mirrors
1379/// [`apply_smudge_filter`] except the attribute source is the supplied tree and
1380/// the expensive source scan is amortized across calls.
1381pub struct TreeAttributes {
1382    matcher: AttributeMatcher,
1383}
1384
1385impl TreeAttributes {
1386    /// Read the attribute sources for `tree_oid` once: the global /
1387    /// `core.attributesFile` chain, every `.gitattributes` blob found while
1388    /// walking `tree_oid`, and `$GIT_DIR/info/attributes`.
1389    ///
1390    /// `attr_root` locates the global config (`read_configured_attributes`);
1391    /// pass the worktree root for a non-bare repo, or the git dir for a bare
1392    /// one. `git_dir` locates `info/attributes` directly (so this works for bare
1393    /// repos, where there is no nested `.git`). No worktree `.gitattributes`
1394    /// files are read — use [`WorktreeAttributes`] for the
1395    /// `--worktree-attributes` direction.
1396    pub fn from_tree(
1397        attr_root: impl AsRef<Path>,
1398        git_dir: impl AsRef<Path>,
1399        db: &FileObjectDatabase,
1400        format: ObjectFormat,
1401        tree_oid: &ObjectId,
1402    ) -> Result<Self> {
1403        let attr_root = attr_root.as_ref();
1404        let git_dir = git_dir.as_ref();
1405        let mut matcher = AttributeMatcher::default();
1406        matcher.configure_case_sensitivity(git_dir);
1407        if !matcher.read_configured_attributes(attr_root, git_dir) {
1408            matcher.read_default_global_attributes();
1409        }
1410        collect_attribute_patterns_from_tree(db, format, tree_oid, Vec::new(), &mut matcher)?;
1411        read_attribute_patterns(
1412            git_dir.join("info").join("attributes"),
1413            &mut matcher,
1414            &[],
1415            b"info/attributes",
1416            false,
1417        );
1418        Ok(Self { matcher })
1419    }
1420
1421    /// Apply the smudge conversion (blob -> worktree: EOL `LF`->`CRLF` plus any
1422    /// configured `filter.<name>.smudge` driver) to `content` for `path`,
1423    /// reusing the cached attribute chain. Behaviourally identical to
1424    /// [`apply_smudge_filter`] except attributes come from the tree this handle
1425    /// was built from.
1426    pub fn apply_smudge_filter(
1427        &self,
1428        config: &GitConfig,
1429        path: &[u8],
1430        content: &[u8],
1431    ) -> Result<Vec<u8>> {
1432        let checks = self
1433            .matcher
1434            .attributes_for_path(path, &filter_attribute_names(), false);
1435        apply_smudge_filter_with_attributes(config, &checks, path, content)
1436    }
1437
1438    pub fn attributes_for_path(&self, path: &[u8], requested: &[Vec<u8>]) -> Vec<AttributeCheck> {
1439        self.matcher.attributes_for_path(path, requested, false)
1440    }
1441
1442    /// True when `path` has the `export-subst` attribute set (git's
1443    /// `check_attr_export_subst`), meaning `git archive` should run
1444    /// `$Format:…$` keyword substitution on its content.
1445    pub fn export_subst_for_path(&self, path: &[u8]) -> bool {
1446        self.attribute_is_set(path, b"export-subst")
1447    }
1448
1449    /// True when `path` has the `export-ignore` attribute set (git's
1450    /// `check_attr_export_ignore`), meaning `git archive` should omit the path
1451    /// (and, for a directory, its whole subtree) from the archive.
1452    pub fn export_ignore_for_path(&self, path: &[u8]) -> bool {
1453        self.attribute_is_set(path, b"export-ignore")
1454    }
1455
1456    fn attribute_is_set(&self, path: &[u8], attribute: &[u8]) -> bool {
1457        let requested = [attribute.to_vec()];
1458        let checks = self.matcher.attributes_for_path(path, &requested, false);
1459        matches!(
1460            checks.first().and_then(|check| check.state.as_ref()),
1461            Some(AttributeState::Set)
1462        )
1463    }
1464
1465    /// The `diff` attribute state for `path` (`Set` for `diff`, `Unset` for
1466    /// `-diff`, `Value(name)` for `diff=<name>`, `None` when unspecified). Used
1467    /// by `git archive`'s zip backend to classify text vs. binary via the
1468    /// path's userdiff driver.
1469    pub fn diff_attribute_for_path(&self, path: &[u8]) -> Option<AttributeState> {
1470        let requested = [b"diff".to_vec()];
1471        let checks = self.matcher.attributes_for_path(path, &requested, false);
1472        checks.into_iter().next().and_then(|check| check.state)
1473    }
1474}
1475
1476/// Like [`apply_clean_filter`] but takes already-resolved attribute checks,
1477/// letting callers that have computed attributes once reuse them.
1478pub fn apply_clean_filter_with_attributes(
1479    config: &GitConfig,
1480    attributes: &[AttributeCheck],
1481    path: &[u8],
1482    content: &[u8],
1483) -> Result<Vec<u8>> {
1484    Ok(apply_clean_filter_with_attributes_cow(config, attributes, path, content)?.into_owned())
1485}
1486
1487/// Borrow-first variant of [`apply_clean_filter_with_attributes`].
1488///
1489/// When no filter or EOL conversion changes the content, the returned value
1490/// borrows `content`; callers that can consume a [`Cow`] avoid allocating for
1491/// the common pass-through case.
1492pub fn apply_clean_filter_with_attributes_cow<'a>(
1493    config: &GitConfig,
1494    attributes: &[AttributeCheck],
1495    path: &[u8],
1496    content: &'a [u8],
1497) -> Result<Cow<'a, [u8]>> {
1498    apply_clean_filter_with_attributes_cow_safecrlf(
1499        config,
1500        attributes,
1501        path,
1502        content,
1503        ConvFlags::Off,
1504        SafeCrlfIndexBlob::None,
1505    )
1506}
1507
1508/// How the safecrlf check should learn whether this path's *current index blob*
1509/// already contains a CRLF (git's `has_crlf_in_index`). Only consulted on the
1510/// `text=auto` / `core.autocrlf` path.
1511pub enum SafeCrlfIndexBlob<'a> {
1512    /// No index blob is available (the staging caller has none, or safecrlf is
1513    /// off) — treated as "no CRLF in index".
1514    None,
1515    /// The path's current index blob, read on demand from this object database
1516    /// only when the auto-crlf decision actually needs it.
1517    Lookup {
1518        odb: &'a FileObjectDatabase,
1519        oid: ObjectId,
1520    },
1521}
1522
1523impl SafeCrlfIndexBlob<'_> {
1524    fn has_crlf(&self) -> bool {
1525        match self {
1526            SafeCrlfIndexBlob::None => false,
1527            SafeCrlfIndexBlob::Lookup { odb, oid } => has_crlf_in_index(odb, oid),
1528        }
1529    }
1530}
1531
1532/// [`apply_clean_filter_with_attributes_cow`] plus git's additive `core.safecrlf`
1533/// round-trip warning (convert.c `crlf_to_git`).
1534///
1535/// The conversion result is byte-for-byte identical to the plain variant;
1536/// `flags`/`index_blob` only drive the stderr warning git prints when a
1537/// CRLF<->LF round-trip would not be reversible. The warning is computed on the
1538/// *post-driver, pre-EOL-conversion* content, matching git's ordering in
1539/// `convert_to_git` (apply_filter -> crlf_to_git).
1540pub fn apply_clean_filter_with_attributes_cow_safecrlf<'a>(
1541    config: &GitConfig,
1542    attributes: &[AttributeCheck],
1543    path: &[u8],
1544    content: &'a [u8],
1545    flags: ConvFlags,
1546    index_blob: SafeCrlfIndexBlob<'_>,
1547) -> Result<Cow<'a, [u8]>> {
1548    // Non-object-writing callers (diff/status comparison): an encoding failure
1549    // is reported but not fatal.
1550    apply_clean_filter_cow_inner(config, attributes, path, content, flags, index_blob, false)
1551}
1552
1553/// Clean conversion core. `write_object` is set on the paths that hash content
1554/// into the object database (add / hash-object): there, an invalid
1555/// `working-tree-encoding` (bad BOM, undecodable bytes) is fatal, mirroring
1556/// convert.c's `CONV_WRITE_OBJECT` die.
1557pub(crate) fn apply_clean_filter_cow_inner<'a>(
1558    config: &GitConfig,
1559    attributes: &[AttributeCheck],
1560    path: &[u8],
1561    content: &'a [u8],
1562    flags: ConvFlags,
1563    index_blob: SafeCrlfIndexBlob<'_>,
1564    write_object: bool,
1565) -> Result<Cow<'a, [u8]>> {
1566    let plan = ContentFilterPlan::resolve(config, attributes);
1567    check_wt_encoding_valid(&plan.encoding)?;
1568    let mut data = Cow::Borrowed(content);
1569    if let Some(driver) = &plan.driver {
1570        data = run_driver(driver, driver.clean.as_deref(), "clean", None, path, data)?;
1571    }
1572    // encode_to_git runs before the EOL pass (convert.c order: filter →
1573    // encode_to_git → crlf_to_git): the worktree charset is decoded to UTF-8 so
1574    // the line-ending stats and conversion below see real LF/CRLF bytes.
1575    data = encode_to_git(&plan.encoding, path, data, write_object)?;
1576    // The safecrlf check scans the (post-driver) buffer once for line-ending
1577    // stats. Gate it tightly so the extra scan never runs on the dominant
1578    // pass-through paths: only when safecrlf is enabled, the path is a real
1579    // conversion candidate (not `CRLF_BINARY`), and the buffer is non-empty.
1580    if flags != ConvFlags::Off && !data.is_empty() && plan.safecrlf_applies() {
1581        let old_stats = gather_convert_stats(&data);
1582        plan.check_safe_crlf_stats(&old_stats, index_blob.has_crlf(), flags, path)?;
1583    }
1584    if plan.convert_eol(&data) {
1585        data = convert_crlf_to_lf_cow(data);
1586    }
1587    if plan.ident {
1588        data = ident_to_git_cow(data);
1589    }
1590    Ok(data)
1591}
1592
1593/// Apply the *smudge* conversion to `content` for `path` (blob -> worktree):
1594/// first LF->CRLF when EOL conversion applies, then the configured
1595/// `filter.<name>.smudge` driver (if any).
1596///
1597/// Semantics mirror [`apply_clean_filter`]: a required driver with a missing or
1598/// failing `smudge` command errors, while a non-required one passes the content
1599/// through.
1600pub fn apply_smudge_filter(
1601    worktree_root: impl AsRef<Path>,
1602    git_dir: impl AsRef<Path>,
1603    format: ObjectFormat,
1604    config: &GitConfig,
1605    path: &[u8],
1606    content: &[u8],
1607) -> Result<Vec<u8>> {
1608    // On smudge (checkout) the worktree file may not exist yet, so resolve the
1609    // attributes from the `.gitattributes` recorded in the index.
1610    let checks =
1611        smudge_attribute_checks_from_index(worktree_root.as_ref(), git_dir.as_ref(), format, path)?;
1612    Ok(
1613        apply_smudge_filter_with_attributes_cow_format(config, &checks, path, content, format)?
1614            .into_owned(),
1615    )
1616}
1617
1618/// Like [`apply_smudge_filter`] but takes already-resolved attribute checks.
1619pub fn apply_smudge_filter_with_attributes(
1620    config: &GitConfig,
1621    attributes: &[AttributeCheck],
1622    path: &[u8],
1623    content: &[u8],
1624) -> Result<Vec<u8>> {
1625    Ok(apply_smudge_filter_with_attributes_cow(config, attributes, path, content)?.into_owned())
1626}
1627
1628/// Borrow-first variant of [`apply_smudge_filter_with_attributes`].
1629///
1630/// When no filter or EOL conversion changes the content, the returned value
1631/// borrows `content`; callers that can consume a [`Cow`] avoid allocating for
1632/// the common pass-through case.
1633pub fn apply_smudge_filter_with_attributes_cow<'a>(
1634    config: &GitConfig,
1635    attributes: &[AttributeCheck],
1636    path: &[u8],
1637    content: &'a [u8],
1638) -> Result<Cow<'a, [u8]>> {
1639    apply_smudge_filter_with_attributes_cow_format(
1640        config,
1641        attributes,
1642        path,
1643        content,
1644        ObjectFormat::Sha1,
1645    )
1646}
1647
1648pub(crate) fn apply_smudge_filter_with_attributes_cow_format<'a>(
1649    config: &GitConfig,
1650    attributes: &[AttributeCheck],
1651    path: &[u8],
1652    content: &'a [u8],
1653    format: ObjectFormat,
1654) -> Result<Cow<'a, [u8]>> {
1655    let plan = ContentFilterPlan::resolve(config, attributes);
1656    check_wt_encoding_valid(&plan.encoding)?;
1657    let mut data = Cow::Borrowed(content);
1658    if plan.ident {
1659        data = ident_to_worktree_cow(format, data)?;
1660    }
1661    if plan.eol == EolConversion::Crlf
1662        && plan.convert_eol(&data)
1663        && plan.will_convert_lf_to_crlf(&data)
1664    {
1665        data = Cow::Owned(convert_lf_to_crlf(&data));
1666    }
1667    // encode_to_worktree runs after the EOL pass (convert.c order:
1668    // crlf_to_worktree → encode_to_worktree → smudge filter): the UTF-8 blob is
1669    // line-ending-converted, then reencoded into the worktree charset.
1670    data = encode_to_worktree(&plan.encoding, path, data)?;
1671    if let Some(driver) = &plan.driver {
1672        data = run_driver(
1673            driver,
1674            driver.smudge.as_deref(),
1675            "smudge",
1676            Some(format),
1677            path,
1678            data,
1679        )?;
1680    }
1681    Ok(data)
1682}
1683
1684/// Execute one direction of a driver filter, honouring the `required` flag.
1685pub(crate) fn run_driver<'a>(
1686    driver: &FilterDriver,
1687    command: Option<&str>,
1688    direction: &str,
1689    format: Option<ObjectFormat>,
1690    path: &[u8],
1691    content: Cow<'a, [u8]>,
1692) -> Result<Cow<'a, [u8]>> {
1693    if let Some(process) = &driver.process {
1694        let blob = if direction == "smudge" {
1695            match format {
1696                Some(format) => {
1697                    Some(EncodedObject::new(ObjectType::Blob, content.to_vec()).object_id(format)?)
1698                }
1699                None => None,
1700            }
1701        } else {
1702            None
1703        };
1704        match run_process_filter(process, direction, path, &content, blob) {
1705            Ok(ProcessFilterOutcome::Filtered(output)) => return Ok(Cow::Owned(output)),
1706            Ok(ProcessFilterOutcome::Unsupported) => {}
1707            Ok(ProcessFilterOutcome::Status(status)) => {
1708                if driver.required {
1709                    return Err(GitError::Command(format!(
1710                        "external filter '{}' returned status {status}",
1711                        process
1712                    )));
1713                }
1714                return Ok(content);
1715            }
1716            Err(err) => {
1717                if err.protocol {
1718                    eprintln!("error: external filter '{}' failed", process);
1719                }
1720                if driver.required {
1721                    return Err(GitError::Command(err.message));
1722                }
1723                return Ok(content);
1724            }
1725        }
1726    }
1727    let Some(command) = command else {
1728        // No command in this direction. Required filters must error; optional
1729        // ones pass content through unchanged.
1730        if driver.required {
1731            let path = String::from_utf8_lossy(path);
1732            let name = String::from_utf8_lossy(&driver.name);
1733            if direction == "clean" {
1734                eprintln!("fatal: {path}: clean filter '{name}' failed");
1735            } else {
1736                eprintln!("fatal: {path}: smudge filter {name} failed");
1737            }
1738            return Err(GitError::Exit(128));
1739        }
1740        return Ok(content);
1741    };
1742    match run_filter_command(command, path, &content) {
1743        Ok(output) => Ok(Cow::Owned(output)),
1744        Err(err) => {
1745            if driver.required {
1746                Err(err)
1747            } else {
1748                // Non-required filter failure: fall back to the unfiltered
1749                // content, matching git's behaviour.
1750                Ok(content)
1751            }
1752        }
1753    }
1754}
1755
1756/// Compute the attributes relevant to content filtering (`text`, `eol`,
1757/// `filter`) for `path` from the worktree `.gitattributes` chain.
1758pub(crate) fn filter_attribute_checks(
1759    worktree_root: &Path,
1760    path: &[u8],
1761) -> Result<Vec<AttributeCheck>> {
1762    let requested = filter_attribute_names();
1763    let mut matcher = AttributeMatcher::default();
1764    let git_dir = worktree_root.join(".git");
1765    matcher.configure_case_sensitivity(&git_dir);
1766    if !matcher.read_configured_attributes(worktree_root, &git_dir) {
1767        matcher.read_default_global_attributes();
1768    }
1769    read_dir_attribute_patterns_for_base(worktree_root, &[], &mut matcher)?;
1770    let mut prefix = Vec::new();
1771    let mut parts = path.split(|byte| *byte == b'/').peekable();
1772    while let Some(part) = parts.next() {
1773        if parts.peek().is_none() {
1774            break;
1775        }
1776        if !prefix.is_empty() {
1777            prefix.push(b'/');
1778        }
1779        prefix.extend_from_slice(part);
1780        let dir = worktree_root.join(repo_path_to_os_path(&prefix)?);
1781        read_dir_attribute_patterns_for_base(&dir, &prefix, &mut matcher)?;
1782    }
1783    read_attribute_patterns(
1784        worktree_root.join(".git").join("info").join("attributes"),
1785        &mut matcher,
1786        &[],
1787        b".git/info/attributes",
1788        false,
1789    );
1790    Ok(matcher.attributes_for_path(path, &requested, false))
1791}
1792
1793/// Compute filtering attributes for a checkout (blob -> worktree).
1794///
1795/// `git checkout -- <pathspec>` / `git restore` materialize through git's
1796/// **default** attr direction, which is `GIT_ATTR_CHECKIN` (attr.c: the static
1797/// `direction` is zero-initialized and `builtin/checkout.c` never overrides it
1798/// for the pathspec path). Under that direction `read_attr` reads each
1799/// `.gitattributes` frame from the **worktree file first**, falling back to the
1800/// staged blob only when no worktree file exists at that directory level
1801/// (sparse-checkout). This is the precedence the smudge filter must use:
1802/// t0027 commits an *empty* root `.gitattributes`, then overwrites the worktree
1803/// copy with `*.txt text eol=crlf` *without re-staging* — and git's checkout
1804/// still honours the worktree copy. Reading the index alone (or index-first)
1805/// made checkout under-convert line endings, because the staged blob was empty.
1806pub(crate) fn smudge_attribute_checks_from_index(
1807    worktree_root: &Path,
1808    git_dir: &Path,
1809    format: ObjectFormat,
1810    path: &[u8],
1811) -> Result<Vec<AttributeCheck>> {
1812    let requested = filter_attribute_names();
1813    let mut matcher = AttributeMatcher::default();
1814    matcher.configure_case_sensitivity(git_dir);
1815    if !matcher.read_configured_attributes(worktree_root, git_dir) {
1816        matcher.read_default_global_attributes();
1817    }
1818
1819    // Build the set of `.gitattributes` blobs the index carries, keyed by the
1820    // directory they govern, so each ancestry frame can prefer the staged copy.
1821    let index_attributes = index_gitattributes_by_base(git_dir, format)?;
1822
1823    // Walk root -> ... -> the file's parent directory, folding each frame's
1824    // `.gitattributes` in shallow-to-deep order so deeper directories win.
1825    fold_checkout_attribute_frame(worktree_root, &[], &index_attributes, &mut matcher)?;
1826    let mut prefix = Vec::new();
1827    let mut parts = path.split(|byte| *byte == b'/').peekable();
1828    while let Some(part) = parts.next() {
1829        if parts.peek().is_none() {
1830            break;
1831        }
1832        if !prefix.is_empty() {
1833            prefix.push(b'/');
1834        }
1835        prefix.extend_from_slice(part);
1836        let dir = worktree_root.join(repo_path_to_os_path(&prefix)?);
1837        fold_checkout_attribute_frame(&dir, &prefix, &index_attributes, &mut matcher)?;
1838    }
1839
1840    read_attribute_patterns(
1841        worktree_root.join(".git").join("info").join("attributes"),
1842        &mut matcher,
1843        &[],
1844        b".git/info/attributes",
1845        false,
1846    );
1847    Ok(matcher.attributes_for_path(path, &requested, false))
1848}
1849
1850/// Fold the `.gitattributes` governing directory `base` (whose on-disk location
1851/// is `dir`) into `matcher`, preferring the worktree file and falling back to
1852/// the staged blob. Mirrors one attr-stack frame under `GIT_ATTR_CHECKIN`
1853/// (git's default direction, used by `checkout -- <pathspec>` / `restore`).
1854pub(crate) fn fold_checkout_attribute_frame(
1855    dir: &Path,
1856    base: &[u8],
1857    index_attributes: &BTreeMap<Vec<u8>, Vec<u8>>,
1858    matcher: &mut AttributeMatcher,
1859) -> Result<()> {
1860    let worktree_file = dir.join(".gitattributes");
1861    let source = attribute_source_for_base(base);
1862    if let Ok(contents) = fs::read(&worktree_file) {
1863        // A worktree `.gitattributes` exists at this level: it wins outright
1864        // (git only consults the index when the worktree file is absent).
1865        read_attribute_patterns_from_bytes(&contents, matcher, base, &source);
1866    } else if let Some(contents) = index_attributes.get(base) {
1867        read_attribute_patterns_from_bytes(contents, matcher, base, &source);
1868    }
1869    Ok(())
1870}
1871
1872/// Read every staged `.gitattributes` blob, keyed by the repo-relative directory
1873/// it governs (`""` for the worktree root). Stage-0 blob entries only.
1874pub(crate) fn index_gitattributes_by_base(
1875    git_dir: &Path,
1876    format: ObjectFormat,
1877) -> Result<BTreeMap<Vec<u8>, Vec<u8>>> {
1878    let mut map = BTreeMap::new();
1879    let index_path = repository_index_path(git_dir);
1880    if !index_path.exists() {
1881        return Ok(map);
1882    }
1883    let db = FileObjectDatabase::from_git_dir(git_dir, format);
1884    let entries = Index::parse(&fs::read(index_path)?, format)?.entries;
1885    for entry in entries {
1886        let is_attributes_file =
1887            entry.path == b".gitattributes" || entry.path.as_bytes().ends_with(b"/.gitattributes");
1888        if index_entry_stage(&entry) != 0
1889            || tree_entry_object_type(entry.mode) != ObjectType::Blob
1890            || !is_attributes_file
1891        {
1892            continue;
1893        }
1894        let base = match entry.path.as_bytes().strip_suffix(b".gitattributes") {
1895            Some(b"") => Vec::new(),
1896            Some(parent) => parent.strip_suffix(b"/").unwrap_or(parent).to_vec(),
1897            None => continue,
1898        };
1899        let object = db
1900            .read_object(&entry.oid)
1901            .map_err(|err| expect_missing_object_kind(err, entry.oid, MissingObjectKind::Blob))?;
1902        if object.object_type == ObjectType::Blob {
1903            map.insert(base, object.body.clone());
1904        }
1905    }
1906    Ok(map)
1907}
1908
/// The attribute names the content-filter resolvers ask the matcher for, in
/// request order.
pub(crate) fn filter_attribute_names() -> Vec<Vec<u8>> {
    // `crlf` is git's legacy alias for `text` (convert.c registers both); it is
    // consulted as a fallback when `text` is unspecified, so it has to be
    // resolved alongside the others.
    const NAMES: [&[u8]; 6] = [
        b"text",
        b"crlf",
        b"ident",
        b"eol",
        b"filter",
        b"working-tree-encoding",
    ];
    NAMES.iter().map(|name| name.to_vec()).collect()
}
1921
1922// ---------------------------------------------------------------------------
1923// `ls-files --eol` line-ending information
1924//
1925// Git's `git ls-files --eol` prints, for each path, three fields:
1926//   i/<stat>  — line-ending statistics of the *index* blob content
1927//   w/<stat>  — line-ending statistics of the *worktree* file content
1928//   attr/<a>  — the resolved crlf/eol attribute action (attributes only, no
1929//               config) — `get_convert_attr_ascii` in convert.c
1930// The two stat fields mirror `gather_convert_stats_ascii`; the attr field
1931// mirrors `convert_attrs` up to `ca->attr_action` (i.e. *before* the config
1932// derived `text` -> input/crlf substitution and the `core.autocrlf` fallback).
1933// ---------------------------------------------------------------------------
1934
/// Line-ending statistics of a byte buffer, mirroring convert.c `gather_stats`.
#[derive(Clone)]
pub(crate) struct ConvertStats {
    /// Number of NUL bytes.
    nul: u32,
    /// Number of `\r` bytes not followed by `\n`.
    lonecr: u32,
    /// Number of `\n` bytes not preceded by `\r`.
    lonelf: u32,
    /// Number of `\r\n` pairs.
    crlf: u32,
    /// Number of bytes classified as printable.
    printable: u32,
    /// Number of bytes classified as non-printable.
    nonprintable: u32,
}

/// Scan `buf` once and classify every byte into [`ConvertStats`].
///
/// `\r\n` counts as a single CRLF; a `\r` or `\n` on its own counts as a lone
/// CR / lone LF. Of the remaining bytes, DEL and the control characters other
/// than BS, HT, FF and ESC are non-printable (NUL is additionally counted in
/// `nul`); everything else is printable.
pub(crate) fn gather_convert_stats(buf: &[u8]) -> ConvertStats {
    let mut stats = ConvertStats {
        nul: 0,
        lonecr: 0,
        lonelf: 0,
        crlf: 0,
        printable: 0,
        nonprintable: 0,
    };

    let mut bytes = buf.iter().copied().peekable();
    while let Some(byte) = bytes.next() {
        match byte {
            b'\r' => {
                // Consume the LF of a CRLF pair so it is not counted twice.
                if bytes.next_if_eq(&b'\n').is_some() {
                    stats.crlf += 1;
                } else {
                    stats.lonecr += 1;
                }
            }
            b'\n' => stats.lonelf += 1,
            // DEL
            0x7f => stats.nonprintable += 1,
            0x00 => {
                stats.nul += 1;
                stats.nonprintable += 1;
            }
            // BS, HT, FF and ESC are printable.
            0x08 | 0x09 | 0x0c | 0x1b => stats.printable += 1,
            // Every other control character.
            0x01..=0x1f => stats.nonprintable += 1,
            _ => stats.printable += 1,
        }
    }

    // A trailing EOF (^Z, 0x1a) is not counted as non-printable.
    if buf.ends_with(&[0x1a]) {
        stats.nonprintable = stats.nonprintable.saturating_sub(1);
    }
    stats
}
1997
1998/// Mirror of convert.c `has_crlf_in_index`: whether the blob currently recorded
1999/// in the index for this path is non-binary text containing a CRLF. Used only by
2000/// the auto-crlf safecrlf decision to keep an already-CRLF index blob from being
2001/// silently collapsed. A missing/unreadable blob (or a non-blob entry) counts as
2002/// "no CRLF", matching git's `read_blob_data_from_index` returning NULL.
2003pub(crate) fn has_crlf_in_index(odb: &FileObjectDatabase, oid: &ObjectId) -> bool {
2004    let Ok(object) = odb.read_object(oid) else {
2005        return false;
2006    };
2007    if object.object_type != ObjectType::Blob {
2008        return false;
2009    }
2010    let data = &object.body;
2011    // git short-circuits on the first '\r' via memchr before gathering stats.
2012    if !data.contains(&b'\r') {
2013        return false;
2014    }
2015    let stats = gather_convert_stats(data);
2016    !convert_is_binary(&stats) && stats.crlf > 0
2017}
2018
2019/// Mirror of convert.c `convert_is_binary`: a lone CR or NUL, or a high
2020/// non-printable ratio, marks the content as binary.
2021pub(crate) fn convert_is_binary(stats: &ConvertStats) -> bool {
2022    if stats.lonecr > 0 {
2023        return true;
2024    }
2025    if stats.nul > 0 {
2026        return true;
2027    }
2028    (stats.printable >> 7) < stats.nonprintable
2029}
2030
/// The `core.safecrlf` round-trip-warning mode, mirroring git's
/// `global_conv_flags_eol` (environment.c). git's *default* — when
/// `core.safecrlf` is unset — is [`ConvFlags::Warn`], so the warning fires even
/// without any explicit config.
///
/// Use [`ConvFlags::from_config`] to resolve the mode from a [`GitConfig`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ConvFlags {
    /// `core.safecrlf=false`: never warn; the safecrlf check is skipped.
    Off,
    /// `core.safecrlf=warn` (and the unset default): emit a warning on stderr
    /// when a CRLF<->LF round-trip would not be reversible.
    Warn,
    /// `core.safecrlf=true`: die (exit 128) instead of warning.
    Die,
}
2045
2046impl ConvFlags {
2047    /// Resolve `core.safecrlf` from config, mirroring environment.c
2048    /// `git_default_core_config`: `warn` -> [`ConvFlags::Warn`], a boolean-true
2049    /// value -> [`ConvFlags::Die`], a boolean-false value -> [`ConvFlags::Off`].
2050    /// When the key is absent git leaves `global_conv_flags_eol` at its initial
2051    /// [`ConvFlags::Warn`], so unset also resolves to [`ConvFlags::Warn`].
2052    pub fn from_config(config: &GitConfig) -> Self {
2053        match config.get("core", None, "safecrlf") {
2054            Some(value) if value.eq_ignore_ascii_case("warn") => ConvFlags::Warn,
2055            Some(_) => {
2056                if config.get_bool("core", None, "safecrlf") == Some(true) {
2057                    ConvFlags::Die
2058                } else {
2059                    ConvFlags::Off
2060                }
2061            }
2062            None => ConvFlags::Warn,
2063        }
2064    }
2065}
2066
2067/// Mirror of convert.c `check_global_conv_flags_eol`: compare the pre-conversion
2068/// `old_stats` against the simulated round-trip `new_stats` and, when the
2069/// CRLF/LF content would not survive a clean+smudge cycle, warn (or die under
2070/// `core.safecrlf=true`).
2071///
2072/// Returns `Err(GitError::Exit(128))` when `flags` is [`ConvFlags::Die`] and the
2073/// round-trip is irreversible (git `die`s with exit 128 here); otherwise prints
2074/// the warning to stderr and returns `Ok(())`. This is a pure stderr-side
2075/// effect: it never changes the bytes written to the object store.
2076pub(crate) fn check_safe_crlf(
2077    old_stats: &ConvertStats,
2078    new_stats: &ConvertStats,
2079    flags: ConvFlags,
2080    path: &[u8],
2081) -> Result<()> {
2082    if flags == ConvFlags::Off {
2083        return Ok(());
2084    }
2085    let display = String::from_utf8_lossy(path);
2086    if old_stats.crlf > 0 && new_stats.crlf == 0 {
2087        // CRLFs would not be restored by checkout.
2088        match flags {
2089            ConvFlags::Die => {
2090                eprintln!("fatal: CRLF would be replaced by LF in {display}");
2091                return Err(GitError::Exit(128));
2092            }
2093            ConvFlags::Warn => {
2094                eprintln!(
2095                    "warning: in the working copy of '{display}', CRLF will be replaced by LF the next time Git touches it"
2096                );
2097            }
2098            ConvFlags::Off => unreachable!("handled above"),
2099        }
2100    } else if old_stats.lonelf > 0 && new_stats.lonelf == 0 {
2101        // CRLFs would be added by checkout.
2102        match flags {
2103            ConvFlags::Die => {
2104                eprintln!("fatal: LF would be replaced by CRLF in {display}");
2105                return Err(GitError::Exit(128));
2106            }
2107            ConvFlags::Warn => {
2108                eprintln!(
2109                    "warning: in the working copy of '{display}', LF will be replaced by CRLF the next time Git touches it"
2110                );
2111            }
2112            ConvFlags::Off => unreachable!("handled above"),
2113        }
2114    }
2115    Ok(())
2116}
2117
2118/// Compute the `i/` or `w/` stat string for `content`, mirroring
2119/// convert.c `gather_convert_stats_ascii`.
2120pub(crate) fn convert_stats_ascii(content: &[u8]) -> &'static str {
2121    if content.is_empty() {
2122        return "none";
2123    }
2124    let stats = gather_convert_stats(content);
2125    if convert_is_binary(&stats) {
2126        return "-text";
2127    }
2128    match (stats.lonelf > 0, stats.crlf > 0) {
2129        (true, false) => "lf",
2130        (false, true) => "crlf",
2131        (true, true) => "mixed",
2132        (false, false) => "none",
2133    }
2134}
2135
2136/// The resolved crlf/eol attribute action for a path, mirroring convert.c
2137/// `convert_attrs` up to `ca->attr_action` (attributes only, no config), and
2138/// `get_convert_attr_ascii` for the ascii spelling.
2139pub(crate) fn convert_attr_ascii(checks: &[AttributeCheck]) -> &'static str {
2140    fn state_of<'a>(checks: &'a [AttributeCheck], name: &[u8]) -> Option<&'a AttributeState> {
2141        checks
2142            .iter()
2143            .find(|check| check.attribute == name)
2144            .and_then(|check| check.state.as_ref())
2145    }
2146
2147    // git_path_check_crlf: ATTR_TRUE -> TEXT, ATTR_FALSE -> BINARY,
2148    // ATTR_UNSET -> (fall through), "input" -> TEXT_INPUT, "auto" -> AUTO,
2149    // anything else -> UNDEFINED.
2150    #[derive(Clone, Copy, PartialEq)]
2151    enum Action {
2152        Undefined,
2153        Binary,
2154        Text,
2155        TextInput,
2156        TextCrlf,
2157        Auto,
2158        AutoCrlf,
2159        AutoInput,
2160    }
2161    fn check_crlf(state: Option<&AttributeState>) -> Action {
2162        match state {
2163            Some(AttributeState::Set) => Action::Text,
2164            Some(AttributeState::Unset) => Action::Binary,
2165            Some(AttributeState::Value(value)) if value == b"input" => Action::TextInput,
2166            Some(AttributeState::Value(value)) if value == b"auto" => Action::Auto,
2167            // ATTR_UNSET / any other value -> CRLF_UNDEFINED.
2168            _ => Action::Undefined,
2169        }
2170    }
2171
2172    // Resolve from the `text` attribute, then fall back to the legacy `crlf`
2173    // alias only when `text` left the action undefined.
2174    let mut action = check_crlf(state_of(checks, b"text"));
2175    if action == Action::Undefined {
2176        action = check_crlf(state_of(checks, b"crlf"));
2177    }
2178
2179    if action != Action::Binary {
2180        // git_path_check_eol: only "lf"/"crlf" values matter.
2181        let eol = match state_of(checks, b"eol") {
2182            Some(AttributeState::Value(value)) if value == b"lf" => Some(false),
2183            Some(AttributeState::Value(value)) if value == b"crlf" => Some(true),
2184            _ => None,
2185        };
2186        action = match (action, eol) {
2187            (Action::Auto, Some(false)) => Action::AutoInput,
2188            (Action::Auto, Some(true)) => Action::AutoCrlf,
2189            (_, Some(false)) if action != Action::Auto => Action::TextInput,
2190            (_, Some(true)) if action != Action::Auto => Action::TextCrlf,
2191            _ => action,
2192        };
2193    }
2194
2195    match action {
2196        Action::Undefined => "",
2197        Action::Binary => "-text",
2198        Action::Text => "text",
2199        Action::TextInput => "text eol=lf",
2200        Action::TextCrlf => "text eol=crlf",
2201        Action::Auto => "text=auto",
2202        Action::AutoCrlf => "text=auto eol=crlf",
2203        Action::AutoInput => "text=auto eol=lf",
2204    }
2205}
2206
/// The three fields `ls-files --eol` reports for one path.
pub struct EolInfo {
    /// Stat word of the index blob (`i/...`); empty when there is no index blob.
    pub index: &'static str,
    /// Stat word of the worktree file (`w/...`); empty when the file is absent.
    pub worktree: &'static str,
    /// Resolved crlf/eol attribute action (`attr/...`).
    pub attr: &'static str,
}

impl EolInfo {
    /// Render git's `ls-files --eol` prefix, `i/%-5s w/%-5s attr/%-17s\t`:
    /// each field is left-aligned and space-padded to its width (longer values
    /// are not truncated), and the prefix ends with a tab.
    pub fn format_prefix(&self) -> String {
        let Self {
            index,
            worktree,
            attr,
        } = self;
        format!("i/{:<5} w/{:<5} attr/{:<17}\t", index, worktree, attr)
    }
}
2226
2227/// Compute the `ls-files --eol` info for `path`.
2228///
2229/// `index_content` is the raw index blob bytes (None when the path has no
2230/// index entry or is not a regular file). The worktree file is read from
2231/// `worktree_root/path`; if it is absent or not a regular file the `w/` field
2232/// is empty. Attributes are resolved from the worktree `.gitattributes` chain
2233/// via `attr_checks`.
2234pub fn eol_info_for_path(
2235    worktree_root: impl AsRef<Path>,
2236    path: &[u8],
2237    index_content: Option<&[u8]>,
2238    attr_checks: &[AttributeCheck],
2239) -> EolInfo {
2240    let index = index_content.map(convert_stats_ascii).unwrap_or("");
2241
2242    let worktree_root = worktree_root.as_ref();
2243    let worktree = match repo_path_to_os_path(path) {
2244        Ok(rel) => {
2245            let absolute = worktree_root.join(rel);
2246            match fs::symlink_metadata(&absolute) {
2247                // git: only regular files get a `w/` stat (lstat + S_ISREG).
2248                Ok(meta) if meta.file_type().is_file() => match fs::read(&absolute) {
2249                    Ok(content) => convert_stats_ascii_owned(&content),
2250                    Err(_) => "",
2251                },
2252                _ => "",
2253            }
2254        }
2255        Err(_) => "",
2256    };
2257
2258    let attr = convert_attr_ascii(attr_checks);
2259
2260    EolInfo {
2261        index,
2262        worktree,
2263        attr,
2264    }
2265}
2266
/// `convert_stats_ascii` for a buffer the caller owns and is about to drop.
///
/// Forwards directly to [`convert_stats_ascii`]. The result is a `'static`
/// str, so it does not borrow from `content` and the buffer can be dropped
/// right after the call.
pub(crate) fn convert_stats_ascii_owned(content: &[u8]) -> &'static str {
    convert_stats_ascii(content)
}
2272
2273/// Resolve the crlf/eol/text/filter attributes for `path` from the worktree
2274/// `.gitattributes` chain (the set `ls-files --eol` needs for its `attr/`
2275/// field).
2276pub fn eol_attribute_checks(
2277    worktree_root: impl AsRef<Path>,
2278    path: &[u8],
2279) -> Result<Vec<AttributeCheck>> {
2280    filter_attribute_checks(worktree_root.as_ref(), path)
2281}