Skip to main content

objects/util/
git_tree_name.rs

1// SPDX-License-Identifier: Apache-2.0
2//! Git tree-entry name classification shared by import engines.
3
4use crate::object::validate_tree_entry_name;
5
/// Outcome of [`classify_git_tree_name`] for a single raw Git tree-entry name.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum GitTreeNameClassification {
    /// The raw bytes were valid UTF-8 and the name passed
    /// `validate_tree_entry_name`; carries the name unchanged as a `String`.
    Representable(String),
    /// The name could not be kept as-is: either it was not valid UTF-8, or the
    /// validator rejected it. The payload records which and why.
    NeedsLossy(GitTreeNameLossy),
}
11
/// Details attached to a name that [`classify_git_tree_name`] could not
/// classify as representable.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct GitTreeNameLossy {
    /// The name as a `String`: the original text when the input was valid
    /// UTF-8, otherwise the lossy decoding with U+FFFD replacement characters.
    /// For `Dropped` entries this is the name the validator rejected.
    pub name: String,
    /// Which lossy outcome the classifier chose for this name.
    pub action: GitTreeNameLossyAction,
    /// Fixed, human-readable explanation matching `action`.
    pub reason: &'static str,
}
18
/// The lossy outcome chosen for a tree-entry name that is not representable
/// as-is.
// NOTE(review): how import engines act on each variant is not visible in this
// file — confirm against the callers.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum GitTreeNameLossyAction {
    /// `validate_tree_entry_name` rejected the name (after any UTF-8
    /// replacement), so it must not be stored.
    Dropped,
    /// The raw bytes were not valid UTF-8; the name was converted with
    /// replacement characters and the converted name passed validation.
    Converted,
}
24
25pub fn classify_git_tree_name(raw_name: &[u8]) -> GitTreeNameClassification {
26    let (name, utf8_lossy) = match std::str::from_utf8(raw_name) {
27        Ok(name) => (name.to_string(), false),
28        Err(_) => (String::from_utf8_lossy(raw_name).into_owned(), true),
29    };
30
31    // Validate the FINAL name (after any UTF-8 replacement) against the
32    // canonical tree-name validator, so this classifier's representable set
33    // can never drift from what Heddle will actually store (path separators
34    // '/' and '\', '.'/'..', control bytes, empty). Critically, a name that
35    // is invalid UTF-8 AND otherwise unrepresentable (e.g. `bad\<0xff>` ->
36    // lossy `bad\<U+FFFD>` still containing a backslash) must be Dropped, not
37    // silently persisted as Converted.
38    match (validate_tree_entry_name(&name), utf8_lossy) {
39        (Ok(()), false) => GitTreeNameClassification::Representable(name),
40        (Ok(()), true) => GitTreeNameClassification::NeedsLossy(GitTreeNameLossy {
41            name,
42            action: GitTreeNameLossyAction::Converted,
43            reason: "tree entry name is not valid UTF-8 and was converted with replacement characters",
44        }),
45        (Err(_), _) => GitTreeNameClassification::NeedsLossy(GitTreeNameLossy {
46            name,
47            action: GitTreeNameLossyAction::Dropped,
48            reason: "tree entry name is not representable in Heddle",
49        }),
50    }
51}
52
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the classifier and reports whether it answered `Representable`.
    fn is_representable(raw: &[u8]) -> bool {
        match classify_git_tree_name(raw) {
            GitTreeNameClassification::Representable(_) => true,
            GitTreeNameClassification::NeedsLossy(_) => false,
        }
    }

    /// Drift guard: for every sample, the classifier says `Representable`
    /// exactly when `validate_tree_entry_name` accepts the name. Any
    /// divergence between the two (as once happened for `\\`) fails here.
    #[test]
    fn representable_iff_validator_accepts() {
        let cases = [
            "ok.txt",
            "with space",
            "ünïcödé",
            "",
            ".",
            "..",
            "a/b",
            "a\\b",
            "ctrl\u{0001}",
            "del\u{7f}",
        ];
        for c in cases {
            assert_eq!(
                is_representable(c.as_bytes()),
                validate_tree_entry_name(c).is_ok(),
                "classifier/validator disagree on {c:?}"
            );
        }
    }

    /// A backslash in the name must keep it out of the representable set.
    #[test]
    fn backslash_name_is_not_representable() {
        assert!(!is_representable(b"foo\\bar"));
    }

    /// Invalid UTF-8 alone is recoverable: the entry is converted, not dropped.
    #[test]
    fn invalid_utf8_is_converted_not_dropped() {
        let verdict = classify_git_tree_name(&[b'a', 0xff, b'b']);
        match verdict {
            GitTreeNameClassification::NeedsLossy(GitTreeNameLossy { action, .. }) => {
                assert_eq!(action, GitTreeNameLossyAction::Converted);
            }
            other => panic!("expected NeedsLossy/Converted, got {other:?}"),
        }
    }

    /// `bad\<0xff>` is invalid UTF-8 and also contains a backslash. Lossy
    /// decoding replaces the 0xff byte but leaves the backslash in place, so
    /// the converted name is still rejected by the validator and the entry
    /// has to be Dropped instead of being persisted as Converted.
    #[test]
    fn invalid_utf8_that_stays_unrepresentable_after_conversion_is_dropped() {
        let verdict = classify_git_tree_name(b"bad\\\xff");
        match verdict {
            GitTreeNameClassification::NeedsLossy(GitTreeNameLossy { action, .. }) => {
                assert_eq!(action, GitTreeNameLossyAction::Dropped);
            }
            other => panic!("expected NeedsLossy/Dropped, got {other:?}"),
        }
    }
}