Skip to main content

grit_lib/
commit_encoding.rs

1//! Git commit encoding labels (`encoding` header, `i18n.commitEncoding`) mapped to codecs.
2//!
3//! Git's `ISO-8859-1` is strict Latin-1; `encoding_rs` maps that label to Windows-1252, so we
4//! handle Latin-1 separately.
5
6use encoding_rs::Encoding;
7
/// Returns `true` when `label` (ignoring surrounding whitespace and ASCII case)
/// is one of the spellings this module treats as strict Latin-1.
fn is_iso_8859_1(label: &str) -> bool {
    const ALIASES: [&str; 4] = ["iso-8859-1", "iso8859-1", "latin1", "latin-1"];
    let name = label.trim();
    ALIASES
        .iter()
        .any(|alias| name.eq_ignore_ascii_case(alias))
}
14
/// Decodes strict Latin-1: every byte maps to the Unicode scalar with the same value.
fn decode_latin1(bytes: &[u8]) -> String {
    bytes.iter().copied().map(char::from).collect()
}
22
/// Encodes `unicode` as Latin-1, substituting `?` for every character above U+00FF.
fn encode_latin1_lossy(unicode: &str) -> Vec<u8> {
    let mut latin1 = Vec::with_capacity(unicode.len());
    for ch in unicode.chars() {
        // Only scalars 0..=0xFF fit in one Latin-1 byte; anything else is replaced.
        latin1.push(u8::try_from(u32::from(ch)).unwrap_or(b'?'));
    }
    latin1
}
36
/// Locate the first byte sequence that is not strictly valid UTF-8, mirroring
/// Git's `find_invalid_utf8` (commit.c).
///
/// The check is stricter than [`core::str::from_utf8`]: besides malformed or
/// overlong sequences and surrogates, it rejects the Unicode non-characters
/// `U+xxFFFE`, `U+xxFFFF`, and the range `U+FDD0..=U+FDEF`, all of which Rust's
/// standard library accepts.
///
/// Returns the offset of the lead byte of the offending sequence, or `None` when
/// the whole buffer is valid.
#[must_use]
pub fn find_invalid_utf8(buf: &[u8]) -> Option<usize> {
    // Largest code point representable by a sequence of 1, 2, 3 and 4 bytes.
    const SEQUENCE_LIMIT: [u32; 4] = [0x7f, 0x7ff, 0xffff, 0x10ffff];

    let mut pos = 0usize;
    while let Some(&lead) = buf.get(pos) {
        // Plain US-ASCII needs no further inspection.
        if lead.is_ascii() {
            pos += 1;
            continue;
        }

        // The number of leading one bits, minus one, is the number of
        // continuation bytes the lead byte announces.
        let extra = lead.leading_ones() as usize - 1;
        // A bare continuation byte (0 extra) or a sequence longer than 4 bytes is invalid.
        if extra == 0 || extra > 3 {
            return Some(pos);
        }

        // The announced continuation bytes must all be present...
        let Some(tail) = buf.get(pos + 1..pos + 1 + extra) else {
            return Some(pos);
        };
        // ...and each must look like `10xxxxxx`.
        if tail.iter().any(|&b| b & 0xc0 != 0x80) {
            return Some(pos);
        }

        // Assemble the scalar: payload bits of the lead byte, then 6 bits per trailer.
        let scalar = tail
            .iter()
            .fold(u32::from(lead) & (0x7f >> extra), |acc, &b| {
                (acc << 6) | u32::from(b & 0x3f)
            });

        // Overlong encodings and values beyond U+10FFFF fall outside this window.
        let shortest_form = (SEQUENCE_LIMIT[extra - 1] + 1..=SEQUENCE_LIMIT[extra]).contains(&scalar);
        // UTF-16 surrogates (U+D800..=U+DFFF) have no legal UTF-8 encoding.
        let surrogate = scalar & 0x1f_f800 == 0xd800;
        // The last two code points of every plane are permanent non-characters...
        let plane_end = scalar & 0xfffe == 0xfffe;
        // ...as is the block U+FDD0..=U+FDEF.
        let fdd0_block = (0xfdd0..=0xfdef).contains(&scalar);

        if !shortest_form || surrogate || plane_end || fdd0_block {
            return Some(pos);
        }
        pos += 1 + extra;
    }
    None
}
104
105/// Whether `buf` is strictly valid UTF-8 per Git's rules (see [`find_invalid_utf8`]).
106#[must_use]
107pub fn is_strict_utf8(buf: &[u8]) -> bool {
108    find_invalid_utf8(buf).is_none()
109}
110
/// Appends `\n` to a non-empty message body that does not already end with one.
///
/// Git stores the commit message body with a trailing newline when non-empty;
/// an empty body is returned unchanged.
#[must_use]
pub fn ensure_body_trailing_newline(mut bytes: Vec<u8>) -> Vec<u8> {
    match bytes.last().copied() {
        // Empty, or already newline-terminated: nothing to add.
        None | Some(b'\n') => {}
        Some(_) => bytes.push(b'\n'),
    }
    bytes
}
119
120/// Whether `label` names an encoding Git can decode (ISO-8859-1 or any encoding
121/// resolvable via [`resolve`]). Unknown names (e.g. the test's `non-utf-8`) return
122/// false, matching Git's `logmsg_reencode` no-op fallback.
123pub fn is_known_encoding(label: &str) -> bool {
124    is_iso_8859_1(label) || resolve(label).is_some()
125}
126
127/// Resolve an encoding label the way Git uses it in config and commit objects.
128///
129/// Git accepts names like `eucJP` that [`Encoding::for_label`] does not recognize.
130/// ISO-8859-1 is handled separately as strict Latin-1 and returns `None`.
131#[must_use]
132pub fn resolve(label: &str) -> Option<&'static Encoding> {
133    let t = label.trim();
134    if t.is_empty() || is_iso_8859_1(t) {
135        return None;
136    }
137    let normalized = t.replace('_', "-");
138    let lower = normalized.to_ascii_lowercase();
139    let mapped = match lower.as_str() {
140        "eucjp" => "euc-jp",
141        "cp932" | "mskanji" | "sjis" => "shift_jis",
142        _ => normalized.as_str(),
143    };
144    Encoding::for_label(mapped.as_bytes()).or_else(|| Encoding::for_label(t.as_bytes()))
145}
146
147/// Encode `unicode` for storage in a commit message body using Git's encoding name.
148#[must_use]
149pub fn encode_unicode(label: &str, unicode: &str) -> Option<Vec<u8>> {
150    let t = label.trim();
151    let raw = if is_iso_8859_1(t) {
152        encode_latin1_lossy(unicode)
153    } else {
154        let enc = resolve(t)?;
155        let (cow, _, _) = enc.encode(unicode);
156        cow.into_owned()
157    };
158    Some(ensure_body_trailing_newline(raw))
159}
160
161/// Encode a single header field (author/committer line) without adding a trailing newline.
162#[must_use]
163pub fn encode_header_text(label: &str, unicode: &str) -> Option<Vec<u8>> {
164    let t = label.trim();
165    if is_iso_8859_1(t) {
166        return Some(encode_latin1_lossy(unicode));
167    }
168    let enc = resolve(t)?;
169    let (cow, _, _) = enc.encode(unicode);
170    Some(cow.into_owned())
171}
172
173/// Decode `bytes` using Git's encoding name, or lossy UTF-8 if unknown.
174#[must_use]
175pub fn decode_bytes(label: Option<&str>, bytes: &[u8]) -> String {
176    if let Some(l) = label {
177        if is_iso_8859_1(l) {
178            return decode_latin1(bytes);
179        }
180        if let Some(enc) = resolve(l) {
181            let (cow, _) = enc.decode_without_bom_handling(bytes);
182            return cow.into_owned();
183        }
184    }
185    String::from_utf8_lossy(bytes).into_owned()
186}
187
188/// Re-encode `unicode` from UTF-8 into `output_label`, or `None` if unsupported.
189#[must_use]
190pub fn reencode_utf8_to_label(output_label: &str, unicode: &str) -> Option<Vec<u8>> {
191    encode_header_text(output_label, unicode)
192}
193
194/// Prepare a commit message for storage per `i18n.commitEncoding` (or equivalent).
195///
196/// When the configured encoding is not UTF-8, returns [`Some`] raw bytes for the body
197/// and sets `encoding` in the commit object; otherwise UTF-8 is stored without an
198/// `encoding` header.
199#[must_use]
200pub fn finalize_stored_commit_message(
201    message: String,
202    commit_encoding: Option<&str>,
203) -> (String, Option<String>, Option<Vec<u8>>) {
204    let is_utf8 = match commit_encoding {
205        None => true,
206        Some(e) => e.eq_ignore_ascii_case("utf-8") || e.eq_ignore_ascii_case("utf8"),
207    };
208    if is_utf8 {
209        return (message, None, None);
210    }
211    let Some(label) = commit_encoding.filter(|s| !s.trim().is_empty()) else {
212        return (message, None, None);
213    };
214    let Some(raw) = encode_unicode(label, &message) else {
215        return (message, None, None);
216    };
217    (message, Some(label.to_owned()), Some(raw))
218}
219
220/// Decode `=?charset?q?...?=` encoded-words in an email display name (before `<`).
221///
222/// Used when applying patches: `git format-patch` emits RFC 2047 in `From:`; the stored
223/// commit author should be the decoded Unicode form.
224#[must_use]
225pub fn decode_rfc2047_mailbox_from_line(from: &str) -> String {
226    let from = from.trim();
227    let Some(lt) = from.find('<') else {
228        return decode_rfc2047_encoded_words(from);
229    };
230    let name = from[..lt].trim();
231    let tail = &from[lt..];
232    let decoded = decode_rfc2047_encoded_words(name);
233    if decoded.is_empty() {
234        tail.trim_start().to_string()
235    } else {
236        format!("{decoded} {tail}")
237    }
238}
239
240fn decode_rfc2047_encoded_words(s: &str) -> String {
241    let mut out = String::new();
242    let mut rest = s;
243    while let Some(start) = rest.find("=?") {
244        out.push_str(&rest[..start]);
245        rest = &rest[start + 2..];
246        let Some(d1) = rest.find('?') else {
247            out.push_str("=?");
248            out.push_str(rest);
249            return out;
250        };
251        let charset = &rest[..d1];
252        let after_cs = &rest[d1 + 1..];
253        let Some(d2) = after_cs.find('?') else {
254            out.push_str("=?");
255            out.push_str(rest);
256            return out;
257        };
258        let encoding = after_cs[..d2].to_ascii_lowercase();
259        let after_enc = &after_cs[d2 + 1..];
260        let Some(end) = after_enc.find("?=") else {
261            out.push_str("=?");
262            out.push_str(rest);
263            return out;
264        };
265        let payload = &after_enc[..end];
266        rest = &after_enc[end + 2..];
267        if encoding == "q" {
268            let bytes = decode_quoted_printable_soft(payload);
269            out.push_str(&decode_bytes(Some(charset), &bytes));
270        } else if encoding == "b" {
271            if let Some(bytes) = base64_decode_rfc2047(payload) {
272                out.push_str(&decode_bytes(Some(charset), &bytes));
273            }
274        }
275    }
276    out.push_str(rest);
277    out
278}
279
/// Leniently decode an RFC 2047 "Q" payload into raw bytes.
///
/// * `_` becomes a space.
/// * `=XY` with two hexadecimal digits becomes the byte `0xXY`.
/// * Any other `=` (followed by non-hex characters, or too close to the end of the
///   payload) is emitted literally, and the characters after it are processed
///   normally instead of being discarded.
/// * Every other byte is copied through unchanged.
fn decode_quoted_printable_soft(payload: &str) -> Vec<u8> {
    let src = payload.as_bytes();
    let mut out = Vec::with_capacity(src.len());
    let mut i = 0usize;
    while i < src.len() {
        match src[i] {
            b'_' => out.push(b' '),
            b'=' => {
                // Look ahead without consuming, so a malformed escape loses nothing.
                let escaped = match (src.get(i + 1), src.get(i + 2)) {
                    (Some(&hi), Some(&lo)) => hex_nibble(hi).zip(hex_nibble(lo)),
                    _ => None,
                };
                match escaped {
                    Some((hi, lo)) => {
                        out.push((hi << 4) | lo);
                        // Skip the two hex digits that were just consumed.
                        i += 2;
                    }
                    None => out.push(b'='),
                }
            }
            other => out.push(other),
        }
        i += 1;
    }
    out
}

/// Value (0..=15) of an ASCII hexadecimal digit, accepting either letter case;
/// `None` for any other byte.
fn hex_nibble(b: u8) -> Option<u8> {
    match b {
        b'0'..=b'9' => Some(b - b'0'),
        b'a'..=b'f' => Some(b - b'a' + 10),
        b'A'..=b'F' => Some(b - b'A' + 10),
        _ => None,
    }
}
311
/// Decode a standard-alphabet base64 payload (`A-Z a-z 0-9 + /`).
///
/// Decoding stops at the first `=` (padding), ASCII whitespace is skipped, and any
/// other character outside the alphabet makes the whole payload invalid (`None`).
/// Leftover bits that do not complete a byte are discarded.
fn base64_decode_rfc2047(input: &str) -> Option<Vec<u8>> {
    let mut decoded = Vec::new();
    let mut acc: u32 = 0;
    // Number of not-yet-emitted bits currently held in `acc`.
    let mut pending: u32 = 0;

    for symbol in input.bytes().take_while(|&b| b != b'=') {
        let sextet = match symbol {
            b'A'..=b'Z' => symbol - b'A',
            b'a'..=b'z' => symbol - b'a' + 26,
            b'0'..=b'9' => symbol - b'0' + 52,
            b'+' => 62,
            b'/' => 63,
            ws if ws.is_ascii_whitespace() => continue,
            _ => return None,
        };
        acc = (acc << 6) | u32::from(sextet);
        pending += 6;
        if pending >= 8 {
            pending -= 8;
            decoded.push((acc >> pending) as u8);
            // Keep only the bits that have not been emitted yet.
            acc &= (1 << pending) - 1;
        }
    }
    Some(decoded)
}
335
336/// Raw `author` / `committer` header payloads for a new commit object.
337///
338/// When `encoding` is unset or UTF-8, returns empty vectors so
339/// [`crate::objects::serialize_commit`] writes the Unicode [`String`] fields as UTF-8.
340/// When `encoding` is non-UTF-8, encodes the full identity lines (name, email, timestamp)
341/// for storage in that charset.
342#[must_use]
343pub fn identity_raw_for_serialized_commit(
344    encoding: &Option<String>,
345    author: &str,
346    committer: &str,
347) -> (Vec<u8>, Vec<u8>) {
348    let is_utf8 = match encoding.as_deref() {
349        None => true,
350        Some(e) => e.eq_ignore_ascii_case("utf-8") || e.eq_ignore_ascii_case("utf8"),
351    };
352    if is_utf8 {
353        return (Vec::new(), Vec::new());
354    }
355    let Some(label) = encoding.as_deref() else {
356        return (Vec::new(), Vec::new());
357    };
358    let author_raw = encode_header_text(label, author).unwrap_or_default();
359    let committer_raw = encode_header_text(label, committer).unwrap_or_default();
360    (author_raw, committer_raw)
361}
362
363/// Unicode commit message body for display (for example, `format-patch`).
364///
365/// Uses `raw_message` when set; otherwise returns `message`.
366#[must_use]
367pub fn commit_message_unicode_for_display(
368    encoding: Option<&str>,
369    message: &str,
370    raw_message: Option<&[u8]>,
371) -> String {
372    if let Some(raw) = raw_message {
373        decode_bytes(encoding, raw)
374    } else {
375        message.to_owned()
376    }
377}
378
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn strict_utf8_accepts_plain_ascii_and_multibyte() {
        let accepted: [&[u8]; 3] = [
            b"Commit message\n",
            // Valid multi-byte UTF-8 (accented Latin letters, CJK).
            "Ábçdèfg はれひほふ".as_bytes(),
            // ISO-2022-JP is a 7-bit encoding using ESC control bytes; valid UTF-8.
            b"\x1b$B$O$l$R$[$U\x1b(B",
        ];
        for sample in &accepted {
            assert!(is_strict_utf8(sample));
        }
    }

    #[test]
    fn strict_utf8_rejects_surrogates() {
        // ED A0 80 is the (illegal) UTF-8 form of the surrogate U+D800.
        let surrogate = b"\xed\xa0\x80";
        assert!(!is_strict_utf8(surrogate));
        // The reported offset is where the offending sequence starts.
        assert_eq!(find_invalid_utf8(b"abc\xed\xa0\x80"), Some(3));
    }

    #[test]
    fn strict_utf8_rejects_overlong_sequences() {
        // Overlong encoding of U+0029 and the C0 A0 "fake space".
        let overlong: [&[u8]; 2] = [b"\xe0\x82\xa9", b"\xc0\xa0"];
        for sample in &overlong {
            assert!(!is_strict_utf8(sample));
        }
    }

    #[test]
    fn strict_utf8_rejects_noncharacters_rust_would_accept() {
        let noncharacters: [&[u8]; 2] = [
            // U+10FFFE non-character: F4 8F BF BE.
            b"\xf4\x8f\xbf\xbe",
            // U+FDD0 (in the U+FDD0..=U+FDEF non-character block): EF B7 90.
            b"\xef\xb7\x90",
        ];
        for sample in &noncharacters {
            assert!(core::str::from_utf8(sample).is_ok());
            assert!(!is_strict_utf8(sample));
        }
    }

    #[test]
    fn latin1_round_trips_through_encode_and_decode() {
        let unicode = "Áéí óú";
        let expected_bytes: Vec<u8> = vec![0xC1, 0xE9, 0xED, 0x20, 0xF3, 0xFA];

        let encoded = encode_header_text("ISO8859-1", unicode).expect("latin1 encodes");
        assert_eq!(encoded, expected_bytes);

        let decoded = decode_bytes(Some("ISO8859-1"), &encoded);
        assert_eq!(decoded, unicode);
    }
}
423}