Skip to main content

grit_lib/
commit_encoding.rs

1//! Git commit encoding labels (`encoding` header, `i18n.commitEncoding`) mapped to codecs.
2//!
3//! Git's `ISO-8859-1` is strict Latin-1; `encoding_rs` maps that label to Windows-1252, so we
4//! handle Latin-1 separately.
5
6use encoding_rs::Encoding;
7
/// Returns `true` when `label` (ignoring surrounding whitespace and ASCII case)
/// is one of the spellings this module treats as strict Latin-1.
fn is_iso_8859_1(label: &str) -> bool {
    const LATIN1_ALIASES: [&str; 4] = ["iso-8859-1", "iso8859-1", "latin1", "latin-1"];
    let name = label.trim();
    LATIN1_ALIASES
        .iter()
        .any(|alias| name.eq_ignore_ascii_case(alias))
}
14
/// Decodes strict Latin-1: every byte maps to the Unicode scalar with the same
/// numeric value (`0x00..=0xFF`), so decoding can never fail.
fn decode_latin1(bytes: &[u8]) -> String {
    bytes.iter().copied().map(char::from).collect()
}
22
/// Encodes `unicode` as Latin-1, one byte per character.
///
/// Characters above `U+00FF` have no Latin-1 representation and are replaced
/// with an ASCII `?`.
fn encode_latin1_lossy(unicode: &str) -> Vec<u8> {
    let mut encoded = Vec::with_capacity(unicode.len());
    for ch in unicode.chars() {
        let byte = match u32::from(ch) {
            // The range pattern guarantees the value fits in a byte.
            scalar @ 0..=0xFF => scalar as u8,
            _ => b'?',
        };
        encoded.push(byte);
    }
    encoded
}
36
/// Locate the start of the first sequence in `buf` that is not strictly valid
/// UTF-8, following the algorithm of Git's `find_invalid_utf8` (commit.c).
///
/// "Strict" goes beyond [`core::str::from_utf8`]: besides malformed, truncated,
/// overlong and surrogate encodings, the Unicode non-characters `U+xxFFFE`,
/// `U+xxFFFF` and `U+FDD0..=U+FDEF` are rejected as well.
///
/// Returns the offset of the lead byte of the offending sequence, or `None`
/// when every sequence in the buffer passes.
#[must_use]
pub fn find_invalid_utf8(buf: &[u8]) -> Option<usize> {
    // Largest code point expressible with 1, 2, 3 and 4 encoded bytes.
    const UPPER_BOUNDS: [u32; 4] = [0x7f, 0x7ff, 0xffff, 0x10ffff];

    let mut pos = 0usize;
    while let Some(&lead) = buf.get(pos) {
        let start = pos;
        pos += 1;

        // Plain US-ASCII needs no further inspection.
        if lead < 0x80 {
            continue;
        }

        // The run of set bits beginning at bit 6 tells how many continuation
        // bytes must follow the lead byte.
        let extra = (lead << 1).leading_ones() as usize;

        // Only 1 to 3 continuation bytes are legal (this also rejects a stray
        // continuation byte in lead position, for which the count is zero).
        if extra == 0 || extra > 3 {
            return Some(start);
        }

        // The whole tail must be present in the buffer.
        let Some(tail) = buf.get(pos..pos + extra) else {
            return Some(start);
        };

        // Payload bits of the lead byte; the bit directly below the run of
        // ones is known to be clear, so this mask keeps exactly the payload.
        let mut scalar = u32::from(lead) & (0x7f >> extra);
        for &cont in tail {
            // Every tail byte has to look like `10xxxxxx`.
            if cont & 0xc0 != 0x80 {
                return Some(start);
            }
            scalar = (scalar << 6) | u32::from(cont & 0x3f);
        }
        pos += extra;

        // Overlong encodings fall below the range for their length; values
        // past U+10FFFF fall above it.
        let lowest = UPPER_BOUNDS[extra - 1] + 1;
        let highest = UPPER_BOUNDS[extra];
        let out_of_range = !(lowest..=highest).contains(&scalar);
        // Surrogates exist only for UTF-16.
        let surrogate = scalar & 0x1f_f800 == 0xd800;
        // U+xxFFFE / U+xxFFFF and U+FDD0..=U+FDEF are non-characters.
        let non_character =
            scalar & 0xfffe == 0xfffe || (0xfdd0..=0xfdef).contains(&scalar);

        if out_of_range || surrogate || non_character {
            return Some(start);
        }
    }
    None
}
102
103/// Whether `buf` is strictly valid UTF-8 per Git's rules (see [`find_invalid_utf8`]).
104#[must_use]
105pub fn is_strict_utf8(buf: &[u8]) -> bool {
106    find_invalid_utf8(buf).is_none()
107}
108
/// Appends `\n` to a non-empty message body that does not already end with one.
///
/// Empty bodies and bodies already terminated by a newline are returned
/// unchanged.
#[must_use]
pub fn ensure_body_trailing_newline(mut bytes: Vec<u8>) -> Vec<u8> {
    match bytes.last().copied() {
        // Nothing to terminate, or already terminated.
        None | Some(b'\n') => {}
        Some(_) => bytes.push(b'\n'),
    }
    bytes
}
117
118/// Whether `label` names an encoding Git can decode (ISO-8859-1 or any encoding
119/// resolvable via [`resolve`]). Unknown names (e.g. the test's `non-utf-8`) return
120/// false, matching Git's `logmsg_reencode` no-op fallback.
121pub fn is_known_encoding(label: &str) -> bool {
122    is_iso_8859_1(label) || resolve(label).is_some()
123}
124
125/// Resolve an encoding label the way Git uses it in config and commit objects.
126///
127/// Git accepts names like `eucJP` that [`Encoding::for_label`] does not recognize.
128/// ISO-8859-1 is handled separately as strict Latin-1 and returns `None`.
129#[must_use]
130pub fn resolve(label: &str) -> Option<&'static Encoding> {
131    let t = label.trim();
132    if t.is_empty() || is_iso_8859_1(t) {
133        return None;
134    }
135    let normalized = t.replace('_', "-");
136    let lower = normalized.to_ascii_lowercase();
137    let mapped = match lower.as_str() {
138        "eucjp" => "euc-jp",
139        "cp932" | "mskanji" | "sjis" => "shift_jis",
140        _ => normalized.as_str(),
141    };
142    Encoding::for_label(mapped.as_bytes()).or_else(|| Encoding::for_label(t.as_bytes()))
143}
144
145/// Encode `unicode` for storage in a commit message body using Git's encoding name.
146#[must_use]
147pub fn encode_unicode(label: &str, unicode: &str) -> Option<Vec<u8>> {
148    let t = label.trim();
149    let raw = if is_iso_8859_1(t) {
150        encode_latin1_lossy(unicode)
151    } else {
152        let enc = resolve(t)?;
153        let (cow, _, _) = enc.encode(unicode);
154        cow.into_owned()
155    };
156    Some(ensure_body_trailing_newline(raw))
157}
158
159/// Encode a single header field (author/committer line) without adding a trailing newline.
160#[must_use]
161pub fn encode_header_text(label: &str, unicode: &str) -> Option<Vec<u8>> {
162    let t = label.trim();
163    if is_iso_8859_1(t) {
164        return Some(encode_latin1_lossy(unicode));
165    }
166    let enc = resolve(t)?;
167    let (cow, _, _) = enc.encode(unicode);
168    Some(cow.into_owned())
169}
170
171/// Decode `bytes` using Git's encoding name, or lossy UTF-8 if unknown.
172#[must_use]
173pub fn decode_bytes(label: Option<&str>, bytes: &[u8]) -> String {
174    if let Some(l) = label {
175        if is_iso_8859_1(l) {
176            return decode_latin1(bytes);
177        }
178        if let Some(enc) = resolve(l) {
179            let (cow, _) = enc.decode_without_bom_handling(bytes);
180            return cow.into_owned();
181        }
182    }
183    String::from_utf8_lossy(bytes).into_owned()
184}
185
186/// Re-encode `unicode` from UTF-8 into `output_label`, or `None` if unsupported.
187#[must_use]
188pub fn reencode_utf8_to_label(output_label: &str, unicode: &str) -> Option<Vec<u8>> {
189    encode_header_text(output_label, unicode)
190}
191
192/// Prepare a commit message for storage per `i18n.commitEncoding` (or equivalent).
193///
194/// When the configured encoding is not UTF-8, returns [`Some`] raw bytes for the body
195/// and sets `encoding` in the commit object; otherwise UTF-8 is stored without an
196/// `encoding` header.
197#[must_use]
198pub fn finalize_stored_commit_message(
199    message: String,
200    commit_encoding: Option<&str>,
201) -> (String, Option<String>, Option<Vec<u8>>) {
202    let is_utf8 = match commit_encoding {
203        None => true,
204        Some(e) => e.eq_ignore_ascii_case("utf-8") || e.eq_ignore_ascii_case("utf8"),
205    };
206    if is_utf8 {
207        return (message, None, None);
208    }
209    let Some(label) = commit_encoding.filter(|s| !s.trim().is_empty()) else {
210        return (message, None, None);
211    };
212    let Some(raw) = encode_unicode(label, &message) else {
213        return (message, None, None);
214    };
215    (message, Some(label.to_owned()), Some(raw))
216}
217
218/// Decode `=?charset?q?...?=` encoded-words in an email display name (before `<`).
219///
220/// Used when applying patches: `git format-patch` emits RFC 2047 in `From:`; the stored
221/// commit author should be the decoded Unicode form.
222#[must_use]
223pub fn decode_rfc2047_mailbox_from_line(from: &str) -> String {
224    let from = from.trim();
225    let Some(lt) = from.find('<') else {
226        return decode_rfc2047_encoded_words(from);
227    };
228    let name = from[..lt].trim();
229    let tail = &from[lt..];
230    let decoded = decode_rfc2047_encoded_words(name);
231    if decoded.is_empty() {
232        tail.trim_start().to_string()
233    } else {
234        format!("{decoded} {tail}")
235    }
236}
237
238fn decode_rfc2047_encoded_words(s: &str) -> String {
239    let mut out = String::new();
240    let mut rest = s;
241    while let Some(start) = rest.find("=?") {
242        out.push_str(&rest[..start]);
243        rest = &rest[start + 2..];
244        let Some(d1) = rest.find('?') else {
245            out.push_str("=?");
246            out.push_str(rest);
247            return out;
248        };
249        let charset = &rest[..d1];
250        let after_cs = &rest[d1 + 1..];
251        let Some(d2) = after_cs.find('?') else {
252            out.push_str("=?");
253            out.push_str(rest);
254            return out;
255        };
256        let encoding = after_cs[..d2].to_ascii_lowercase();
257        let after_enc = &after_cs[d2 + 1..];
258        let Some(end) = after_enc.find("?=") else {
259            out.push_str("=?");
260            out.push_str(rest);
261            return out;
262        };
263        let payload = &after_enc[..end];
264        rest = &after_enc[end + 2..];
265        if encoding == "q" {
266            let bytes = decode_quoted_printable_soft(payload);
267            out.push_str(&decode_bytes(Some(charset), &bytes));
268        } else if encoding == "b" {
269            if let Some(bytes) = base64_decode_rfc2047(payload) {
270                out.push_str(&decode_bytes(Some(charset), &bytes));
271            }
272        }
273    }
274    out.push_str(rest);
275    out
276}
277
/// Decode the payload of an RFC 2047 "Q"-encoded word into raw bytes.
///
/// * `_` becomes a space.
/// * `=XY`, with `X` and `Y` hexadecimal digits of either case, becomes the
///   byte `0xXY`.
/// * Every other byte is copied through.
///
/// Decoding is tolerant: an `=` that is not followed by two hex digits is kept
/// as a literal `=`, and the bytes after it are then decoded normally instead
/// of being discarded.
fn decode_quoted_printable_soft(payload: &str) -> Vec<u8> {
    let src = payload.as_bytes();
    let mut out = Vec::with_capacity(src.len());
    let mut i = 0usize;
    while i < src.len() {
        match src[i] {
            b'_' => out.push(b' '),
            b'=' => {
                // Look ahead without consuming, so a malformed escape loses
                // nothing but the meaning of the `=` itself.
                let hi = src.get(i + 1).copied().and_then(hex_nibble);
                let lo = src.get(i + 2).copied().and_then(hex_nibble);
                if let (Some(hi), Some(lo)) = (hi, lo) {
                    out.push((hi << 4) | lo);
                    i += 3;
                    continue;
                }
                out.push(b'=');
            }
            other => out.push(other),
        }
        i += 1;
    }
    out
}

/// Value (`0..=15`) of an ASCII hexadecimal digit, or `None` for any other byte.
fn hex_nibble(b: u8) -> Option<u8> {
    match b {
        b'0'..=b'9' => Some(b - b'0'),
        b'a'..=b'f' => Some(b - b'a' + 10),
        b'A'..=b'F' => Some(b - b'A' + 10),
        _ => None,
    }
}
309
/// Decode standard-alphabet base64 (`A-Z a-z 0-9 + /`) as used by RFC 2047 "B"
/// encoded-words.
///
/// ASCII whitespace is skipped and decoding stops at the first `=`. Leftover
/// bits that do not fill a whole byte are discarded. Any other character
/// outside the alphabet makes the whole input invalid and yields `None`.
fn base64_decode_rfc2047(input: &str) -> Option<Vec<u8>> {
    let mut decoded = Vec::new();
    // Bit accumulator and the number of not-yet-emitted bits it holds.
    let mut acc: u32 = 0;
    let mut pending: u32 = 0;

    for symbol in input.bytes() {
        let sextet = match symbol {
            b'=' => break,
            ws if ws.is_ascii_whitespace() => continue,
            b'A'..=b'Z' => symbol - b'A',
            b'a'..=b'z' => symbol - b'a' + 26,
            b'0'..=b'9' => symbol - b'0' + 52,
            b'+' => 62,
            b'/' => 63,
            _ => return None,
        };

        acc = (acc << 6) | u32::from(sextet);
        pending += 6;
        if pending >= 8 {
            pending -= 8;
            decoded.push((acc >> pending) as u8);
            // Keep only the bits that have not been emitted yet.
            acc &= (1 << pending) - 1;
        }
    }
    Some(decoded)
}
333
334/// Raw `author` / `committer` header payloads for a new commit object.
335///
336/// When `encoding` is unset or UTF-8, returns empty vectors so
337/// [`crate::objects::serialize_commit`] writes the Unicode [`String`] fields as UTF-8.
338/// When `encoding` is non-UTF-8, encodes the full identity lines (name, email, timestamp)
339/// for storage in that charset.
340#[must_use]
341pub fn identity_raw_for_serialized_commit(
342    encoding: &Option<String>,
343    author: &str,
344    committer: &str,
345) -> (Vec<u8>, Vec<u8>) {
346    let is_utf8 = match encoding.as_deref() {
347        None => true,
348        Some(e) => e.eq_ignore_ascii_case("utf-8") || e.eq_ignore_ascii_case("utf8"),
349    };
350    if is_utf8 {
351        return (Vec::new(), Vec::new());
352    }
353    let Some(label) = encoding.as_deref() else {
354        return (Vec::new(), Vec::new());
355    };
356    let author_raw = encode_header_text(label, author).unwrap_or_default();
357    let committer_raw = encode_header_text(label, committer).unwrap_or_default();
358    (author_raw, committer_raw)
359}
360
361/// Unicode commit message body for display (for example, `format-patch`).
362///
363/// Uses `raw_message` when set; otherwise returns `message`.
364#[must_use]
365pub fn commit_message_unicode_for_display(
366    encoding: Option<&str>,
367    message: &str,
368    raw_message: Option<&[u8]>,
369) -> String {
370    if let Some(raw) = raw_message {
371        decode_bytes(encoding, raw)
372    } else {
373        message.to_owned()
374    }
375}
376
#[cfg(test)]
mod tests {
    use super::*;

    /// Inputs that the strict validator must accept.
    #[test]
    fn strict_utf8_accepts_plain_ascii_and_multibyte() {
        let accepted: [&[u8]; 3] = [
            b"Commit message\n",
            // Valid multi-byte UTF-8 (accented Latin letters, CJK).
            "Ábçdèfg はれひほふ".as_bytes(),
            // ISO-2022-JP is 7-bit (ESC control bytes only), hence valid UTF-8.
            b"\x1b$B$O$l$R$[$U\x1b(B",
        ];
        for &sample in &accepted {
            assert!(is_strict_utf8(sample));
        }
    }

    /// An encoded surrogate (U+D800 as ED A0 80) is invalid in UTF-8.
    #[test]
    fn strict_utf8_rejects_surrogates() {
        let surrogate: &[u8] = b"\xed\xa0\x80";
        assert!(!is_strict_utf8(surrogate));
        // The reported offset points at the lead byte of the bad sequence.
        assert_eq!(find_invalid_utf8(b"abc\xed\xa0\x80"), Some(3));
    }

    /// Overlong forms: a three-byte encoding of U+0029 and the C0 A0 "fake space".
    #[test]
    fn strict_utf8_rejects_overlong_sequences() {
        let overlong: [&[u8]; 2] = [b"\xe0\x82\xa9", b"\xc0\xa0"];
        for &sample in &overlong {
            assert!(!is_strict_utf8(sample));
        }
    }

    /// Non-characters pass Rust's own validation but not the strict check.
    #[test]
    fn strict_utf8_rejects_noncharacters_rust_would_accept() {
        let noncharacters: [&[u8]; 2] = [
            // U+10FFFE: F4 8F BF BE.
            b"\xf4\x8f\xbf\xbe",
            // U+FDD0 (inside the U+FDD0..=U+FDEF block): EF B7 90.
            b"\xef\xb7\x90",
        ];
        for &sample in &noncharacters {
            assert!(core::str::from_utf8(sample).is_ok());
            assert!(!is_strict_utf8(sample));
        }
    }

    /// Latin-1 text survives an encode/decode round trip unchanged.
    #[test]
    fn latin1_round_trips_through_encode_and_decode() {
        let label = "ISO8859-1";
        let unicode = "Áéí óú";
        let encoded = encode_header_text(label, unicode).expect("latin1 encodes");
        assert_eq!(encoded, vec![0xC1, 0xE9, 0xED, 0x20, 0xF3, 0xFA]);
        assert_eq!(decode_bytes(Some(label), &encoded), unicode);
    }
}
421}