grit_lib/
commit_encoding.rs1use encoding_rs::Encoding;
7
/// Reports whether `label` names ISO-8859-1 (Latin-1).
///
/// Surrounding whitespace is ignored and the comparison is ASCII
/// case-insensitive. Only the four spellings listed below are recognised.
fn is_iso_8859_1(label: &str) -> bool {
    const ALIASES: [&str; 4] = ["iso-8859-1", "iso8859-1", "latin1", "latin-1"];
    let candidate = label.trim();
    ALIASES
        .iter()
        .any(|alias| candidate.eq_ignore_ascii_case(alias))
}
14
/// Decodes Latin-1 bytes into a `String`.
///
/// Every byte value `0x00..=0xFF` maps to the Unicode scalar value with the
/// same number, so the conversion cannot fail.
fn decode_latin1(bytes: &[u8]) -> String {
    bytes.iter().map(|&byte| char::from(byte)).collect()
}
22
/// Encodes `unicode` as Latin-1, one output byte per input character.
///
/// Characters above U+00FF have no Latin-1 representation and are replaced
/// with `b'?'`.
fn encode_latin1_lossy(unicode: &str) -> Vec<u8> {
    unicode
        .chars()
        .map(|ch| u8::try_from(u32::from(ch)).unwrap_or(b'?'))
        .collect()
}
36
37#[must_use]
/// Returns the byte offset of the first sequence in `buf` that is not
/// acceptable as strict UTF-8, or `None` when the whole buffer passes.
///
/// The reported offset is that of the sequence's lead byte. Beyond
/// well-formedness (lead byte, continuation bytes, no truncation, shortest
/// form, at most U+10FFFF) this also rejects:
///
/// * UTF-16 surrogates (U+D800..=U+DFFF),
/// * code points whose low 16 bits are `0xFFFE` or `0xFFFF`,
/// * U+FDD0..=U+FDEF.
pub fn find_invalid_utf8(buf: &[u8]) -> Option<usize> {
    // Largest code point expressible with 1, 2, 3 and 4 encoded bytes.
    const MAX_CODEPOINT: [u32; 4] = [0x7f, 0x7ff, 0xffff, 0x10ffff];

    let mut pos = 0usize;
    while let Some(&lead) = buf.get(pos) {
        let start = pos;
        pos += 1;
        if lead.is_ascii() {
            continue;
        }

        // The run of set high bits in the lead byte gives the sequence
        // length: `110x_xxxx` -> 1 continuation byte, `1110_xxxx` -> 2,
        // `1111_0xxx` -> 3. A bare continuation byte (`10xx_xxxx`) or a
        // longer run is never valid as a lead byte.
        let extra: usize = match lead.leading_ones() {
            2 => 1,
            3 => 2,
            4 => 3,
            _ => return Some(start),
        };

        // The sequence must not be cut off by the end of the buffer.
        let Some(continuation) = buf.get(pos..pos + extra) else {
            return Some(start);
        };

        // Payload bits of the lead byte, then six bits per continuation byte.
        let mut codepoint = u32::from(lead) & (0x3f >> extra);
        for &byte in continuation {
            if byte & 0xc0 != 0x80 {
                return Some(start);
            }
            codepoint = (codepoint << 6) | u32::from(byte & 0x3f);
        }
        pos += extra;

        // Overlong encodings fall below the minimum for their length;
        // four-byte sequences may not exceed U+10FFFF.
        let shortest_form = MAX_CODEPOINT[extra - 1] + 1..=MAX_CODEPOINT[extra];
        let is_surrogate = codepoint & 0x1f_f800 == 0xd800;
        let is_plane_end = codepoint & 0xfffe == 0xfffe;
        let is_fdd0_block = (0xfdd0..=0xfdef).contains(&codepoint);
        if !shortest_form.contains(&codepoint) || is_surrogate || is_plane_end || is_fdd0_block {
            return Some(start);
        }
    }
    None
}
102
103#[must_use]
105pub fn is_strict_utf8(buf: &[u8]) -> bool {
106 find_invalid_utf8(buf).is_none()
107}
108
109#[must_use]
/// Appends `\n` to `bytes` unless it is empty or already ends with `\n`.
///
/// The buffer is taken by value and returned, so no copy is made.
pub fn ensure_body_trailing_newline(mut bytes: Vec<u8>) -> Vec<u8> {
    let needs_newline = matches!(bytes.last(), Some(&last) if last != b'\n');
    if needs_newline {
        bytes.push(b'\n');
    }
    bytes
}
117
118pub fn is_known_encoding(label: &str) -> bool {
122 is_iso_8859_1(label) || resolve(label).is_some()
123}
124
125#[must_use]
130pub fn resolve(label: &str) -> Option<&'static Encoding> {
131 let t = label.trim();
132 if t.is_empty() || is_iso_8859_1(t) {
133 return None;
134 }
135 let normalized = t.replace('_', "-");
136 let lower = normalized.to_ascii_lowercase();
137 let mapped = match lower.as_str() {
138 "eucjp" => "euc-jp",
139 "cp932" | "mskanji" | "sjis" => "shift_jis",
140 _ => normalized.as_str(),
141 };
142 Encoding::for_label(mapped.as_bytes()).or_else(|| Encoding::for_label(t.as_bytes()))
143}
144
145#[must_use]
147pub fn encode_unicode(label: &str, unicode: &str) -> Option<Vec<u8>> {
148 let t = label.trim();
149 let raw = if is_iso_8859_1(t) {
150 encode_latin1_lossy(unicode)
151 } else {
152 let enc = resolve(t)?;
153 let (cow, _, _) = enc.encode(unicode);
154 cow.into_owned()
155 };
156 Some(ensure_body_trailing_newline(raw))
157}
158
159#[must_use]
161pub fn encode_header_text(label: &str, unicode: &str) -> Option<Vec<u8>> {
162 let t = label.trim();
163 if is_iso_8859_1(t) {
164 return Some(encode_latin1_lossy(unicode));
165 }
166 let enc = resolve(t)?;
167 let (cow, _, _) = enc.encode(unicode);
168 Some(cow.into_owned())
169}
170
171#[must_use]
173pub fn decode_bytes(label: Option<&str>, bytes: &[u8]) -> String {
174 if let Some(l) = label {
175 if is_iso_8859_1(l) {
176 return decode_latin1(bytes);
177 }
178 if let Some(enc) = resolve(l) {
179 let (cow, _) = enc.decode_without_bom_handling(bytes);
180 return cow.into_owned();
181 }
182 }
183 String::from_utf8_lossy(bytes).into_owned()
184}
185
186#[must_use]
188pub fn reencode_utf8_to_label(output_label: &str, unicode: &str) -> Option<Vec<u8>> {
189 encode_header_text(output_label, unicode)
190}
191
192#[must_use]
198pub fn finalize_stored_commit_message(
199 message: String,
200 commit_encoding: Option<&str>,
201) -> (String, Option<String>, Option<Vec<u8>>) {
202 let is_utf8 = match commit_encoding {
203 None => true,
204 Some(e) => e.eq_ignore_ascii_case("utf-8") || e.eq_ignore_ascii_case("utf8"),
205 };
206 if is_utf8 {
207 return (message, None, None);
208 }
209 let Some(label) = commit_encoding.filter(|s| !s.trim().is_empty()) else {
210 return (message, None, None);
211 };
212 let Some(raw) = encode_unicode(label, &message) else {
213 return (message, None, None);
214 };
215 (message, Some(label.to_owned()), Some(raw))
216}
217
218#[must_use]
223pub fn decode_rfc2047_mailbox_from_line(from: &str) -> String {
224 let from = from.trim();
225 let Some(lt) = from.find('<') else {
226 return decode_rfc2047_encoded_words(from);
227 };
228 let name = from[..lt].trim();
229 let tail = &from[lt..];
230 let decoded = decode_rfc2047_encoded_words(name);
231 if decoded.is_empty() {
232 tail.trim_start().to_string()
233 } else {
234 format!("{decoded} {tail}")
235 }
236}
237
238fn decode_rfc2047_encoded_words(s: &str) -> String {
239 let mut out = String::new();
240 let mut rest = s;
241 while let Some(start) = rest.find("=?") {
242 out.push_str(&rest[..start]);
243 rest = &rest[start + 2..];
244 let Some(d1) = rest.find('?') else {
245 out.push_str("=?");
246 out.push_str(rest);
247 return out;
248 };
249 let charset = &rest[..d1];
250 let after_cs = &rest[d1 + 1..];
251 let Some(d2) = after_cs.find('?') else {
252 out.push_str("=?");
253 out.push_str(rest);
254 return out;
255 };
256 let encoding = after_cs[..d2].to_ascii_lowercase();
257 let after_enc = &after_cs[d2 + 1..];
258 let Some(end) = after_enc.find("?=") else {
259 out.push_str("=?");
260 out.push_str(rest);
261 return out;
262 };
263 let payload = &after_enc[..end];
264 rest = &after_enc[end + 2..];
265 if encoding == "q" {
266 let bytes = decode_quoted_printable_soft(payload);
267 out.push_str(&decode_bytes(Some(charset), &bytes));
268 } else if encoding == "b" {
269 if let Some(bytes) = base64_decode_rfc2047(payload) {
270 out.push_str(&decode_bytes(Some(charset), &bytes));
271 }
272 }
273 }
274 out.push_str(rest);
275 out
276}
277
/// Decodes the payload of an RFC 2047 "Q" encoded-word into raw bytes.
///
/// * `_` becomes a space (`0x20`).
/// * `=XY`, where `X` and `Y` are hexadecimal digits of either case, becomes
///   the byte `0xXY`.
/// * Any other byte is copied through unchanged.
///
/// A `=` that is not followed by two hexadecimal digits is emitted as a
/// literal `=`, and the bytes after it are then processed normally. They are
/// never consumed together with the `=`, so no input is lost on malformed
/// escapes.
fn decode_quoted_printable_soft(payload: &str) -> Vec<u8> {
    let src = payload.as_bytes();
    let mut out = Vec::with_capacity(src.len());
    let mut pos = 0usize;
    while let Some(&byte) = src.get(pos) {
        pos += 1;
        match byte {
            b'_' => out.push(b' '),
            b'=' => {
                // Peek at the next two bytes; only consume them when they
                // form a valid hex pair.
                let escape = match src.get(pos..pos + 2) {
                    Some(&[hi, lo]) => hex_nibble(hi).zip(hex_nibble(lo)),
                    _ => None,
                };
                if let Some((hi, lo)) = escape {
                    out.push((hi << 4) | lo);
                    pos += 2;
                } else {
                    out.push(b'=');
                }
            }
            other => out.push(other),
        }
    }
    out
}

/// Returns the value (`0..=15`) of the ASCII hexadecimal digit `b`, accepting
/// both upper and lower case, or `None` when `b` is not a hex digit.
fn hex_nibble(b: u8) -> Option<u8> {
    match b {
        b'0'..=b'9' => Some(b - b'0'),
        b'a'..=b'f' => Some(b - b'a' + 10),
        b'A'..=b'F' => Some(b - b'A' + 10),
        _ => None,
    }
}
309
/// Decodes a base64 payload (standard alphabet, `+` and `/`).
///
/// ASCII whitespace is skipped, decoding stops at the first `=`, and any
/// leftover bits that do not fill a whole byte are discarded. Returns `None`
/// if a character outside the alphabet is met before the first `=`.
fn base64_decode_rfc2047(input: &str) -> Option<Vec<u8>> {
    let mut decoded = Vec::new();
    // Bit accumulator and the number of not-yet-emitted bits it holds.
    let mut acc: u32 = 0;
    let mut pending_bits: u32 = 0;

    for symbol in input.bytes() {
        let sextet = match symbol {
            b'=' => break,
            ws if ws.is_ascii_whitespace() => continue,
            b'A'..=b'Z' => symbol - b'A',
            b'a'..=b'z' => symbol - b'a' + 26,
            b'0'..=b'9' => symbol - b'0' + 52,
            b'+' => 62,
            b'/' => 63,
            _ => return None,
        };

        acc = (acc << 6) | u32::from(sextet);
        pending_bits += 6;
        if pending_bits >= 8 {
            pending_bits -= 8;
            // At most 8 significant bits remain above `pending_bits`, so the
            // cast keeps the full value.
            decoded.push((acc >> pending_bits) as u8);
            acc &= (1 << pending_bits) - 1;
        }
    }
    Some(decoded)
}
333
334#[must_use]
341pub fn identity_raw_for_serialized_commit(
342 encoding: &Option<String>,
343 author: &str,
344 committer: &str,
345) -> (Vec<u8>, Vec<u8>) {
346 let is_utf8 = match encoding.as_deref() {
347 None => true,
348 Some(e) => e.eq_ignore_ascii_case("utf-8") || e.eq_ignore_ascii_case("utf8"),
349 };
350 if is_utf8 {
351 return (Vec::new(), Vec::new());
352 }
353 let Some(label) = encoding.as_deref() else {
354 return (Vec::new(), Vec::new());
355 };
356 let author_raw = encode_header_text(label, author).unwrap_or_default();
357 let committer_raw = encode_header_text(label, committer).unwrap_or_default();
358 (author_raw, committer_raw)
359}
360
361#[must_use]
365pub fn commit_message_unicode_for_display(
366 encoding: Option<&str>,
367 message: &str,
368 raw_message: Option<&[u8]>,
369) -> String {
370 if let Some(raw) = raw_message {
371 decode_bytes(encoding, raw)
372 } else {
373 message.to_owned()
374 }
375}
376
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn strict_utf8_accepts_plain_ascii_and_multibyte() {
        let accepted: [&[u8]; 3] = [
            b"Commit message\n",
            "Ábçdèfg はれひほふ".as_bytes(),
            // Escape-sequence text made only of 7-bit bytes, so it is valid
            // UTF-8 at the byte level.
            b"\x1b$B$O$l$R$[$U\x1b(B",
        ];
        for sample in accepted {
            assert!(is_strict_utf8(sample), "rejected {sample:?}");
        }
    }

    #[test]
    fn strict_utf8_rejects_surrogates() {
        // ED A0 80 is the three-byte form of U+D800; the reported offset is
        // that of the lead byte.
        assert_eq!(find_invalid_utf8(b"abc\xed\xa0\x80"), Some(3));
        assert!(!is_strict_utf8(b"\xed\xa0\x80"));
    }

    #[test]
    fn strict_utf8_rejects_overlong_sequences() {
        let overlong: [&[u8]; 2] = [
            // U+00A9 encoded with three bytes instead of two.
            b"\xe0\x82\xa9",
            // U+0020 encoded with two bytes instead of one.
            b"\xc0\xa0",
        ];
        for sample in overlong {
            assert!(!is_strict_utf8(sample), "accepted {sample:?}");
        }
    }

    #[test]
    fn strict_utf8_rejects_noncharacters_rust_would_accept() {
        let noncharacters: [&[u8]; 2] = [
            // U+10FFFE
            b"\xf4\x8f\xbf\xbe",
            // U+FDD0
            b"\xef\xb7\x90",
        ];
        for sample in noncharacters {
            assert!(core::str::from_utf8(sample).is_ok());
            assert!(!is_strict_utf8(sample), "accepted {sample:?}");
        }
    }

    #[test]
    fn latin1_round_trips_through_encode_and_decode() {
        let unicode = "Áéí óú";
        let encoded = encode_header_text("ISO8859-1", unicode).expect("latin1 encodes");
        assert_eq!(encoded, vec![0xC1, 0xE9, 0xED, 0x20, 0xF3, 0xFA]);
        let decoded = decode_bytes(Some("ISO8859-1"), &encoded);
        assert_eq!(decoded, unicode);
    }
}