grit_lib/
commit_encoding.rs1use encoding_rs::Encoding;
7
/// Reports whether `label` names ISO-8859-1 (Latin-1).
///
/// Surrounding whitespace is ignored and the comparison is ASCII
/// case-insensitive. The accepted spellings are `iso-8859-1`, `iso8859-1`,
/// `latin1` and `latin-1`.
fn is_iso_8859_1(label: &str) -> bool {
    const ALIASES: [&str; 4] = ["iso-8859-1", "iso8859-1", "latin1", "latin-1"];
    let candidate = label.trim();
    ALIASES
        .iter()
        .any(|alias| candidate.eq_ignore_ascii_case(alias))
}
14
/// Decodes ISO-8859-1 bytes into a `String`.
///
/// Each byte becomes the Unicode scalar value with the same number, so every
/// input is decodable.
fn decode_latin1(bytes: &[u8]) -> String {
    bytes.iter().copied().map(char::from).collect()
}
22
/// Encodes `unicode` as ISO-8859-1, one output byte per input character.
///
/// Characters above U+00FF have no Latin-1 representation and are replaced
/// by an ASCII `?`.
fn encode_latin1_lossy(unicode: &str) -> Vec<u8> {
    unicode
        .chars()
        .map(|ch| u8::try_from(u32::from(ch)).unwrap_or(b'?'))
        .collect()
}
36
/// Finds the first byte sequence in `buf` that is not acceptable UTF-8.
///
/// Returns the offset of the lead byte of the offending sequence, or `None`
/// when the whole buffer passes. The check is stricter than
/// `core::str::from_utf8`; a sequence is rejected when
///
/// * its lead byte is a stray continuation byte or announces more than three
///   continuation bytes,
/// * the buffer ends before the announced continuation bytes,
/// * a continuation byte does not have the `10xxxxxx` form,
/// * the decoded value is overlong for its length or above U+10FFFF,
/// * the decoded value is a UTF-16 surrogate,
/// * the decoded value ends in `FFFE`/`FFFF`, or lies in U+FDD0..=U+FDEF.
#[must_use]
pub fn find_invalid_utf8(buf: &[u8]) -> Option<usize> {
    // Largest value encodable in a sequence of 1, 2, 3 and 4 bytes.
    const SEQUENCE_LIMITS: [u32; 4] = [0x7f, 0x7ff, 0xffff, 0x10ffff];

    let mut pos = 0usize;
    while let Some(&lead) = buf.get(pos) {
        let start = pos;
        pos += 1;
        if lead.is_ascii() {
            continue;
        }

        // A non-ASCII byte has its top bit set; the further set bits directly
        // below it give the number of continuation bytes that must follow.
        let trailing = lead.leading_ones() as usize - 1;
        if !(1..=3).contains(&trailing) {
            return Some(start);
        }
        // `get` yields `None` when the buffer ends inside the sequence.
        let Some(tail) = buf.get(pos..pos + trailing) else {
            return Some(start);
        };

        // Payload bits of the lead byte, then six bits per continuation byte.
        let mut codepoint = u32::from(lead) & (0x7f >> trailing);
        for &next in tail {
            if next & 0xc0 != 0x80 {
                return Some(start);
            }
            codepoint = (codepoint << 6) | u32::from(next & 0x3f);
        }
        pos += trailing;

        let overlong = codepoint <= SEQUENCE_LIMITS[trailing - 1];
        let out_of_range = codepoint > SEQUENCE_LIMITS[trailing];
        let surrogate = codepoint & 0x1f_f800 == 0xd800;
        let ends_in_fffe_or_ffff = codepoint & 0xfffe == 0xfffe;
        let in_fdd0_block = (0xfdd0..=0xfdef).contains(&codepoint);
        if overlong || out_of_range || surrogate || ends_in_fffe_or_ffff || in_fdd0_block {
            return Some(start);
        }
    }
    None
}
104
105#[must_use]
107pub fn is_strict_utf8(buf: &[u8]) -> bool {
108 find_invalid_utf8(buf).is_none()
109}
110
/// Appends a line feed to `bytes` unless the buffer is empty or already ends
/// with one, and returns the buffer.
#[must_use]
pub fn ensure_body_trailing_newline(mut bytes: Vec<u8>) -> Vec<u8> {
    if matches!(bytes.last(), Some(&last) if last != b'\n') {
        bytes.push(b'\n');
    }
    bytes
}
119
120pub fn is_known_encoding(label: &str) -> bool {
124 is_iso_8859_1(label) || resolve(label).is_some()
125}
126
127#[must_use]
132pub fn resolve(label: &str) -> Option<&'static Encoding> {
133 let t = label.trim();
134 if t.is_empty() || is_iso_8859_1(t) {
135 return None;
136 }
137 let normalized = t.replace('_', "-");
138 let lower = normalized.to_ascii_lowercase();
139 let mapped = match lower.as_str() {
140 "eucjp" => "euc-jp",
141 "cp932" | "mskanji" | "sjis" => "shift_jis",
142 _ => normalized.as_str(),
143 };
144 Encoding::for_label(mapped.as_bytes()).or_else(|| Encoding::for_label(t.as_bytes()))
145}
146
147#[must_use]
149pub fn encode_unicode(label: &str, unicode: &str) -> Option<Vec<u8>> {
150 let t = label.trim();
151 let raw = if is_iso_8859_1(t) {
152 encode_latin1_lossy(unicode)
153 } else {
154 let enc = resolve(t)?;
155 let (cow, _, _) = enc.encode(unicode);
156 cow.into_owned()
157 };
158 Some(ensure_body_trailing_newline(raw))
159}
160
161#[must_use]
163pub fn encode_header_text(label: &str, unicode: &str) -> Option<Vec<u8>> {
164 let t = label.trim();
165 if is_iso_8859_1(t) {
166 return Some(encode_latin1_lossy(unicode));
167 }
168 let enc = resolve(t)?;
169 let (cow, _, _) = enc.encode(unicode);
170 Some(cow.into_owned())
171}
172
173#[must_use]
175pub fn decode_bytes(label: Option<&str>, bytes: &[u8]) -> String {
176 if let Some(l) = label {
177 if is_iso_8859_1(l) {
178 return decode_latin1(bytes);
179 }
180 if let Some(enc) = resolve(l) {
181 let (cow, _) = enc.decode_without_bom_handling(bytes);
182 return cow.into_owned();
183 }
184 }
185 String::from_utf8_lossy(bytes).into_owned()
186}
187
188#[must_use]
190pub fn reencode_utf8_to_label(output_label: &str, unicode: &str) -> Option<Vec<u8>> {
191 encode_header_text(output_label, unicode)
192}
193
194#[must_use]
200pub fn finalize_stored_commit_message(
201 message: String,
202 commit_encoding: Option<&str>,
203) -> (String, Option<String>, Option<Vec<u8>>) {
204 let is_utf8 = match commit_encoding {
205 None => true,
206 Some(e) => e.eq_ignore_ascii_case("utf-8") || e.eq_ignore_ascii_case("utf8"),
207 };
208 if is_utf8 {
209 return (message, None, None);
210 }
211 let Some(label) = commit_encoding.filter(|s| !s.trim().is_empty()) else {
212 return (message, None, None);
213 };
214 let Some(raw) = encode_unicode(label, &message) else {
215 return (message, None, None);
216 };
217 (message, Some(label.to_owned()), Some(raw))
218}
219
220#[must_use]
225pub fn decode_rfc2047_mailbox_from_line(from: &str) -> String {
226 let from = from.trim();
227 let Some(lt) = from.find('<') else {
228 return decode_rfc2047_encoded_words(from);
229 };
230 let name = from[..lt].trim();
231 let tail = &from[lt..];
232 let decoded = decode_rfc2047_encoded_words(name);
233 if decoded.is_empty() {
234 tail.trim_start().to_string()
235 } else {
236 format!("{decoded} {tail}")
237 }
238}
239
240fn decode_rfc2047_encoded_words(s: &str) -> String {
241 let mut out = String::new();
242 let mut rest = s;
243 while let Some(start) = rest.find("=?") {
244 out.push_str(&rest[..start]);
245 rest = &rest[start + 2..];
246 let Some(d1) = rest.find('?') else {
247 out.push_str("=?");
248 out.push_str(rest);
249 return out;
250 };
251 let charset = &rest[..d1];
252 let after_cs = &rest[d1 + 1..];
253 let Some(d2) = after_cs.find('?') else {
254 out.push_str("=?");
255 out.push_str(rest);
256 return out;
257 };
258 let encoding = after_cs[..d2].to_ascii_lowercase();
259 let after_enc = &after_cs[d2 + 1..];
260 let Some(end) = after_enc.find("?=") else {
261 out.push_str("=?");
262 out.push_str(rest);
263 return out;
264 };
265 let payload = &after_enc[..end];
266 rest = &after_enc[end + 2..];
267 if encoding == "q" {
268 let bytes = decode_quoted_printable_soft(payload);
269 out.push_str(&decode_bytes(Some(charset), &bytes));
270 } else if encoding == "b" {
271 if let Some(bytes) = base64_decode_rfc2047(payload) {
272 out.push_str(&decode_bytes(Some(charset), &bytes));
273 }
274 }
275 }
276 out.push_str(rest);
277 out
278}
279
/// Decodes the payload of an RFC 2047 `Q`-encoded word into raw bytes.
///
/// * `_` becomes a space.
/// * `=XY`, with `X` and `Y` hexadecimal digits of either case, becomes the
///   byte `0xXY`.
/// * A `=` that is not followed by two hexadecimal digits is copied through
///   literally, and the bytes after it are decoded normally rather than
///   being discarded.
/// * Every other byte is copied through unchanged.
fn decode_quoted_printable_soft(payload: &str) -> Vec<u8> {
    let src = payload.as_bytes();
    let mut out = Vec::with_capacity(src.len());
    let mut pos = 0usize;

    while let Some(&byte) = src.get(pos) {
        pos += 1;
        match byte {
            b'_' => out.push(b' '),
            b'=' => {
                // Look ahead without consuming, so a malformed escape does
                // not swallow the bytes that follow the `=`.
                let escaped = match src.get(pos..pos + 2) {
                    Some(&[hi, lo]) => hex_nibble(hi).zip(hex_nibble(lo)),
                    _ => None,
                };
                if let Some((hi, lo)) = escaped {
                    out.push((hi << 4) | lo);
                    pos += 2;
                } else {
                    out.push(b'=');
                }
            }
            other => out.push(other),
        }
    }
    out
}

/// Returns the value (0..=15) of an ASCII hexadecimal digit, or `None` when
/// `b` is not one.
fn hex_nibble(b: u8) -> Option<u8> {
    match b {
        b'0'..=b'9' => Some(b - b'0'),
        b'a'..=b'f' => Some(b - b'a' + 10),
        b'A'..=b'F' => Some(b - b'A' + 10),
        _ => None,
    }
}
311
/// Decodes a standard-alphabet base64 payload.
///
/// ASCII whitespace is skipped and decoding stops at the first `=`. Leftover
/// bits that do not fill a whole byte are discarded. Returns `None` when a
/// character outside the base64 alphabet is met before any `=`.
fn base64_decode_rfc2047(input: &str) -> Option<Vec<u8>> {
    let mut decoded = Vec::new();
    let mut acc: u32 = 0;
    let mut pending_bits: u32 = 0;

    for symbol in input.bytes() {
        let sextet = match symbol {
            b'=' => break,
            _ if symbol.is_ascii_whitespace() => continue,
            b'A'..=b'Z' => symbol - b'A',
            b'a'..=b'z' => symbol - b'a' + 26,
            b'0'..=b'9' => symbol - b'0' + 52,
            b'+' => 62,
            b'/' => 63,
            _ => return None,
        };

        acc = (acc << 6) | u32::from(sextet);
        pending_bits += 6;
        if pending_bits >= 8 {
            pending_bits -= 8;
            // At most 12 bits are buffered, so the shifted value fits a byte.
            decoded.push((acc >> pending_bits) as u8);
            acc &= (1 << pending_bits) - 1;
        }
    }
    Some(decoded)
}
335
336#[must_use]
343pub fn identity_raw_for_serialized_commit(
344 encoding: &Option<String>,
345 author: &str,
346 committer: &str,
347) -> (Vec<u8>, Vec<u8>) {
348 let is_utf8 = match encoding.as_deref() {
349 None => true,
350 Some(e) => e.eq_ignore_ascii_case("utf-8") || e.eq_ignore_ascii_case("utf8"),
351 };
352 if is_utf8 {
353 return (Vec::new(), Vec::new());
354 }
355 let Some(label) = encoding.as_deref() else {
356 return (Vec::new(), Vec::new());
357 };
358 let author_raw = encode_header_text(label, author).unwrap_or_default();
359 let committer_raw = encode_header_text(label, committer).unwrap_or_default();
360 (author_raw, committer_raw)
361}
362
363#[must_use]
367pub fn commit_message_unicode_for_display(
368 encoding: Option<&str>,
369 message: &str,
370 raw_message: Option<&[u8]>,
371) -> String {
372 if let Some(raw) = raw_message {
373 decode_bytes(encoding, raw)
374 } else {
375 message.to_owned()
376 }
377}
378
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn strict_utf8_accepts_plain_ascii_and_multibyte() {
        let accepted: [&[u8]; 3] = [
            b"Commit message\n",
            "Ábçdèfg はれひほふ".as_bytes(),
            // Escape-sequence text made only of 7-bit bytes (looks like
            // ISO-2022-JP), so it passes as plain ASCII.
            b"\x1b$B$O$l$R$[$U\x1b(B",
        ];
        for sample in accepted {
            assert!(is_strict_utf8(sample));
        }
    }

    #[test]
    fn strict_utf8_rejects_surrogates() {
        // ED A0 80 is the three-byte form of the surrogate U+D800.
        let with_ascii_prefix = b"abc\xed\xa0\x80";
        let bare = b"\xed\xa0\x80";
        assert_eq!(find_invalid_utf8(with_ascii_prefix), Some(3));
        assert!(!is_strict_utf8(bare));
    }

    #[test]
    fn strict_utf8_rejects_overlong_sequences() {
        let overlong: [&[u8]; 2] = [b"\xe0\x82\xa9", b"\xc0\xa0"];
        for sample in overlong {
            assert!(!is_strict_utf8(sample));
        }
    }

    #[test]
    fn strict_utf8_rejects_noncharacters_rust_would_accept() {
        // U+10FFFE and U+FDD0: valid for `core::str`, rejected by the
        // strict check.
        let noncharacters: [&[u8]; 2] = [b"\xf4\x8f\xbf\xbe", b"\xef\xb7\x90"];
        for sample in noncharacters {
            assert!(core::str::from_utf8(sample).is_ok());
            assert!(!is_strict_utf8(sample));
        }
    }

    #[test]
    fn latin1_round_trips_through_encode_and_decode() {
        let text = "Áéí óú";
        let expected_bytes = vec![0xC1, 0xE9, 0xED, 0x20, 0xF3, 0xFA];

        let bytes = encode_header_text("ISO8859-1", text).expect("latin1 encodes");
        assert_eq!(bytes, expected_bytes);

        let round_tripped = decode_bytes(Some("ISO8859-1"), &bytes);
        assert_eq!(round_tripped, text);
    }
}