use encoding_rs::Encoding;
/// Reports whether `label` is one of the recognized spellings of ISO-8859-1.
///
/// Leading and trailing whitespace is ignored and the comparison is ASCII
/// case-insensitive. The accepted spellings are `iso-8859-1`, `iso8859-1`,
/// `latin1` and `latin-1`.
fn is_iso_8859_1(label: &str) -> bool {
    const ALIASES: [&str; 4] = ["iso-8859-1", "iso8859-1", "latin1", "latin-1"];
    let candidate = label.trim();
    ALIASES
        .iter()
        .any(|alias| candidate.eq_ignore_ascii_case(alias))
}
/// Decodes `bytes` as ISO-8859-1, mapping each byte to the Unicode scalar
/// value with the same number (U+0000..=U+00FF).
fn decode_latin1(bytes: &[u8]) -> String {
    // Every `u8` is a valid scalar value, so the conversion cannot fail.
    bytes.iter().map(|&byte| char::from(byte)).collect()
}
/// Encodes `unicode` as ISO-8859-1, one output byte per input character.
///
/// Characters above U+00FF have no Latin-1 representation and are replaced
/// with an ASCII `?`.
fn encode_latin1_lossy(unicode: &str) -> Vec<u8> {
    let mut encoded = Vec::with_capacity(unicode.len());
    for ch in unicode.chars() {
        // The narrowing conversion succeeds exactly for U+0000..=U+00FF.
        encoded.push(u8::try_from(u32::from(ch)).unwrap_or(b'?'));
    }
    encoded
}
/// Scans `buf` for the first byte sequence that is not acceptable strict UTF-8.
///
/// Returns the offset of the lead byte of the first offending sequence, or
/// `None` when the whole buffer passes. Besides structurally broken input
/// (stray continuation bytes, truncated sequences, lead bytes announcing more
/// than three continuation bytes) the following decoded values are rejected:
///
/// * overlong encodings and values above U+10FFFF,
/// * UTF-16 surrogates (U+D800..=U+DFFF),
/// * code points ending in `FFFE`/`FFFF` and the range U+FDD0..=U+FDEF.
#[must_use]
pub fn find_invalid_utf8(buf: &[u8]) -> Option<usize> {
    // Largest code point encodable in 1, 2, 3 and 4 bytes respectively.
    const LIMITS: [u32; 4] = [0x7f, 0x7ff, 0xffff, 0x10ffff];

    let mut pos = 0usize;
    while let Some(&lead) = buf.get(pos) {
        let start = pos;
        pos += 1;
        if lead.is_ascii() {
            continue;
        }

        // A non-ASCII byte has its top bit set; each further leading one bit
        // announces one continuation byte.
        let extra = lead.leading_ones() as usize - 1;
        if extra == 0 || extra > 3 {
            return Some(start);
        }

        // The announced continuation bytes must all be present...
        let Some(tail) = buf.get(pos..pos + extra) else {
            return Some(start);
        };
        // ...and each must look like 0b10xx_xxxx.
        if tail.iter().any(|&byte| byte & 0xc0 != 0x80) {
            return Some(start);
        }
        pos += extra;

        // Payload bits of the lead byte, followed by six bits per continuation.
        let lead_bits = u32::from(lead) & (0x3f_u32 >> extra);
        let codepoint = tail
            .iter()
            .fold(lead_bits, |acc, &byte| (acc << 6) | u32::from(byte & 0x3f));

        // Out of range for this sequence length (overlong or beyond U+10FFFF).
        let out_of_range = codepoint <= LIMITS[extra - 1] || codepoint > LIMITS[extra];
        let surrogate = codepoint & 0x1f_f800 == 0xd800;
        let ends_in_fffe_or_ffff = codepoint & 0xfffe == 0xfffe;
        let in_fdd0_block = (0xfdd0..=0xfdef).contains(&codepoint);
        if out_of_range || surrogate || ends_in_fffe_or_ffff || in_fdd0_block {
            return Some(start);
        }
    }
    None
}
/// Returns `true` when [`find_invalid_utf8`] finds no offending sequence in
/// `buf`.
#[must_use]
pub fn is_strict_utf8(buf: &[u8]) -> bool {
    let first_bad_offset = find_invalid_utf8(buf);
    first_bad_offset.is_none()
}
/// Appends a single `\n` to `bytes` unless it is empty or already ends with
/// one, then hands the buffer back.
#[must_use]
pub fn ensure_body_trailing_newline(mut bytes: Vec<u8>) -> Vec<u8> {
    match bytes.last() {
        // Empty input stays empty; an existing newline is not doubled.
        None | Some(b'\n') => {}
        Some(_) => bytes.push(b'\n'),
    }
    bytes
}
/// Returns `true` when `label` is either one of the ISO-8859-1 spellings
/// handled locally or a label that [`resolve`] can map to an encoding.
pub fn is_known_encoding(label: &str) -> bool {
    if is_iso_8859_1(label) {
        return true;
    }
    resolve(label).is_some()
}
/// Looks up the `encoding_rs` encoding for a charset label.
///
/// Returns `None` for blank labels and for the ISO-8859-1 spellings, which
/// this module handles itself instead of delegating to `encoding_rs`.
///
/// Before the lookup, underscores are turned into hyphens and a few aliases
/// are rewritten (`eucjp` to `euc-jp`; `cp932`, `mskanji` and `sjis` to
/// `shift_jis`), compared ASCII case-insensitively. If the rewritten label is
/// not found, the trimmed original label is tried as a fallback.
#[must_use]
pub fn resolve(label: &str) -> Option<&'static Encoding> {
    const SHIFT_JIS_ALIASES: [&str; 3] = ["cp932", "mskanji", "sjis"];

    let trimmed = label.trim();
    if trimmed.is_empty() || is_iso_8859_1(trimmed) {
        return None;
    }

    let hyphenated = trimmed.replace('_', "-");
    let lookup_name = if hyphenated.eq_ignore_ascii_case("eucjp") {
        "euc-jp"
    } else if SHIFT_JIS_ALIASES
        .iter()
        .any(|alias| hyphenated.eq_ignore_ascii_case(alias))
    {
        "shift_jis"
    } else {
        hyphenated.as_str()
    };

    match Encoding::for_label(lookup_name.as_bytes()) {
        Some(found) => Some(found),
        // Some labels are only known with their original underscores.
        None => Encoding::for_label(trimmed.as_bytes()),
    }
}
/// Encodes `unicode` into the charset named by `label` and makes sure a
/// non-empty result ends with a newline.
///
/// ISO-8859-1 labels use the local lossy Latin-1 encoder; every other label
/// goes through [`resolve`]. Returns `None` when the label cannot be resolved.
/// The encoder's remaining return values are ignored.
#[must_use]
pub fn encode_unicode(label: &str, unicode: &str) -> Option<Vec<u8>> {
    let name = label.trim();
    if is_iso_8859_1(name) {
        return Some(ensure_body_trailing_newline(encode_latin1_lossy(unicode)));
    }
    let encoder = resolve(name)?;
    let encoded = encoder.encode(unicode).0.into_owned();
    Some(ensure_body_trailing_newline(encoded))
}
/// Encodes `unicode` into the charset named by `label`, leaving the bytes
/// exactly as the encoder produced them (no newline is appended).
///
/// ISO-8859-1 labels use the local lossy Latin-1 encoder; every other label
/// goes through [`resolve`]. Returns `None` when the label cannot be resolved.
#[must_use]
pub fn encode_header_text(label: &str, unicode: &str) -> Option<Vec<u8>> {
    let name = label.trim();
    let encoded = if is_iso_8859_1(name) {
        encode_latin1_lossy(unicode)
    } else {
        // Only the encoded bytes are used; the other tuple fields are dropped.
        let (bytes, _, _) = resolve(name)?.encode(unicode);
        bytes.into_owned()
    };
    Some(encoded)
}
/// Decodes `bytes` to a `String` using the charset named by `label`.
///
/// * ISO-8859-1 labels are decoded by the local Latin-1 decoder.
/// * Labels known to [`resolve`] are decoded with
///   `decode_without_bom_handling`.
/// * With no label, or one that cannot be resolved, the bytes are treated as
///   UTF-8 with lossy replacement of invalid sequences.
#[must_use]
pub fn decode_bytes(label: Option<&str>, bytes: &[u8]) -> String {
    match label {
        Some(name) if is_iso_8859_1(name) => decode_latin1(bytes),
        Some(name) => match resolve(name) {
            Some(decoder) => decoder.decode_without_bom_handling(bytes).0.into_owned(),
            None => String::from_utf8_lossy(bytes).into_owned(),
        },
        None => String::from_utf8_lossy(bytes).into_owned(),
    }
}
/// Re-encodes `unicode` into the charset named by `output_label`.
///
/// This is a thin alias for [`encode_header_text`]: no trailing newline is
/// added, and `None` is returned when the label cannot be resolved.
#[must_use]
pub fn reencode_utf8_to_label(output_label: &str, unicode: &str) -> Option<Vec<u8>> {
    let reencoded = encode_header_text(output_label, unicode);
    reencoded
}
/// Pairs a commit message with its encoded form for storage.
///
/// Returns `(message, encoding_label, raw_bytes)`. The last two elements are
/// `None` when no re-encoding applies, i.e. when `commit_encoding` is absent,
/// equals `utf-8`/`utf8` (ASCII case-insensitive), is blank, or cannot be
/// handled by [`encode_unicode`]. Otherwise the label is returned as given
/// together with the bytes produced by [`encode_unicode`].
#[must_use]
pub fn finalize_stored_commit_message(
    message: String,
    commit_encoding: Option<&str>,
) -> (String, Option<String>, Option<Vec<u8>>) {
    let label = match commit_encoding {
        Some(name)
            if !name.eq_ignore_ascii_case("utf-8")
                && !name.eq_ignore_ascii_case("utf8")
                && !name.trim().is_empty() =>
        {
            name
        }
        // Missing, UTF-8 or blank label: keep the message as plain UTF-8.
        _ => return (message, None, None),
    };

    match encode_unicode(label, &message) {
        Some(raw) => (message, Some(label.to_owned()), Some(raw)),
        None => (message, None, None),
    }
}
/// Decodes RFC 2047 encoded-words in the display-name part of a mailbox line
/// such as `Name <addr>`.
///
/// The input is trimmed first. Without a `<`, the whole line is decoded.
/// Otherwise only the text before the first `<` is decoded and the part from
/// `<` onwards is appended unchanged, separated by one space; if the decoded
/// name is empty, only that trailing part is returned.
#[must_use]
pub fn decode_rfc2047_mailbox_from_line(from: &str) -> String {
    let line = from.trim();
    match line.find('<') {
        None => decode_rfc2047_encoded_words(line),
        Some(angle) => {
            let (display, address) = line.split_at(angle);
            let mut result = decode_rfc2047_encoded_words(display.trim());
            if result.is_empty() {
                return address.trim_start().to_string();
            }
            result.push(' ');
            result.push_str(address);
            result
        }
    }
}
/// Replaces every RFC 2047 encoded-word (`=?charset?enc?payload?=`) in `s`
/// with its decoded text and returns the result.
///
/// * `enc` is matched ASCII case-insensitively: `q` payloads go through
///   `decode_quoted_printable_soft`, `b` payloads through
///   `base64_decode_rfc2047`; the bytes are then decoded with `decode_bytes`
///   using `charset`.
/// * Text outside encoded-words is copied unchanged, except that a run
///   consisting only of spaces, tabs, CR or LF between two successfully
///   decoded encoded-words is dropped (RFC 2047 section 6.2).
/// * An encoded-word that cannot be decoded (unknown `enc`, invalid base64)
///   is kept verbatim instead of being removed, so no input text is lost.
/// * If an `=?` is not followed by a complete `charset?enc?payload?=`
///   structure, the remainder of the input is copied verbatim.
fn decode_rfc2047_encoded_words(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    // True while the most recently emitted piece was a decoded encoded-word.
    let mut after_decoded_word = false;

    while let Some(start) = rest.find("=?") {
        // `literal` is the plain text before the candidate; `word` starts at "=?".
        let (literal, word) = rest.split_at(start);

        let parts = word[2..].split_once('?').and_then(|(charset, tail)| {
            let (encoding, tail) = tail.split_once('?')?;
            let (payload, remainder) = tail.split_once("?=")?;
            Some((charset, encoding, payload, remainder))
        });
        let Some((charset, encoding, payload, remainder)) = parts else {
            // Incomplete structure: nothing further can be decoded.
            out.push_str(rest);
            return out;
        };

        let bytes = if encoding.eq_ignore_ascii_case("q") {
            Some(decode_quoted_printable_soft(payload))
        } else if encoding.eq_ignore_ascii_case("b") {
            base64_decode_rfc2047(payload)
        } else {
            None
        };

        match bytes {
            Some(bytes) => {
                let only_lws = literal
                    .bytes()
                    .all(|b| matches!(b, b' ' | b'\t' | b'\r' | b'\n'));
                // Whitespace that merely separates two encoded-words is not
                // part of the text.
                if !(after_decoded_word && only_lws) {
                    out.push_str(literal);
                }
                out.push_str(&decode_bytes(Some(charset), &bytes));
                after_decoded_word = true;
            }
            None => {
                // Undecodable: show the encoded-word as ordinary text.
                out.push_str(literal);
                out.push_str(&word[..word.len() - remainder.len()]);
                after_decoded_word = false;
            }
        }
        rest = remainder;
    }

    out.push_str(rest);
    out
}
/// Decodes the payload of an RFC 2047 "Q" encoded-word into raw bytes.
///
/// * `_` becomes a space (0x20).
/// * `=XY`, with `X` and `Y` hexadecimal digits of either case, becomes the
///   byte `0xXY`.
/// * Every other byte is copied unchanged.
///
/// An `=` that is not followed by two hex digits is emitted literally and the
/// bytes after it are processed normally, so malformed escapes never swallow
/// the text that follows them.
fn decode_quoted_printable_soft(payload: &str) -> Vec<u8> {
    let src = payload.as_bytes();
    let mut out = Vec::with_capacity(src.len());
    let mut i = 0usize;
    while i < src.len() {
        match src[i] {
            b'_' => out.push(b' '),
            b'=' => {
                // Look ahead without consuming, so a bad escape loses nothing.
                let escaped = src
                    .get(i + 1..i + 3)
                    .and_then(|pair| Some((hex_nibble(pair[0])? << 4) | hex_nibble(pair[1])?));
                if let Some(byte) = escaped {
                    out.push(byte);
                    i += 3;
                    continue;
                }
                out.push(b'=');
            }
            other => out.push(other),
        }
        i += 1;
    }
    out
}
/// Returns the value (0..=15) of an ASCII hexadecimal digit of either case,
/// or `None` when `b` is not a hex digit.
fn hex_nibble(b: u8) -> Option<u8> {
    match b {
        b'0'..=b'9' => Some(b - b'0'),
        b'a'..=b'f' => Some(b - b'a' + 10),
        b'A'..=b'F' => Some(b - b'A' + 10),
        _ => None,
    }
}
/// Decodes a standard-alphabet base64 payload.
///
/// ASCII whitespace is skipped and decoding stops at the first `=`. Any other
/// byte outside `A-Z`, `a-z`, `0-9`, `+`, `/` makes the function return
/// `None`. Leftover bits that do not fill a whole byte are discarded.
fn base64_decode_rfc2047(input: &str) -> Option<Vec<u8>> {
    let mut decoded = Vec::new();
    // Bit accumulator and the number of not-yet-emitted bits it holds.
    let mut acc: u32 = 0;
    let mut pending: u32 = 0;

    for symbol in input.bytes() {
        if symbol == b'=' {
            break;
        }
        if symbol.is_ascii_whitespace() {
            continue;
        }
        let sextet = match symbol {
            b'A'..=b'Z' => symbol - b'A',
            b'a'..=b'z' => symbol - b'a' + 26,
            b'0'..=b'9' => symbol - b'0' + 52,
            b'+' => 62,
            b'/' => 63,
            _ => return None,
        };
        acc = (acc << 6) | u32::from(sextet);
        pending += 6;
        if pending >= 8 {
            pending -= 8;
            decoded.push((acc >> pending) as u8);
            // Keep only the bits that have not been emitted yet.
            acc &= (1 << pending) - 1;
        }
    }
    Some(decoded)
}
#[must_use]
pub fn identity_raw_for_serialized_commit(
encoding: &Option<String>,
author: &str,
committer: &str,
) -> (Vec<u8>, Vec<u8>) {
let is_utf8 = match encoding.as_deref() {
None => true,
Some(e) => e.eq_ignore_ascii_case("utf-8") || e.eq_ignore_ascii_case("utf8"),
};
if is_utf8 {
return (Vec::new(), Vec::new());
}
let Some(label) = encoding.as_deref() else {
return (Vec::new(), Vec::new());
};
let author_raw = encode_header_text(label, author).unwrap_or_default();
let committer_raw = encode_header_text(label, committer).unwrap_or_default();
(author_raw, committer_raw)
}
/// Returns the commit message as Unicode text for display.
///
/// When `raw_message` is present it is decoded with [`decode_bytes`] using
/// `encoding`; otherwise `message` is returned as an owned copy.
#[must_use]
pub fn commit_message_unicode_for_display(
    encoding: Option<&str>,
    message: &str,
    raw_message: Option<&[u8]>,
) -> String {
    match raw_message {
        Some(raw) => decode_bytes(encoding, raw),
        None => message.to_owned(),
    }
}
// Unit tests for the strict UTF-8 validator and the Latin-1 encode/decode path.
#[cfg(test)]
mod tests {
use super::*;
// Well-formed input must pass: ASCII, multi-byte UTF-8, and a byte string
// made only of ESC plus printable ASCII (every byte is below 0x80).
#[test]
fn strict_utf8_accepts_plain_ascii_and_multibyte() {
assert!(is_strict_utf8(b"Commit message\n"));
assert!(is_strict_utf8("Ábçdèfg はれひほふ".as_bytes()));
assert!(is_strict_utf8(b"\x1b$B$O$l$R$[$U\x1b(B"));
}
// ED A0 80 encodes U+D800, a UTF-16 surrogate; the reported offset is that
// of the lead byte of the offending sequence.
#[test]
fn strict_utf8_rejects_surrogates() {
assert_eq!(find_invalid_utf8(b"abc\xed\xa0\x80"), Some(3));
assert!(!is_strict_utf8(b"\xed\xa0\x80"));
}
// Both sequences encode a value that fits in a shorter form.
#[test]
fn strict_utf8_rejects_overlong_sequences() {
assert!(!is_strict_utf8(b"\xe0\x82\xa9"));
assert!(!is_strict_utf8(b"\xc0\xa0"));
}
// U+10FFFE and U+FDD0 are accepted by `core::str::from_utf8` but are
// rejected by the stricter validator in this module.
#[test]
fn strict_utf8_rejects_noncharacters_rust_would_accept() {
assert!(core::str::from_utf8(b"\xf4\x8f\xbf\xbe").is_ok());
assert!(!is_strict_utf8(b"\xf4\x8f\xbf\xbe"));
assert!(core::str::from_utf8(b"\xef\xb7\x90").is_ok());
assert!(!is_strict_utf8(b"\xef\xb7\x90"));
}
// Encoding to Latin-1 yields one byte per character, and decoding those
// bytes restores the original text.
#[test]
fn latin1_round_trips_through_encode_and_decode() {
let unicode = "Áéí óú";
let encoded = encode_header_text("ISO8859-1", unicode).expect("latin1 encodes");
assert_eq!(encoded, vec![0xC1, 0xE9, 0xED, 0x20, 0xF3, 0xFA]);
assert_eq!(decode_bytes(Some("ISO8859-1"), &encoded), unicode);
}
}