#![doc = include_str!("../README.md")]
#![deny(missing_docs)]
#![deny(rustdoc::broken_intra_doc_links)]
use std::borrow::Cow;
use base64::{engine::general_purpose::STANDARD as B64, Engine as _};
/// Encodes `input` as an RFC 2047 encoded-word when it cannot be sent verbatim.
///
/// The input is returned unchanged (borrowed, no allocation) when it is pure
/// ASCII and does not contain the `=?` sequence. Otherwise the whole string is
/// Base64-encoded and wrapped as a single `=?UTF-8?B?...?=` word. Inputs that
/// contain a literal `=?` are always wrapped, even when they are pure ASCII.
///
/// The output is always one encoded-word; no length-based splitting or line
/// folding is performed.
pub fn encode(input: &str) -> Cow<'_, str> {
    const PREFIX: &str = "=?UTF-8?B?";
    const SUFFIX: &str = "?=";

    let needs_encoding = !input.is_ascii() || input.contains("=?");
    if !needs_encoding {
        return Cow::Borrowed(input);
    }

    let payload = B64.encode(input.as_bytes());
    Cow::Owned([PREFIX, payload.as_str(), SUFFIX].concat())
}
/// Decodes RFC 2047 encoded-words found in a raw header value.
///
/// * Input without any `=?` sequence is returned as-is: borrowed when it is
///   valid UTF-8, otherwise converted lossily.
/// * Every well-formed `=?charset?B?...?=` / `=?charset?Q?...?=` word is
///   decoded and converted to UTF-8.
/// * A run consisting only of spaces/tabs between two adjacent encoded-words
///   is dropped; all other text between words is copied through (lossily if
///   it is not valid UTF-8).
/// * A `=?` that does not start a well-formed encoded-word is kept literally,
///   together with any whitespace that preceded it.
///
/// This function never fails; malformed input degrades to literal output.
pub fn decode(input: &[u8]) -> Cow<'_, str> {
    // Fast path: nothing that could be an encoded-word, so no rewriting needed.
    // `from_utf8_lossy` borrows when the bytes are already valid UTF-8.
    if !contains_encoded_word(input) {
        return String::from_utf8_lossy(input);
    }

    let mut out = String::with_capacity(input.len());
    let mut cursor = 0usize;
    // True when the most recent thing emitted was a successfully decoded word.
    let mut last_was_encoded = false;

    while cursor < input.len() {
        let start = match find_encoded_word_start(input, cursor) {
            Some(start) => start,
            None => {
                // No further candidates: the remainder is plain text.
                push_lossy(&mut out, &input[cursor..]);
                break;
            }
        };

        // Text between the previous position and this `=?` candidate.
        let gap = &input[cursor..start];
        let gap_is_separator =
            last_was_encoded && gap.iter().all(|&b| matches!(b, b' ' | b'\t'));

        match find_encoded_word_end(input, start) {
            Some((charset, encoding, text, end)) => {
                // Whitespace separating two encoded-words is not part of the
                // text and is dropped; anything else is kept.
                if !gap_is_separator {
                    push_lossy(&mut out, gap);
                }
                decode_encoded_word(&mut out, charset, encoding, text);
                cursor = end;
                last_was_encoded = true;
            }
            None => {
                // Not an encoded-word after all: the gap (including whitespace
                // that followed a decoded word) is ordinary text and must be
                // emitted before the literal `=?`.
                push_lossy(&mut out, gap);
                out.push_str("=?");
                cursor = start + 2;
                last_was_encoded = false;
            }
        }
    }

    Cow::Owned(out)
}
/// Reports whether `input` contains the two-byte sequence `=?` anywhere.
///
/// This is only a cheap pre-check: it does not validate that the sequence
/// actually starts a well-formed encoded-word.
fn contains_encoded_word(input: &[u8]) -> bool {
    input.windows(2).any(|pair| pair == b"=?")
}
/// Returns the index of the first `=?` sequence located at or after `from`.
///
/// Yields `None` when no such sequence exists or when `from` lies beyond the
/// end of `input`.
fn find_encoded_word_start(input: &[u8], from: usize) -> Option<usize> {
    input
        .get(from..)?
        .windows(2)
        .position(|pair| pair == b"=?")
        .map(|offset| from + offset)
}
/// Parses the encoded-word whose `=?` marker begins at `start`.
///
/// On success returns `(charset, encoding, text, end)`:
/// * `charset` – the non-empty bytes between `=?` and the next `?`;
/// * `encoding` – the single encoding byte, one of `B`, `b`, `Q`, `q`;
/// * `text` – the encoded payload up to (not including) the first `?=`;
/// * `end` – the index in `input` just past the closing `?=`.
///
/// Returns `None` when the charset is empty, the encoding byte is not one of
/// the four accepted values, the `?` after the encoding byte is missing, or
/// no closing `?=` follows.
fn find_encoded_word_end(input: &[u8], start: usize) -> Option<(&[u8], u8, &[u8], usize)> {
    // Everything after the leading `=?`.
    let after_marker = input.get(start + 2..)?;

    let charset_len = after_marker.iter().position(|&b| b == b'?')?;
    if charset_len == 0 {
        return None;
    }
    let (charset, rest) = after_marker.split_at(charset_len);

    // `rest` is `?<encoding>?<text...>`; the leading `?` is the charset terminator.
    match &rest[1..] {
        [enc, b'?', body @ ..] if matches!(*enc, b'B' | b'b' | b'Q' | b'q') => {
            let text_len = body.windows(2).position(|pair| pair == b"?=")?;
            let text_start = input.len() - body.len();
            Some((charset, *enc, &body[..text_len], text_start + text_len + 2))
        }
        _ => None,
    }
}
/// Decodes the payload of one encoded-word and appends the result to `out`.
///
/// `encoding` selects the transfer decoding: `B`/`b` for Base64, `Q`/`q` for
/// the Q encoding. The decoded bytes are then converted from `charset` to
/// UTF-8. If the Base64 payload is invalid, the raw payload text is appended
/// instead. Any other `encoding` value appends nothing.
fn decode_encoded_word(out: &mut String, charset: &[u8], encoding: u8, text: &[u8]) {
    let payload = match encoding.to_ascii_uppercase() {
        b'B' => match B64.decode(text) {
            Ok(bytes) => bytes,
            Err(_) => {
                // Undecodable Base64: fall back to emitting the payload verbatim.
                push_lossy(out, text);
                return;
            }
        },
        b'Q' => decode_q(text),
        _ => return,
    };
    convert_to_utf8(out, charset, &payload);
}
/// Decodes Q-encoded text into raw bytes.
///
/// * `_` becomes a space.
/// * `=XY`, where `X` and `Y` are hex digits (either case), becomes the byte
///   `0xXY`.
/// * A `=` that is not followed by two hex digits is kept as a literal `=`.
/// * Every other byte is copied unchanged.
fn decode_q(text: &[u8]) -> Vec<u8> {
    let mut decoded = Vec::with_capacity(text.len());
    let mut rest = text;

    while let Some((&byte, tail)) = rest.split_first() {
        rest = tail;
        match byte {
            b'_' => decoded.push(b' '),
            b'=' => match tail {
                [hi, lo, after @ ..] => match (hex_nibble(*hi), hex_nibble(*lo)) {
                    (Some(h), Some(l)) => {
                        decoded.push((h << 4) | l);
                        // Skip the two hex digits that were just consumed.
                        rest = after;
                    }
                    _ => decoded.push(b'='),
                },
                // Fewer than two bytes remain: treat `=` as a literal.
                _ => decoded.push(b'='),
            },
            other => decoded.push(other),
        }
    }

    decoded
}
/// Converts one ASCII hex digit (`0-9`, `A-F`, `a-f`) to its value `0..=15`.
///
/// Returns `None` for any other byte.
#[inline]
fn hex_nibble(b: u8) -> Option<u8> {
    // `to_digit(16)` yields values below 16, so narrowing to `u8` is lossless.
    char::from(b).to_digit(16).map(|digit| digit as u8)
}
/// Converts `bytes` from the character set named by `charset` to UTF-8 and
/// appends the result to `out`.
///
/// `charset` may carry an RFC 2231 language suffix (`charset*language`); only
/// the part before the first `*` is used for the lookup. Labels that
/// `encoding_rs` does not recognise fall back to UTF-8.
fn convert_to_utf8(out: &mut String, charset: &[u8], bytes: &[u8]) {
    // `iso-8859-1*en` names the charset `iso-8859-1`; without stripping the
    // language tag the lookup fails and the text is misread as UTF-8.
    // `split` always yields at least one item, so the fallback is never used.
    let label = charset.split(|&b| b == b'*').next().unwrap_or(charset);

    let encoding = encoding_rs::Encoding::for_label(label).unwrap_or(encoding_rs::UTF_8);
    let (text, _, _) = encoding.decode(bytes);
    out.push_str(&text);
}
/// Appends `bytes` to `out`, interpreting them as UTF-8.
///
/// Invalid sequences are replaced with U+FFFD; valid input is appended
/// without an intermediate allocation.
fn push_lossy(out: &mut String, bytes: &[u8]) {
    // `from_utf8_lossy` borrows when `bytes` is valid UTF-8 and only allocates
    // when a replacement is needed.
    out.push_str(&String::from_utf8_lossy(bytes));
}
// Unit tests for `encode` and `decode`. They are grouped as: encoder behaviour,
// decoder pass-through, Base64/Q decoding, charset conversion, whitespace
// handling between words, malformed input, and round-trips.
#[cfg(test)]
mod tests {
use super::*;
// ---- encode: pass-through and wrapping ----
// Pure ASCII without `=?` must come back borrowed (no allocation).
#[test]
fn encode_ascii_is_borrowed() {
let r = encode("Hello World");
assert_eq!(r, "Hello World");
assert!(matches!(r, Cow::Borrowed(_)));
}
// Non-ASCII input is wrapped as a single UTF-8 Base64 encoded-word.
#[test]
fn encode_japanese() {
let r = encode("日本語");
assert_eq!(r, "=?UTF-8?B?5pel5pys6Kqe?=");
}
// Mixed Latin, CJK and emoji text survives encode followed by decode.
#[test]
fn encode_roundtrip_via_decode() {
let original = "café — 日本語 — émoji 🦀";
let encoded = encode(original);
let decoded = decode(encoded.as_bytes());
assert_eq!(decoded, original);
}
// The empty string counts as ASCII and is returned borrowed.
#[test]
fn encode_empty_string() {
let r = encode("");
assert_eq!(r, "");
assert!(matches!(r, Cow::Borrowed(_)));
}
// Input made only of 4-byte UTF-8 characters is wrapped and round-trips.
#[test]
fn encode_pure_emoji() {
let r = encode("🦀🚀");
assert!(r.starts_with("=?UTF-8?B?"));
assert!(r.ends_with("?="));
let decoded = decode(r.as_bytes());
assert_eq!(decoded, "🦀🚀");
}
// ---- decode: input without encoded-words ----
// No `=?` present and valid UTF-8: the result borrows from the input.
#[test]
fn plain_ascii_is_borrowed() {
let r = decode(b"hello world");
assert_eq!(r, "hello world");
assert!(matches!(r, Cow::Borrowed(_)));
}
// Raw (unencoded) UTF-8 is also passed through borrowed.
#[test]
fn utf8_no_encoding_returns_borrowed() {
let r = decode("héllo".as_bytes());
assert_eq!(r, "héllo");
assert!(matches!(r, Cow::Borrowed(_)));
}
// ---- decode: Base64 and Q transfer encodings ----
#[test]
fn base64_utf8() {
let r = decode(b"=?UTF-8?B?VGVzdA==?=");
assert_eq!(r, "Test");
}
// `=20` is the hex escape for a space.
#[test]
fn quoted_printable_utf8() {
let r = decode(b"=?UTF-8?Q?Hello=20World?=");
assert_eq!(r, "Hello World");
}
// In Q encoding an underscore stands for a space.
#[test]
fn q_underscore_is_space() {
let r = decode(b"=?UTF-8?Q?Hello_World?=");
assert_eq!(r, "Hello World");
}
// The encoding marker is accepted in lower case (`q`).
#[test]
fn q_lowercase_encoding_marker() {
let r = decode(b"=?utf-8?q?ohai?=");
assert_eq!(r, "ohai");
}
// The encoding marker is accepted in lower case (`b`).
#[test]
fn b_lowercase_encoding_marker() {
let r = decode(b"=?utf-8?b?dGVzdA==?=");
assert_eq!(r, "test");
}
// ---- decode: non-UTF-8 charsets ----
#[test]
fn iso_8859_1() {
let r = decode(b"=?iso-8859-1?B?Y2Fm6Q==?=");
assert_eq!(r, "café");
}
#[test]
fn iso_2022_jp_japanese() {
let r = decode(b"=?ISO-2022-JP?B?GyRCJDMkcyRLJEEkTxsoQg==?=");
assert_eq!(r, "こんにちは");
}
// ---- decode: text and whitespace around encoded-words ----
// Plain text on either side of a word keeps its surrounding spaces.
#[test]
fn mixed_ascii_and_encoded() {
let r = decode(b"Prefix =?UTF-8?B?VGVzdA==?= Suffix");
assert_eq!(r, "Prefix Test Suffix");
}
// A space that only separates two encoded-words is dropped.
#[test]
fn adjacent_encoded_words_collapse_whitespace() {
let r = decode(b"=?UTF-8?B?aGVsbG8=?= =?UTF-8?B?d29ybGQ=?=");
assert_eq!(r, "helloworld");
}
// Whitespace next to ordinary text between two words is kept.
#[test]
fn whitespace_preserved_around_ascii_run() {
let r = decode(b"=?UTF-8?B?aGVsbG8=?= mid =?UTF-8?B?d29ybGQ=?=");
assert_eq!(r, "hello mid world");
}
// ---- decode: malformed encoded-words stay literal ----
// Missing closing `?=`.
#[test]
fn malformed_no_closing_returns_literal_lead_in() {
let r = decode(b"=?UTF-8?B?VGVzdA");
assert!(r.starts_with("=?"));
}
// Empty charset field.
#[test]
fn malformed_empty_charset_kept_literal() {
let r = decode(b"=??B?VGVzdA==?=");
assert!(r.starts_with("=?"));
}
// Encoding byte other than B/b/Q/q.
#[test]
fn malformed_unknown_encoding_kept_literal() {
let r = decode(b"=?UTF-8?X?garbage?=");
assert!(r.starts_with("=?"));
}
// ---- decode: edge cases ----
#[test]
fn empty_input_returns_empty() {
assert_eq!(decode(b""), "");
}
// Invalid UTF-8 outside any encoded-word is converted lossily; valid bytes survive.
#[test]
fn invalid_utf8_in_unencoded_returns_lossy() {
let r = decode(&[0xFF, 0xFE, b'h', b'i']);
assert!(r.contains("hi"));
}
// `=ZZ` is not a valid hex escape; the text around it must still be decoded.
#[test]
fn q_encoding_malformed_hex() {
let r = decode(b"=?UTF-8?Q?abc=ZZdef?=");
assert!(r.contains("abc"));
assert!(r.contains("def"));
}
// An unrecognised charset label is treated as UTF-8.
#[test]
fn unknown_charset_falls_through_to_utf8() {
let r = decode(b"=?x-fake-charset?B?aGVsbG8=?=");
assert_eq!(r, "hello");
}
// `=E9` is a single Latin-1 byte and must be converted via the declared charset.
#[test]
fn q_encoding_with_latin1_chars() {
let r = decode(b"=?iso-8859-1?Q?caf=E9?=");
assert_eq!(r, "café");
}
// An encoded-word with an empty payload decodes to nothing.
#[test]
fn empty_encoded_word_body() {
let r = decode(b"=?UTF-8?B??=");
assert_eq!(r, "");
}
// NOTE(review): despite the "no_collapse" name, the assertion expects the
// separating space to be dropped even when the two words use different charsets.
#[test]
fn adjacent_words_different_charsets_no_collapse() {
let r = decode(b"=?UTF-8?B?aGk=?= =?iso-8859-1?B?aGk=?=");
assert_eq!(r, "hihi");
}
// ---- decode: position of the encoded-word within the input ----
#[test]
fn encoded_word_at_very_start_of_input() {
let r = decode(b"=?UTF-8?B?aGVsbG8=?= trailing text");
assert_eq!(r, "hello trailing text");
}
#[test]
fn encoded_word_at_very_end_of_input() {
let r = decode(b"leading text =?UTF-8?B?aGVsbG8=?=");
assert_eq!(r, "leading text hello");
}
// An encoded-word inside double quotes is still decoded; the address part is untouched.
#[test]
fn encoded_word_in_middle_of_quoted_string() {
let r = decode(b"\"=?UTF-8?B?aGVsbG8=?=\" <addr@example.com>");
assert!(r.contains("hello"));
assert!(r.contains("<addr@example.com>"));
}
// The charset label is matched regardless of letter case.
#[test]
fn charset_case_insensitive_match() {
let r1 = decode(b"=?UTF-8?B?aGk=?=");
let r2 = decode(b"=?utf-8?B?aGk=?=");
let r3 = decode(b"=?Utf-8?B?aGk=?=");
let r4 = decode(b"=?UtF-8?B?aGk=?=");
assert_eq!(r1, r2);
assert_eq!(r2, r3);
assert_eq!(r3, r4);
}
// ---- decode: East Asian legacy charsets ----
#[test]
fn shift_jis_japanese_decode() {
let r = decode(b"=?Shift_JIS?B?g2WDWINn?=");
assert_eq!(r, "テスト");
}
#[test]
fn euc_jp_japanese_decode() {
let r = decode(b"=?EUC-JP?B?pcaluaXI?=");
assert_eq!(r, "テスト");
}
// Only checks that some output is produced; the exact text is not pinned.
#[test]
fn big5_chinese_decode() {
let r = decode(b"=?Big5?B?p0GmbA==?=");
assert!(!r.is_empty());
}
// ---- decode: Q-encoding hex escapes ----
#[test]
fn q_encoding_uppercase_hex() {
let r = decode(b"=?UTF-8?Q?=E6=97=A5=E6=9C=AC=E8=AA=9E?=");
assert_eq!(r, "日本語");
}
// Lower-case hex digits are accepted as well.
#[test]
fn q_encoding_lowercase_hex_tolerated() {
let r = decode(b"=?UTF-8?Q?=e6=97=a5?=");
assert_eq!(r, "日");
}
// Underscore and hex escape in the same word (`=21` is `!`).
#[test]
fn encoded_word_with_underscore_and_equals() {
let r = decode(b"=?UTF-8?Q?Hello_World=21?=");
assert_eq!(r, "Hello World!");
}
// ---- encode: further pass-through and round-trip checks ----
#[test]
fn encode_preserves_short_ascii() {
let r = encode("test");
assert_eq!(r, "test");
assert!(matches!(r, Cow::Borrowed(_)));
}
// Japanese text round-trips through the UTF-8 Base64 wrapping produced by `encode`.
#[test]
fn encode_decode_roundtrip_iso_2022_jp_via_utf8_wrapping() {
let original = "明日午前9時の会議";
let encoded = encode(original);
let decoded = decode(encoded.as_bytes());
assert_eq!(decoded, original);
}
// A single non-ASCII character causes the whole string, ASCII part included, to be wrapped.
#[test]
fn encode_string_with_mixed_ascii_and_unicode() {
let r = encode("Hello 世界");
assert!(r.starts_with("=?UTF-8?B?"));
let back = decode(r.as_bytes());
assert_eq!(back, "Hello 世界");
}
}