Skip to main content

mailrs_rfc2047/
lib.rs

1#![doc = include_str!("../README.md")]
2#![deny(missing_docs)]
3#![deny(rustdoc::broken_intra_doc_links)]
4
5//! Internal layout: [`decode`] is the entry point. It scans for
6//! `=?charset?(B|Q)?text?=` tokens and replaces them with their UTF-8
7//! decoding; ASCII runs are copied unchanged. Charset → UTF-8
8//! conversion goes through `encoding_rs::Encoding::for_label`.
9
10use std::borrow::Cow;
11
12use base64::{engine::general_purpose::STANDARD as B64, Engine as _};
13
14/// Encode a UTF-8 string as an RFC 2047 encoded-word **if and only if**
15/// it contains non-ASCII bytes. Pure-ASCII inputs are returned as
16/// borrowed `Cow::Borrowed` unchanged — no allocation, no wrapping.
17///
18/// The encoded form uses Base64 (`B`) with the UTF-8 charset:
19/// `=?UTF-8?B?<base64>?=`. This is the wire-form most receivers
20/// recognize. (Q encoding would sometimes produce shorter output for
21/// mostly-ASCII strings with a few non-ASCII chars, but the size
22/// difference is small and Base64 is robust across every charset.)
23///
24/// ```
25/// use mailrs_rfc2047::encode;
26/// // ASCII passes through borrowed, no allocation.
27/// assert_eq!(encode("Hello"), "Hello");
28/// // Non-ASCII becomes a UTF-8 Base64 encoded-word.
29/// assert_eq!(encode("日本語"), "=?UTF-8?B?5pel5pys6Kqe?=");
30/// ```
31///
32/// This is the inverse of [`decode`]: feeding `encode(decode(x))` back
33/// through `decode` returns the original string (idempotent for ASCII
34/// input, identity-modulo-canonicalization for encoded input).
35pub fn encode(input: &str) -> Cow<'_, str> {
36    // Fast path: pure ASCII *and* free of any literal `=?` sequence —
37    // `=?` is the start-of-encoded-word marker RFC 2047 §2 reserves, so
38    // an ASCII input that already contains it cannot safely pass through
39    // borrowed (decoding the unchanged output would interpret the
40    // literal `=?` as an encoded-word start and corrupt the payload —
41    // found via fuzz, see CHANGELOG 1.1.2).
42    if input.is_ascii() && !input.as_bytes().windows(2).any(|w| w == b"=?") {
43        return Cow::Borrowed(input);
44    }
45    let encoded = B64.encode(input.as_bytes());
46    // Output layout: "=?UTF-8?B?" + base64 + "?=" — fixed 12 byte overhead
47    // around the base64 output.
48    let mut out = String::with_capacity(12 + encoded.len());
49    out.push_str("=?UTF-8?B?");
50    out.push_str(&encoded);
51    out.push_str("?=");
52    Cow::Owned(out)
53}
54
55/// Decode an RFC 2047 encoded header value into UTF-8.
56///
57/// If the input contains no `=?…?=` tokens, the original byte slice is
58/// returned as a `Cow::Borrowed` `&str` without allocation (provided
59/// the bytes were already valid UTF-8). Otherwise each encoded-word is
60/// decoded according to RFC 2047 §3 (Q + B encodings) and joined
61/// into the result.
62///
63/// **Whitespace between adjacent encoded-words is collapsed** per
64/// RFC 2047 §6.2: `=?utf-8?B?one?= =?utf-8?B?two?=` produces `onetwo`,
65/// not `one two`. Whitespace between an encoded-word and a regular
66/// ASCII run is preserved.
67///
68/// Charsets recognized: all WHATWG Encoding labels (UTF-8,
69/// ISO-8859-*, Windows-*, ISO-2022-JP, Shift_JIS, EUC-JP, EUC-KR,
70/// Big5, GB18030, …). Unknown charsets fall through to lossy UTF-8.
71///
72/// ```
73/// use mailrs_rfc2047::decode;
74/// assert_eq!(decode(b"Plain ASCII"), "Plain ASCII");
75/// assert_eq!(
76///     decode(b"=?UTF-8?B?VGVzdA==?="),
77///     "Test",
78/// );
79/// assert_eq!(
80///     decode(b"=?UTF-8?Q?Hello=20World?="),
81///     "Hello World",
82/// );
83/// ```
84pub fn decode(input: &[u8]) -> Cow<'_, str> {
85    // Fast-path: no encoded-word tokens. Return borrowed UTF-8 (or
86    // lossy if the input isn't valid UTF-8).
87    if !contains_encoded_word(input) {
88        return match std::str::from_utf8(input) {
89            Ok(s) => Cow::Borrowed(s),
90            Err(_) => Cow::Owned(String::from_utf8_lossy(input).into_owned()),
91        };
92    }
93
94    let mut out = String::with_capacity(input.len());
95    let mut cursor = 0usize;
96    let mut last_was_encoded = false;
97    let mut pending_ws_start: Option<usize> = None;
98
99    while cursor < input.len() {
100        match find_encoded_word_start(input, cursor) {
101            Some(start) => {
102                // Anything between `cursor` and `start` is raw text.
103                if start > cursor {
104                    let raw = &input[cursor..start];
105                    // RFC 2047 §6.2: drop whitespace between two
106                    // adjacent encoded-words. Detect by checking
107                    // whether the raw run is whitespace-only AND the
108                    // previous token was encoded.
109                    if last_was_encoded && raw.iter().all(|&b| matches!(b, b' ' | b'\t')) {
110                        pending_ws_start = Some(start); // skip this run
111                    } else {
112                        // emit pending whitespace if any (was held back
113                        // because we thought it might be between two
114                        // encoded words, but turns out it's followed
115                        // by regular text)
116                        if let Some(ws_start) = pending_ws_start {
117                            // ws_start..cursor: nothing to do, the ws
118                            // we skipped was actually mid-run; we
119                            // already drained that segment.
120                            let _ = ws_start;
121                            pending_ws_start = None;
122                        }
123                        push_lossy(&mut out, raw);
124                    }
125                }
126                match find_encoded_word_end(input, start) {
127                    Some((charset, encoding, text, end)) => {
128                        decode_encoded_word(&mut out, charset, encoding, text);
129                        cursor = end;
130                        last_was_encoded = true;
131                        pending_ws_start = None;
132                    }
133                    None => {
134                        // Malformed `=?` start without a matching
135                        // `?=`. Emit the `=?` literally and continue.
136                        out.push('=');
137                        out.push('?');
138                        cursor = start + 2;
139                        last_was_encoded = false;
140                    }
141                }
142            }
143            None => {
144                let raw = &input[cursor..];
145                push_lossy(&mut out, raw);
146                break;
147            }
148        }
149    }
150
151    Cow::Owned(out)
152}
153
/// Report whether the two-byte lead-in `=?` occurs anywhere in `input`.
///
/// This only looks for the lead-in, not for a complete encoded-word.
/// [`decode`] uses it to decide whether its fast path applies.
fn contains_encoded_word(input: &[u8]) -> bool {
    input.windows(2).any(|pair| pair == b"=?")
}
166
/// Find the first `=?` at or after byte offset `from`.
///
/// Returns the absolute offset of the `=` byte, or `None` when no
/// lead-in remains (including when `from` is at or past the end of
/// `input`).
fn find_encoded_word_start(input: &[u8], from: usize) -> Option<usize> {
    input
        .get(from..)?
        .windows(2)
        .position(|pair| pair == b"=?")
        .map(|offset| from + offset)
}
179
/// Split the encoded-word whose `=?` lead-in begins at `start` into its
/// parts.
///
/// On success returns `(charset, encoding, text, end)`:
///
/// - `charset`: the non-empty bytes between `=?` and the next `?`,
/// - `encoding`: the single byte after that `?` (`B`, `b`, `Q` or `q`),
/// - `text`: the bytes between the following `?` and the closing `?=`,
/// - `end`: the offset of the first byte after the closing `?=`.
///
/// Returns `None` when the token is malformed: empty charset, an
/// encoding byte other than B/Q, a missing `?` separator, or no closing
/// `?=` before the end of `input`.
fn find_encoded_word_end(input: &[u8], start: usize) -> Option<(&[u8], u8, &[u8], usize)> {
    // Charset: everything after the lead-in up to the first `?`.
    let charset_start = start + 2;
    let after_lead_in = input.get(charset_start..)?;
    let charset_len = after_lead_in.iter().position(|&b| b == b'?')?;
    if charset_len == 0 {
        return None;
    }
    let charset = &after_lead_in[..charset_len];

    // Exactly one encoding byte, then a mandatory `?`.
    let first_sep = charset_start + charset_len;
    let encoding = *input.get(first_sep + 1)?;
    if !matches!(encoding, b'B' | b'b' | b'Q' | b'q') {
        return None;
    }
    if input.get(first_sep + 2) != Some(&b'?') {
        return None;
    }

    // Payload: runs up to the first `?=` after the second separator.
    let text_start = first_sep + 3;
    let body = &input[text_start..];
    let text_len = body.windows(2).position(|pair| pair == b"?=")?;
    Some((
        charset,
        encoding,
        &body[..text_len],
        text_start + text_len + 2,
    ))
}
220
221/// Decode one encoded-word's text into `out`. Lossy fallback if the
222/// charset is unknown or decode fails.
223fn decode_encoded_word(out: &mut String, charset: &[u8], encoding: u8, text: &[u8]) {
224    let raw_bytes = match encoding {
225        b'B' | b'b' => match B64.decode(text) {
226            Ok(b) => b,
227            Err(_) => {
228                // Malformed base64 — copy raw text lossy.
229                push_lossy(out, text);
230                return;
231            }
232        },
233        b'Q' | b'q' => decode_q(text),
234        _ => return,
235    };
236    convert_to_utf8(out, charset, &raw_bytes);
237}
238
239/// Decode a Q (quoted-printable-style) encoded word body per
240/// RFC 2047 §4.2:
241///   - `_` → space
242///   - `=XX` → byte 0xXX (hex)
243///   - everything else: literal byte
244fn decode_q(text: &[u8]) -> Vec<u8> {
245    let mut out = Vec::with_capacity(text.len());
246    let mut i = 0;
247    while i < text.len() {
248        match text[i] {
249            b'_' => {
250                out.push(b' ');
251                i += 1;
252            }
253            b'=' if i + 2 < text.len() => {
254                let hi = hex_nibble(text[i + 1]);
255                let lo = hex_nibble(text[i + 2]);
256                match (hi, lo) {
257                    (Some(h), Some(l)) => {
258                        out.push((h << 4) | l);
259                        i += 3;
260                    }
261                    _ => {
262                        // Malformed `=XY` — emit literal.
263                        out.push(b'=');
264                        i += 1;
265                    }
266                }
267            }
268            _ => {
269                out.push(text[i]);
270                i += 1;
271            }
272        }
273    }
274    out
275}
276
/// Map one ASCII hex digit (`0-9`, `A-F`, `a-f`) to its value 0..=15.
///
/// Returns `None` for every other byte.
#[inline]
fn hex_nibble(b: u8) -> Option<u8> {
    // `to_digit(16)` yields at most 15, so narrowing to `u8` is lossless.
    char::from(b).to_digit(16).map(|digit| digit as u8)
}
286
287/// Convert `bytes` from `charset` to UTF-8, appending to `out`.
288fn convert_to_utf8(out: &mut String, charset: &[u8], bytes: &[u8]) {
289    let encoding = encoding_rs::Encoding::for_label(charset);
290    let encoding = encoding.unwrap_or(encoding_rs::UTF_8);
291    let (cow, _, _) = encoding.decode(bytes);
292    out.push_str(&cow);
293}
294
/// Append `bytes` to `out`, interpreting them as UTF-8.
///
/// Valid input is appended unchanged; each invalid sequence is replaced
/// with U+FFFD.
fn push_lossy(out: &mut String, bytes: &[u8]) {
    // `from_utf8_lossy` borrows when `bytes` is already valid UTF-8, so
    // the common case does not allocate an intermediate string.
    out.push_str(&String::from_utf8_lossy(bytes));
}
302
/// Unit tests for [`encode`] and [`decode`]: pass-through and borrowing
/// behaviour, B and Q decoding, legacy charsets, whitespace collapsing
/// between encoded-words, and tolerance of malformed input.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn encode_ascii_is_borrowed() {
        let r = encode("Hello World");
        assert_eq!(r, "Hello World");
        assert!(matches!(r, Cow::Borrowed(_)));
    }

    #[test]
    fn encode_japanese() {
        let r = encode("日本語");
        assert_eq!(r, "=?UTF-8?B?5pel5pys6Kqe?=");
    }

    #[test]
    fn encode_roundtrip_via_decode() {
        let original = "café — 日本語 — émoji 🦀";
        let encoded = encode(original);
        // Decode it back; should match original.
        let decoded = decode(encoded.as_bytes());
        assert_eq!(decoded, original);
    }

    #[test]
    fn encode_empty_string() {
        let r = encode("");
        assert_eq!(r, "");
        assert!(matches!(r, Cow::Borrowed(_)));
    }

    #[test]
    fn encode_pure_emoji() {
        let r = encode("🦀🚀");
        // It will be a UTF-8 Base64 encoded-word.
        assert!(r.starts_with("=?UTF-8?B?"));
        assert!(r.ends_with("?="));
        // And it decodes back identically.
        let decoded = decode(r.as_bytes());
        assert_eq!(decoded, "🦀🚀");
    }

    #[test]
    fn encode_ascii_with_literal_lead_in_is_wrapped() {
        // ASCII containing `=?` must not take the borrowed fast path:
        // the literal lead-in would be misread on decode.
        let original = "a =? b";
        let r = encode(original);
        assert!(matches!(r, Cow::Owned(_)));
        assert!(r.starts_with("=?UTF-8?B?"));
        assert!(r.ends_with("?="));
        assert_eq!(decode(r.as_bytes()), original);
    }

    #[test]
    fn plain_ascii_is_borrowed() {
        let r = decode(b"hello world");
        assert_eq!(r, "hello world");
        assert!(matches!(r, Cow::Borrowed(_)));
    }

    #[test]
    fn utf8_no_encoding_returns_borrowed() {
        let r = decode("héllo".as_bytes());
        assert_eq!(r, "héllo");
        assert!(matches!(r, Cow::Borrowed(_)));
    }

    #[test]
    fn base64_utf8() {
        let r = decode(b"=?UTF-8?B?VGVzdA==?=");
        assert_eq!(r, "Test");
    }

    #[test]
    fn quoted_printable_utf8() {
        let r = decode(b"=?UTF-8?Q?Hello=20World?=");
        assert_eq!(r, "Hello World");
    }

    #[test]
    fn q_underscore_is_space() {
        let r = decode(b"=?UTF-8?Q?Hello_World?=");
        assert_eq!(r, "Hello World");
    }

    #[test]
    fn q_lowercase_encoding_marker() {
        let r = decode(b"=?utf-8?q?ohai?=");
        assert_eq!(r, "ohai");
    }

    #[test]
    fn b_lowercase_encoding_marker() {
        let r = decode(b"=?utf-8?b?dGVzdA==?=");
        assert_eq!(r, "test");
    }

    #[test]
    fn iso_8859_1() {
        // "café" encoded as ISO-8859-1 then base64'd:
        //   c=0x63 a=0x61 f=0x66 é=0xE9 → Y2Fm6Q==
        let r = decode(b"=?iso-8859-1?B?Y2Fm6Q==?=");
        assert_eq!(r, "café");
    }

    #[test]
    fn iso_2022_jp_japanese() {
        // "こんにちは" in ISO-2022-JP via B encoding.
        // The actual bytes: ESC $ B then JIS-encoded chars then ESC ( B.
        let r = decode(b"=?ISO-2022-JP?B?GyRCJDMkcyRLJEEkTxsoQg==?=");
        assert_eq!(r, "こんにちは");
    }

    #[test]
    fn mixed_ascii_and_encoded() {
        let r = decode(b"Prefix =?UTF-8?B?VGVzdA==?= Suffix");
        assert_eq!(r, "Prefix Test Suffix");
    }

    #[test]
    fn adjacent_encoded_words_collapse_whitespace() {
        // RFC 2047 §6.2: whitespace between two encoded-words is dropped.
        let r = decode(b"=?UTF-8?B?aGVsbG8=?= =?UTF-8?B?d29ybGQ=?=");
        assert_eq!(r, "helloworld");
    }

    #[test]
    fn whitespace_preserved_around_ascii_run() {
        let r = decode(b"=?UTF-8?B?aGVsbG8=?= mid =?UTF-8?B?d29ybGQ=?=");
        assert_eq!(r, "hello mid world");
    }

    #[test]
    fn malformed_no_closing_returns_literal_lead_in() {
        let r = decode(b"=?UTF-8?B?VGVzdA");
        // `=?` is treated as literal and the rest follows unchanged.
        assert_eq!(r, "=?UTF-8?B?VGVzdA");
    }

    #[test]
    fn malformed_empty_charset_kept_literal() {
        let r = decode(b"=??B?VGVzdA==?=");
        // Cannot resolve empty charset; emit as literal lead-in.
        assert!(r.starts_with("=?"));
    }

    #[test]
    fn malformed_unknown_encoding_kept_literal() {
        let r = decode(b"=?UTF-8?X?garbage?=");
        // X is not B or Q — treat the `=?` as literal.
        assert!(r.starts_with("=?"));
    }

    #[test]
    fn empty_input_returns_empty() {
        assert_eq!(decode(b""), "");
    }

    #[test]
    fn invalid_utf8_in_unencoded_returns_lossy() {
        let r = decode(&[0xFF, 0xFE, b'h', b'i']);
        // Lossy substitution adds replacement chars; "hi" survives.
        assert!(r.contains("hi"));
    }

    #[test]
    fn q_encoding_malformed_hex() {
        let r = decode(b"=?UTF-8?Q?abc=ZZdef?=");
        // Malformed =ZZ is emitted as literal '=' then continues.
        assert!(r.contains("abc"));
        assert!(r.contains("def"));
    }

    #[test]
    fn unknown_charset_falls_through_to_utf8() {
        let r = decode(b"=?x-fake-charset?B?aGVsbG8=?=");
        // Unknown charset: encoding_rs::for_label returns None;
        // we fall back to UTF-8 decode of the raw bytes.
        assert_eq!(r, "hello");
    }

    // ===== additional edge cases =====

    #[test]
    fn q_encoding_with_latin1_chars() {
        // "café" with Latin-1: c=0x63 a=0x61 f=0x66 é=0xE9.
        // Q-encoded: caf=E9 (the é becomes =E9)
        let r = decode(b"=?iso-8859-1?Q?caf=E9?=");
        assert_eq!(r, "café");
    }

    #[test]
    fn empty_encoded_word_body() {
        // =?UTF-8?B??= — empty text. Base64 decodes empty to empty.
        let r = decode(b"=?UTF-8?B??=");
        assert_eq!(r, "");
    }

    #[test]
    fn adjacent_words_different_charsets_still_collapse() {
        // Two encoded-words with different charsets are still
        // "adjacent" per RFC 2047 §6.2, so the whitespace between them
        // is dropped exactly as for matching charsets.
        let r = decode(b"=?UTF-8?B?aGk=?= =?iso-8859-1?B?aGk=?=");
        // Both decode to "hi", whitespace dropped between
        assert_eq!(r, "hihi");
    }

    #[test]
    fn encoded_word_at_very_start_of_input() {
        let r = decode(b"=?UTF-8?B?aGVsbG8=?= trailing text");
        assert_eq!(r, "hello trailing text");
    }

    #[test]
    fn encoded_word_at_very_end_of_input() {
        let r = decode(b"leading text =?UTF-8?B?aGVsbG8=?=");
        assert_eq!(r, "leading text hello");
    }

    #[test]
    fn encoded_word_in_middle_of_quoted_string() {
        // Real-world senders embed =?...?= inside what looks like a
        // quoted display-name. We decode the encoded-word regardless
        // of context.
        let r = decode(b"\"=?UTF-8?B?aGVsbG8=?=\" <addr@example.com>");
        // The display name decodes; the address is untouched.
        assert!(r.contains("hello"));
        assert!(r.contains("<addr@example.com>"));
    }

    #[test]
    fn charset_case_insensitive_match() {
        // encoding_rs::for_label is case-insensitive
        let r1 = decode(b"=?UTF-8?B?aGk=?=");
        let r2 = decode(b"=?utf-8?B?aGk=?=");
        let r3 = decode(b"=?Utf-8?B?aGk=?=");
        let r4 = decode(b"=?UtF-8?B?aGk=?=");
        assert_eq!(r1, r2);
        assert_eq!(r2, r3);
        assert_eq!(r3, r4);
    }

    #[test]
    fn shift_jis_japanese_decode() {
        // "テスト" (Test) in Shift_JIS via Base64.
        // Shift_JIS bytes for テスト = 83 65 83 58 83 67
        let r = decode(b"=?Shift_JIS?B?g2WDWINn?=");
        assert_eq!(r, "テスト");
    }

    #[test]
    fn euc_jp_japanese_decode() {
        // "テスト" in EUC-JP via Base64.
        // EUC-JP bytes for テスト: A5 C6 A5 B9 A5 C8
        let r = decode(b"=?EUC-JP?B?pcaluaXI?=");
        assert_eq!(r, "テスト");
    }

    #[test]
    fn big5_chinese_decode() {
        // "你好" (Hello) in Big5 via Base64.
        // Big5 bytes for 你好: A7 41 A6 6E → Base64 "p0Gmbg==".
        // (The earlier fixture "p0GmbA==" decoded to A7 41 A6 6C, which
        // is not 好, so it could only be checked for non-emptiness.)
        let r = decode(b"=?Big5?B?p0Gmbg==?=");
        assert_eq!(r, "你好");
    }

    #[test]
    fn q_encoding_uppercase_hex() {
        let r = decode(b"=?UTF-8?Q?=E6=97=A5=E6=9C=AC=E8=AA=9E?=");
        // Hex E6 97 A5 E6 9C AC E8 AA 9E = "日本語" in UTF-8
        assert_eq!(r, "日本語");
    }

    #[test]
    fn q_encoding_lowercase_hex_tolerated() {
        // RFC 2047 §4.2 says hex chars are uppercase; some senders
        // ship lowercase. Be lenient on decode.
        let r = decode(b"=?UTF-8?Q?=e6=97=a5?=");
        // Just first 3 hex bytes E6 97 A5 = "日" (Japanese kanji for sun/day)
        assert_eq!(r, "日");
    }

    #[test]
    fn encoded_word_with_underscore_and_equals() {
        // "Hello World!" in Q: H, e, l, l, o, _, W, o, r, l, d, =21
        // _ becomes space, =21 = '!'
        let r = decode(b"=?UTF-8?Q?Hello_World=21?=");
        assert_eq!(r, "Hello World!");
    }

    // ===== encode tests =====

    #[test]
    fn encode_preserves_short_ascii() {
        // Short ASCII strings borrow without allocation.
        let r = encode("test");
        assert_eq!(r, "test");
        assert!(matches!(r, Cow::Borrowed(_)));
    }

    #[test]
    fn encode_decode_roundtrip_iso_2022_jp_via_utf8_wrapping() {
        // We encode as UTF-8 Base64 regardless of input. So Japanese
        // input encoded by us decodes back to original.
        let original = "明日午前9時の会議";
        let encoded = encode(original);
        let decoded = decode(encoded.as_bytes());
        assert_eq!(decoded, original);
    }

    #[test]
    fn encode_string_with_mixed_ascii_and_unicode() {
        // Any non-ASCII char triggers full encoding (not partial).
        let r = encode("Hello 世界");
        assert!(r.starts_with("=?UTF-8?B?"));
        let back = decode(r.as_bytes());
        assert_eq!(back, "Hello 世界");
    }
}
615}