encoded_words/
encoded_words.rs

//! Routines for manipulating RFC2047 encoded words.
//!
//! An encoded word looks like this: `=?charset[*lang]?cte?encoded_string?=`.
4
5use regex::bytes::{Captures, NoExpand, Regex};
6use thiserror::Error;
7
8use crate::charset::Charset;
9use crate::defects::Defect;
10
11// -- Quoted Printable
12
13// regex based decoder.
14
15lazy_static::lazy_static! {
16    static ref Q_BYTE_RE_1: Regex = Regex::new(r"(_)").unwrap();
17    static ref Q_BYTE_RE_2: Regex = Regex::new(r"=([a-fA-F0-9]{2})").unwrap();
18}
19
20fn decode_q<T: AsRef<[u8]>>(encoded: T) -> Vec<u8> {
21    let one = Q_BYTE_RE_1.replace_all(encoded.as_ref(), NoExpand(b" "));
22    Q_BYTE_RE_2
23        .replace_all(one.as_ref(), |caps: &Captures| {
24            hex::decode(caps[1].as_ref()).expect("invalid regex capture")
25        })
26        .to_vec()
27}
28
/// Writes the Q encoding of a single byte to `writer`.
///
/// A space becomes `_`; ASCII letters, ASCII digits and the characters
/// `-!*+/` are written unchanged; every other byte is written as `=XX` with
/// two uppercase hex digits.
fn write_q_byte<T: std::fmt::Write>(mut writer: T, byte: u8) -> std::fmt::Result {
    if byte == b' ' {
        return writer.write_char('_');
    }

    let passes_through = byte.is_ascii_alphanumeric() || b"-!*+/".contains(&byte);
    if passes_through {
        writer.write_char(char::from(byte))
    } else {
        write!(writer, "={:02X}", byte)
    }
}
38
39fn encode_q<T: AsRef<[u8]>>(bstring: T) -> String {
40    let mut out = String::with_capacity(bstring.as_ref().len());
41
42    for byte in bstring.as_ref() {
43        write_q_byte(&mut out, *byte).expect("String writes always succeed");
44    }
45
46    out
47}
48
49fn len_q<T: AsRef<[u8]>>(bstring: T) -> usize {
50    bstring.as_ref().iter().copied().map(len_q_byte).sum()
51}
52
/// Returns how many characters a single byte occupies once Q-encoded.
///
/// A space, ASCII letters, ASCII digits and `-!*+/` take one character; any
/// other byte takes three (`=XX`).
fn len_q_byte(byte: u8) -> usize {
    let single_char =
        byte == b' ' || byte.is_ascii_alphanumeric() || b"-!*+/".contains(&byte);

    if single_char {
        1
    } else {
        3
    }
}
60
61// -- Base64
62
63fn decode_b<T: AsRef<[u8]>>(encoded: T) -> (Vec<u8>, Vec<Defect>) {
64    let mut defects = Vec::new();
65
66    let config =
67        base64::Config::new(base64::CharacterSet::Standard, true).decode_allow_trailing_bits(true);
68    // First try the good case.
69    match base64::decode_config(&encoded, config) {
70        Ok(decoded) => {
71            let pad_err = encoded.as_ref().len() % 4;
72            if pad_err > 0 {
73                defects.push(Defect::InvalidBase64Padding);
74            }
75
76            (decoded, defects)
77        }
78        Err(err) => match err {
79            base64::DecodeError::InvalidByte(_offset, byte) => {
80                defects.push(Defect::InvalidBase64Characters { byte });
81
82                // filter out invalid characters
83                let encoded: Vec<u8> = encoded
84                    .as_ref()
85                    .iter()
86                    .copied()
87                    .filter(|b| match b {
88                        0..=42 => false,
89                        43 => true,
90                        44..=46 => false,
91                        47..=57 => true,
92                        58..=64 => false,
93                        65..=90 => true,
94                        91..=96 => false,
95                        97..=122 => true,
96                        _ => false,
97                    })
98                    .collect();
99
100                if encoded.len() % 4 > 0 {
101                    defects.push(Defect::InvalidBase64Padding);
102                }
103
104                match base64::decode_config(&encoded, config) {
105                    Ok(decoded) => (decoded, defects),
106                    Err(_err) => {
107                        // giving up
108                        (encoded.to_vec(), defects)
109                    }
110                }
111            }
112            base64::DecodeError::InvalidLastSymbol(_offset, _byte) => {
113                unreachable!("config disables this error");
114            }
115            base64::DecodeError::InvalidLength => {
116                // Nothing we can do
117                defects.push(Defect::InvalidBase64Length);
118                (encoded.as_ref().to_vec(), defects)
119            }
120        },
121    }
122}
123
124fn encode_b<T: AsRef<[u8]>>(bstring: T) -> String {
125    base64::encode(&bstring)
126}
127
/// Returns the length of the padded base64 encoding of `bstring`, without
/// building the encoded string.
fn len_b<T: AsRef<[u8]>>(bstring: T) -> usize {
    let input_len = bstring.as_ref().len();
    let full_groups = input_len / 3;
    let has_partial_group = input_len % 3 != 0;

    // Every group of three input bytes, complete or not, yields four output characters.
    (full_groups + usize::from(has_partial_group)) * 4
}
137
/// The result from decoding an encoded word.
#[derive(Debug, Clone, PartialEq)]
pub struct DecodingResult {
    /// The payload after content-transfer decoding and charset decoding.
    pub decoded: String,
    /// The charset that was used to turn the decoded bytes into text.
    pub charset: Charset,
    /// The language given after `*` in the charset section; empty when none was given.
    pub lang: String,
    /// Problems that were noticed while decoding but did not prevent it.
    pub defects: Vec<Defect>,
}
146
/// Errors that prevent an encoded word from being decoded at all.
#[derive(Clone, Debug, Error, PartialEq, Eq)]
pub enum DecodingError {
    /// The input is missing one of its `?`-separated sections, or names a
    /// content transfer encoding other than `q` or `b`.
    #[error("Malformed input")]
    MalformedInput,
    /// The named charset is not known.
    // NOTE(review): nothing visible in this file constructs this variant; `decode`
    // reports unknown charsets as a defect instead. Confirm whether other modules use it.
    #[error("Unknown charset {}", charset)]
    UnknownCharset { charset: String },
}
154
155/// Decode encoded word and return (string, charset, lang, defects) tuple.
156///
157/// An RFC 2047/2243 encoded word has the form: `=?charset*lang?cte?encoded_string?=`
158///
159/// where '*lang' may be omitted but the other parts may not be.
160///
161/// This function expects exactly such a string (that is, it does not check the
162/// syntax and may raise errors if the string is not well formed), and returns
163/// the encoded_string decoded first from its Content Transfer Encoding and
164/// then from the resulting bytes into unicode using the specified charset.  If
165/// the cte-decoded string does not successfully decode using the specified
166/// character set, a defect is added to the defects list and the unknown octets
167/// are replaced by the unicode 'unknown' character \\uFDFF.
168///
169/// The specified charset and language are returned.  The default for language,
170/// which is rarely if ever encountered, is the empty string.
171pub fn decode<T: AsRef<str>>(ew: T) -> Result<DecodingResult, DecodingError> {
172    let mut split = ew.as_ref().split('?');
173    let _ = split.next().ok_or_else(|| DecodingError::MalformedInput)?;
174    let charset = split.next().ok_or_else(|| DecodingError::MalformedInput)?;
175    let cte = split.next().ok_or_else(|| DecodingError::MalformedInput)?;
176    let cte_string = split.next().ok_or_else(|| DecodingError::MalformedInput)?;
177
178    let (charset, lang) = if let Some(index) = charset.find('*') {
179        let (charset, lang) = charset.split_at(index);
180        (charset, &lang[1..])
181    } else {
182        (charset, "")
183    };
184
185    let mut defects = Vec::new();
186
187    let charset = if charset == "latin-1" {
188        // For some resason latin-1 is not repored
189        Charset::for_label(b"latin1").unwrap()
190    } else {
191        match Charset::for_label(charset.as_bytes()) {
192            Some(c) => c,
193            None => {
194                if charset != "unknown-8bit" {
195                    defects.push(Defect::InvalidCharset {
196                        charset: charset.into(),
197                    })
198                }
199                Charset::Ascii
200            }
201        }
202    };
203
204    let cte = cte.to_lowercase();
205
206    // Recover the original bytes and do CTE decoding.
207    let (bstring, has_invalid_ascii) = Charset::Ascii.encode(cte_string);
208    if has_invalid_ascii {
209        defects.push(Defect::UndecodableBytes);
210    }
211    let (bstring, new_defects) = match cte.as_str() {
212        "q" => (decode_q(bstring), Vec::new()),
213        "b" => decode_b(bstring),
214        _ => return Err(DecodingError::MalformedInput),
215    };
216    defects.extend_from_slice(&new_defects);
217
218    // Turn the CTE decoded bytes into unicode.
219    let (decoded, has_invalid_bytes) = charset.decode_without_bom_handling(&bstring);
220
221    if has_invalid_bytes {
222        defects.push(Defect::UndecodableBytes);
223    }
224
225    Ok(DecodingResult {
226        decoded: decoded.into(),
227        charset,
228        lang: lang.into(),
229        defects,
230    })
231}
232
/// Flags for types of header encodings.
///
/// Passed to `encode` to select the content transfer encoding of the
/// produced encoded word.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum EncodingFlag {
    /// Quoted printable encoding.
    QuotedPrintable,
    /// Base64 encoding.
    Base64,
    /// The shorter of `QuotedPrintable` or `Base64`, but only for headers.
    ///
    /// `encode` biases this choice toward `QuotedPrintable`: it is still
    /// picked when it is fewer than five characters longer than `Base64`.
    Shortest,
}
242
243#[derive(Debug, Copy, Clone, PartialEq, Eq)]
244pub enum Encoding {
245    QuotedPrintable,
246    Base64,
247}
248
249impl Encoding {
250    pub fn decode<T: AsRef<[u8]>>(self, ew: T) -> (Vec<u8>, Vec<Defect>) {
251        match self {
252            Encoding::QuotedPrintable => (decode_q(ew), Vec::new()),
253            Encoding::Base64 => decode_b(ew),
254        }
255    }
256
257    pub fn encode<T: AsRef<[u8]>>(self, bstring: T) -> String {
258        match self {
259            Encoding::QuotedPrintable => encode_q(bstring),
260            Encoding::Base64 => encode_b(bstring),
261        }
262    }
263    pub fn char(self) -> char {
264        match self {
265            Encoding::QuotedPrintable => 'q',
266            Encoding::Base64 => 'b',
267        }
268    }
269}
270
271/// Encode string using the CTE encoding that produces the shorter result.
272///
273/// Produces an RFC 2047/2243 encoded word of the form: `=?charset*lang?cte?encoded_string?=`
274///
275/// where '*lang' is omitted unless the 'lang' parameter is given a value.
276/// Optional argument charset (defaults to utf-8) specifies the charset to use
277/// to encode the string to binary before CTE encoding it.  Optional argument
278/// 'encoding' is the cte specifier for the encoding that should be used ('q'
279/// or 'b'); if it is None (the default) the encoding which produces the
280/// shortest encoded sequence is used, except that 'q' is preferred if it is up
281/// to five characters longer.  Optional argument 'lang' (default '') gives the
282/// RFC 2243 language string to specify in the encoded word.
283pub fn encode<T: AsRef<str>>(
284    ew: T,
285    charset: Option<Charset>,
286    encoding_flag: EncodingFlag,
287    lang: Option<&str>,
288) -> String {
289    // TODO: is using Charset the right option here? Need to handle utf-7 somehow.
290    let charset = charset.unwrap_or_else(|| Charset::Encoding(encoding_rs::UTF_8));
291    let (bstring, _) = charset.encode(ew.as_ref());
292
293    let encoding = match encoding_flag {
294        EncodingFlag::Base64 => Encoding::Base64,
295        EncodingFlag::QuotedPrintable => Encoding::QuotedPrintable,
296        EncodingFlag::Shortest => {
297            let q_len = len_q(&bstring);
298            let b_len = len_b(&bstring);
299
300            // Bias toward q. 5 is arbitrary.
301            if q_len as isize - (b_len as isize) < 5 {
302                Encoding::QuotedPrintable
303            } else {
304                Encoding::Base64
305            }
306        }
307    };
308
309    let encoded = encoding.encode(&bstring);
310    if let Some(lang) = lang {
311        format!(
312            "=?{}*{}?{}?{}?=",
313            charset.name().to_lowercase(),
314            lang,
315            encoding.char(),
316            encoded
317        )
318    } else {
319        format!(
320            "=?{}?{}?{}?=",
321            charset.name().to_lowercase(),
322            encoding.char(),
323            encoded
324        )
325    }
326}
327
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the `DecodingResult` a test expects.
    fn result(decoded: &str, charset: Charset, lang: &str, defects: Vec<Defect>) -> DecodingResult {
        DecodingResult {
            decoded: decoded.into(),
            charset,
            lang: lang.into(),
            defects,
        }
    }

    /// Builds the expected result for a us-ascii word without a language.
    fn ascii(decoded: &str, defects: Vec<Defect>) -> DecodingResult {
        result(decoded, Charset::Ascii, "", defects)
    }

    /// Asserts that `decode_b(input)` yields exactly `bytes` and `defects`.
    fn assert_decode_b(input: &[u8], bytes: &[u8], defects: Vec<Defect>) {
        assert_eq!(decode_b(input), (bytes.to_vec(), defects));
    }

    /// Encodes `text` with an explicit UTF-8 charset and no language.
    fn encode_utf8(text: &str, flag: EncodingFlag) -> String {
        encode(text, Some(encoding_rs::UTF_8.into()), flag, None)
    }

    // -- decode_q

    #[test]
    fn test_decode_q_no_encoded() {
        assert_eq!(decode_q(b"foobar"), b"foobar".to_vec());
    }

    #[test]
    fn test_decode_q_spaces() {
        assert_eq!(decode_q(b"foo=20bar=20"), b"foo bar ".to_vec());
        assert_eq!(decode_q(b"foo_bar_"), b"foo bar ".to_vec());
    }

    #[test]
    fn test_decode_q_encoded() {
        assert_eq!(decode_q(b"foo=20=20=21=2Cbar"), b"foo  !,bar".to_vec());
    }

    // -- decode_b

    #[test]
    fn test_decode_b_simple() {
        assert_decode_b(b"Zm9v", b"foo", Vec::new());
    }

    #[test]
    fn test_decode_b_missing_padding() {
        // 1 missing padding character
        assert_decode_b(b"dmk", b"vi", vec![Defect::InvalidBase64Padding]);
        // 2 missing padding characters
        assert_decode_b(b"dg", b"v", vec![Defect::InvalidBase64Padding]);
    }

    #[test]
    fn test_decode_b_invalid_character() {
        assert_decode_b(
            b"dm\x01k===",
            b"vi",
            vec![
                Defect::InvalidBase64Characters { byte: b'\x01' },
                Defect::InvalidBase64Padding,
            ],
        );
    }

    #[test]
    fn test_decode_b_invalid_character_and_bad_padding() {
        assert_decode_b(
            b"dm\x01k",
            b"vi",
            vec![
                Defect::InvalidBase64Characters { byte: b'\x01' },
                Defect::InvalidBase64Padding,
            ],
        );
    }

    #[test]
    fn test_decode_b_invalid_length() {
        assert_decode_b(b"abcde", b"abcde", vec![Defect::InvalidBase64Length]);
    }

    // -- decode

    #[test]
    fn test_decode_wrong_format_input() {
        for ew in &["=?badone?=", "=?", "", "=?utf-9?X?somevalue?="] {
            assert_eq!(decode(ew), Err(DecodingError::MalformedInput));
        }
    }

    #[test]
    fn test_decode_simple_q() {
        let decoded = decode("=?us-ascii?q?foo?=").unwrap();
        assert_eq!(decoded, ascii("foo", Vec::new()));
    }

    #[test]
    fn test_decode_simple_b() {
        let decoded = decode("=?us-ascii?b?dmk=?=").unwrap();
        assert_eq!(decoded, ascii("vi", Vec::new()));
    }

    #[test]
    fn test_decode_case_ignored_q() {
        let decoded = decode("=?us-ascii?Q?foo?=").unwrap();
        assert_eq!(decoded, ascii("foo", Vec::new()));
    }

    #[test]
    fn test_decode_case_ignored_b() {
        let decoded = decode("=?us-ascii?B?dmk=?=").unwrap();
        assert_eq!(decoded, ascii("vi", Vec::new()));
    }

    #[test]
    fn test_decode_non_trivial_q() {
        let latin1 = Charset::for_label(b"latin1").unwrap();
        let decoded = decode("=?latin-1?q?=20F=fcr=20Elise=20?=").unwrap();
        assert_eq!(decoded, result(" Für Elise ", latin1, "", Vec::new()));
    }

    #[test]
    fn test_decode_escaped_bytes_preserved_q() {
        let decoded = decode("=?us-ascii?q?=20\u{AC}foo?=").unwrap();
        // The expectation of `Defect::UndecodableBytes` is currently disabled.
        assert_eq!(decoded, ascii(" \u{AC}foo", Vec::new()));
    }

    #[test]
    fn test_decode_undecodable_bytes_ignored_with_defect_b() {
        let decoded = decode("=?us-ascii?b?dm\u{AC}k?=").unwrap();
        let defects = vec![
            Defect::InvalidBase64Characters { byte: 172 },
            Defect::InvalidBase64Padding,
        ];
        assert_eq!(decoded, ascii("vi", defects));
    }

    #[test]
    fn test_decode_invalid_bytes_ignored_with_defect_b() {
        let decoded = decode("=?us-ascii?b?dm\x01k===?=").unwrap();
        let defects = vec![
            Defect::InvalidBase64Characters { byte: 1 },
            Defect::InvalidBase64Padding,
        ];
        assert_eq!(decoded, ascii("vi", defects));
    }

    #[test]
    fn test_decode_padding_defect_b() {
        let decoded = decode("=?us-ascii?b?dmk?=").unwrap();
        assert_eq!(decoded, ascii("vi", vec![Defect::InvalidBase64Padding]));
    }

    #[test]
    fn test_decode_nonnull_lang() {
        let decoded = decode("=?us-ascii*jive?q?test?=").unwrap();
        assert_eq!(decoded, result("test", Charset::Ascii, "jive", Vec::new()));
    }

    #[test]
    fn test_decode_unknown_8bit_charset() {
        let decoded = decode("=?unknown-8bit?q?foo=ACbar?=").unwrap();
        assert_eq!(
            decoded,
            result("foo\u{ac}bar", Charset::Unknown8Bit, "", Vec::new())
        );
    }

    #[test]
    fn test_decode_unknown_charset() {
        let decoded = decode("=?foobar?q?foo=ACbar?=").unwrap();
        let defects = vec![Defect::InvalidCharset {
            charset: "foobar".into(),
        }];
        assert_eq!(decoded, ascii("foo\u{ac}bar", defects));
    }

    #[test]
    fn test_decode_nonascii_q() {
        let utf8 = Charset::for_label(b"utf-8").unwrap();
        let decoded = decode("=?utf-8?q?=C3=89ric?=").unwrap();
        assert_eq!(decoded, result("Éric", utf8, "", Vec::new()));
    }

    // -- encode_q / encode_b

    #[test]
    fn test_encode_q_all_safe() {
        assert_eq!(encode_q(b"foobar"), "foobar");
    }

    #[test]
    fn test_encode_q_spaces() {
        assert_eq!(encode_q(b"foo bar "), "foo_bar_");
    }

    #[test]
    fn test_encode_q_encodables() {
        let expected = "foo__=2C=2Cbar";
        assert_eq!(encode_q(b"foo  ,,bar"), expected);
        assert_eq!(len_q(b"foo  ,,bar"), expected.len());
    }

    #[test]
    fn test_encode_b_simple() {
        let expected = "Zm9v";
        assert_eq!(encode_b(b"foo"), expected);
        assert_eq!(len_b(b"foo"), expected.len());
    }

    #[test]
    fn test_encode_b_padding() {
        let expected = "dmk=";
        assert_eq!(encode_b(b"vi"), expected);
        assert_eq!(len_b(b"vi"), expected.len());
    }

    // -- encode

    #[test]
    fn test_encode_simple_q() {
        assert_eq!(
            encode_utf8("foo", EncodingFlag::QuotedPrintable),
            "=?utf-8?q?foo?="
        );
    }

    #[test]
    fn test_encode_simple_b() {
        assert_eq!(encode_utf8("foo", EncodingFlag::Base64), "=?utf-8?b?Zm9v?=");
    }

    #[test]
    fn test_encode_auto_q() {
        assert_eq!(encode_utf8("foo", EncodingFlag::Shortest), "=?utf-8?q?foo?=");
    }

    #[test]
    fn test_encode_auto_q_if_short_mostly_safe() {
        assert_eq!(
            encode_utf8("vi.", EncodingFlag::Shortest),
            "=?utf-8?q?vi=2E?="
        );
    }

    #[test]
    fn test_encode_auto_b_if_enough_unsafe() {
        assert_eq!(
            encode_utf8(".....", EncodingFlag::Shortest),
            "=?utf-8?b?Li4uLi4=?="
        );
    }

    #[test]
    fn test_encode_auto_b_if_long_unsafe() {
        assert_eq!(
            encode_utf8("vi.vi.vi.vi.vi.", EncodingFlag::Shortest),
            "=?utf-8?b?dmkudmkudmkudmkudmku?="
        );
    }

    #[test]
    fn test_encode_auto_q_if_mostly_safe() {
        assert_eq!(
            encode_utf8("vi vi vi.vi ", EncodingFlag::Shortest),
            "=?utf-8?q?vi_vi_vi=2Evi_?="
        );
    }

    #[test]
    fn test_encode_utf8_default() {
        let encoded = encode("foo", None, EncodingFlag::Shortest, None);
        assert_eq!(encoded, "=?utf-8?q?foo?=");
    }

    #[test]
    fn test_encode_lang() {
        let encoded = encode("foo", None, EncodingFlag::Shortest, Some("jive"));
        assert_eq!(encoded, "=?utf-8*jive?q?foo?=");
    }
}