Skip to main content

eml_codec/text/
encoding.rs

1#[cfg(feature = "arbitrary")]
2use arbitrary::Arbitrary;
3use bounded_static::ToStatic;
4
5use base64::{engine::general_purpose, Engine as _};
6use nom::{
7    branch::alt,
8    bytes::complete::{tag, take, take_while, take_while1},
9    character::complete::one_of,
10    character::is_alphanumeric,
11    combinator::{all_consuming, map, map_parser, opt, recognize},
12    multi::{many0, many1, separated_list1},
13    sequence::{delimited, preceded, terminated, tuple},
14    IResult,
15};
16use std::borrow::Cow;
17use std::fmt;
18
19use crate::i18n::ContainsUtf8;
20use crate::print::{print_seq, Formatter, Print, ToStringFromPrint};
21use crate::text::ascii;
22use crate::text::charset::EmailCharset;
23use crate::text::utf8::take_utf8_while1;
24use crate::text::whitespace::{self, cfws, fws};
25use crate::text::words;
26#[cfg(feature = "arbitrary")]
27use crate::{
28    arbitrary_utils::{arbitrary_vec_nonempty, arbitrary_vec_where},
29    fuzz_eq::FuzzEq,
30};
31
/// Context in which an encoded word is parsed.
///
/// `Phrase` is more strict than `Comment`, which is more strict than
/// `Unstructured` ("more strict" means "accepts fewer inputs").
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Context {
    /// The token is read with the `words::is_atext` character class.
    Phrase,
    /// The token is read with the `whitespace::is_ctext` character class.
    Comment,
    /// The token is read with the `words::is_vchar` character class.
    Unstructured,
}
42
43pub fn encoded_word(ctx: Context) -> impl FnMut(&[u8]) -> IResult<&[u8], EncodedWord<'_>> {
44    move |input| delimited(opt(cfws), encoded_word_plain(ctx), opt(cfws))(input)
45}
46
47// NOTE: this is used in the comment syntax, so should not
48// recurse and call CFWS itself, for parsing efficiency reasons.
49pub fn encoded_word_plain(ctx: Context) -> impl FnMut(&[u8]) -> IResult<&[u8], EncodedWord<'_>> {
50    move |input| map(separated_list1(fws, encoded_word_token(ctx)), EncodedWord)(input)
51}
52
53pub fn encoded_word_token(
54    ctx: Context,
55) -> impl FnMut(&[u8]) -> IResult<&[u8], EncodedWordToken<'_>> {
56    move |input| {
57        // An encoded word is always a special case of an atom-like token. Which characters are
58        // allowed in this atom token depends on the context, so we first read the atom, then try to
59        // parse it fully as an encoded word.
60        map_parser(
61            // read an atom-like token
62            encoded_word_token_atom(ctx),
63            // ...which must fully represent an encoded word
64            all_consuming(alt((encoded_word_token_quoted, encoded_word_token_base64))),
65        )(input)
66    }
67}
68
69fn encoded_word_token_atom(ctx: Context) -> impl FnMut(&[u8]) -> IResult<&[u8], &[u8]> {
70    move |input| {
71        // use `recognize` as this will be re-parsed by the encoded-word
72        // combinators, and all our parsing combinators work on &[u8]s.
73        //
74        // XXX if invalid utf-8 is present, this makes `take_utf8_while1`
75        // unnecessarily allocate a string for the result that is then
76        // discarded.
77        match ctx {
78            // mirrors words::atom
79            Context::Phrase => recognize(take_utf8_while1(words::is_atext))(input),
80            // mirrors whitespace::ctext
81            Context::Comment => recognize(take_utf8_while1(whitespace::is_ctext))(input),
82            // mirrors misc_token::obs_utext_token (non-obs case)
83            Context::Unstructured => recognize(take_utf8_while1(words::is_vchar))(input),
84        }
85    }
86}
87
88pub fn encoded_word_token_quoted(input: &[u8]) -> IResult<&[u8], EncodedWordToken<'_>> {
89    let (rest, (_, charset, _, _, _, txt, _)) = tuple((
90        tag("=?"),
91        words::mime_atom_plain,
92        tag("?"),
93        one_of("Qq"),
94        tag("?"),
95        ptext,
96        tag("?="),
97    ))(input)?;
98
99    let parsed = EncodedWordToken::Quoted(QuotedWord {
100        enc: charset.0.into(),
101        chunks: txt,
102    });
103    Ok((rest, parsed))
104}
105
106pub fn encoded_word_token_base64(input: &[u8]) -> IResult<&[u8], EncodedWordToken<'_>> {
107    let (rest, (_, charset, _, _, _, txt, _)) = tuple((
108        tag("=?"),
109        words::mime_atom_plain,
110        tag("?"),
111        one_of("Bb"),
112        tag("?"),
113        btext,
114        tag("?="),
115    ))(input)?;
116
117    let parsed = EncodedWordToken::Base64(Base64Word {
118        enc: charset.0.into(),
119        content: Cow::Borrowed(txt),
120    });
121    Ok((rest, parsed))
122}
123
124/// Represents an encoded word.
125#[derive(Clone, ContainsUtf8, Debug, PartialEq, ToStatic, ToStringFromPrint)]
126#[cfg_attr(feature = "arbitrary", derive(FuzzEq))]
127#[contains_utf8(false)]
128pub struct EncodedWord<'a>(pub Vec<EncodedWordToken<'a>>); // must be non-empty
129
130impl<'a> EncodedWord<'a> {
131    /// Returns the data represented by this `EncodedWord`, encoded into UTF8
132    pub fn data(&self) -> String {
133        self.0
134            .iter()
135            .map(|tok| tok.data())
136            .collect::<Vec<_>>()
137            .join("")
138    }
139
140    /// Build an encoded word from UTF-8 chars. Uses the UTF-8 charset and
141    /// quoted encoding.
142    pub fn from_chars<I>(chars: I) -> Self
143    where
144        I: IntoIterator<Item = char>,
145    {
146        const HEADER: &[u8] = b"=?UTF-8?Q?";
147        const FOOTER: &[u8] = b"?=";
148        // specified in RFC2047
149        const MAX_LEN: usize = 75;
150
151        let mut tokens: Vec<EncodedWordToken> = vec![];
152        let mut cur_chunks: Vec<QuotedChunk> = vec![];
153        let mut cur_word_len = 0;
154        let mut char_bytes: [u8; 4] = [0; 4];
155
156        for c in chars {
157            if HEADER.len() + cur_word_len + FOOTER.len() > MAX_LEN - 3
158            /* max size minus room for the next encoded byte */
159            {
160                let mut w = QuotedWord {
161                    enc: EmailCharset::utf8(),
162                    chunks: vec![],
163                };
164                std::mem::swap(&mut w.chunks, &mut cur_chunks);
165                tokens.push(EncodedWordToken::Quoted(w));
166                cur_word_len = 0;
167            }
168
169            if c.is_ascii() && is_qchar_safe_strict(c as u8) {
170                if let Some(QuotedChunk::Safe(s)) = cur_chunks.last_mut() {
171                    let s = s.to_mut();
172                    s.push(c as u8)
173                } else {
174                    cur_chunks.push(QuotedChunk::Safe(vec![c as u8].into()));
175                }
176                cur_word_len += 1;
177            } else if c == char::from(ascii::SP) {
178                // space has a special treatment (RFC2047, 4.2, (2))
179                cur_chunks.push(QuotedChunk::Space);
180                cur_word_len += 1;
181            } else {
182                c.encode_utf8(&mut char_bytes);
183                let c_bytes = &char_bytes[0..c.len_utf8()];
184                if let Some(QuotedChunk::Encoded(e)) = cur_chunks.last_mut() {
185                    e.extend_from_slice(c_bytes)
186                } else {
187                    cur_chunks.push(QuotedChunk::Encoded(c_bytes.to_vec()))
188                }
189                // each encoded byte uses three characters (=XX)
190                cur_word_len += 3 * c.len_utf8();
191            }
192        }
193
194        tokens.push(EncodedWordToken::Quoted(QuotedWord {
195            enc: EmailCharset::utf8(),
196            chunks: cur_chunks,
197        }));
198
199        EncodedWord(tokens)
200    }
201}
202impl<'a> Print for EncodedWord<'a> {
203    fn print(&self, fmt: &mut impl Formatter) {
204        print_seq(fmt, &self.0, Formatter::write_fws)
205    }
206}
207
/// Generates an arbitrary word with a non-empty token list.
#[cfg(feature = "arbitrary")]
impl<'a> Arbitrary<'a> for EncodedWord<'a> {
    fn arbitrary(u: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result<Self> {
        let tokens = arbitrary_vec_nonempty(u)?;
        Ok(EncodedWord(tokens))
    }
}
214
215#[derive(PartialEq, Debug, Clone, ToStatic)]
216#[cfg_attr(feature = "arbitrary", derive(FuzzEq, Arbitrary))]
217pub enum EncodedWordToken<'a> {
218    Quoted(QuotedWord<'a>),
219    Base64(Base64Word<'a>),
220}
221impl<'a> EncodedWordToken<'a> {
222    pub fn data(&self) -> String {
223        match self {
224            EncodedWordToken::Quoted(v) => v.data(),
225            EncodedWordToken::Base64(v) => v.data(),
226        }
227    }
228}
229impl<'a> Print for EncodedWordToken<'a> {
230    fn print(&self, fmt: &mut impl Formatter) {
231        match self {
232            EncodedWordToken::Quoted(q) => q.print(fmt),
233            EncodedWordToken::Base64(b) => b.print(fmt),
234        }
235    }
236}
237
238#[derive(PartialEq, Clone, ToStatic)]
239#[cfg_attr(feature = "arbitrary", derive(FuzzEq))]
240pub struct Base64Word<'a> {
241    pub enc: EmailCharset,
242    // `content` must represent base64-encoded data. In particular,
243    // all bytes in `content` must satisfy `is_bchar`.
244    #[cfg_attr(feature = "arbitrary", fuzz_eq(use_eq))]
245    pub content: Cow<'a, [u8]>,
246}
247impl<'a> fmt::Debug for Base64Word<'a> {
248    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
249        fmt.debug_struct("Base64Word")
250            .field("enc", &self.enc)
251            .field("content", &String::from_utf8_lossy(&self.content))
252            .finish()
253    }
254}
255
256impl<'a> Base64Word<'a> {
257    pub fn data(&self) -> String {
258        general_purpose::STANDARD_NO_PAD
259            .decode(&self.content)
260            .map(|d| self.enc.decode(d.as_slice()).to_string())
261            .unwrap_or("".into())
262    }
263}
264
265impl<'a> Print for Base64Word<'a> {
266    fn print(&self, fmt: &mut impl Formatter) {
267        fmt.write_bytes(b"=?");
268        fmt.write_bytes(self.enc.as_bytes());
269        fmt.write_bytes(b"?B?");
270        fmt.write_bytes(&self.content);
271        fmt.write_bytes(b"?=");
272    }
273}
274
/// Generates an arbitrary word whose content only holds bytes accepted by
/// `is_bchar`.
#[cfg(feature = "arbitrary")]
impl<'a> Arbitrary<'a> for Base64Word<'a> {
    fn arbitrary(u: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result<Self> {
        // fields are evaluated in order: charset first, then content
        Ok(Base64Word {
            enc: u.arbitrary()?,
            content: Cow::Owned(arbitrary_vec_where(u, |c| is_bchar(*c))?),
        })
    }
}
286
287#[derive(PartialEq, Debug, Clone, ToStatic)]
288#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
289pub struct QuotedWord<'a> {
290    pub enc: EmailCharset,
291    pub chunks: Vec<QuotedChunk<'a>>,
292}
293
294impl<'a> QuotedWord<'a> {
295    pub fn data(&self) -> String {
296        self.chunks.iter().fold(String::new(), |mut acc, c| {
297            match c {
298                QuotedChunk::Safe(v) => {
299                    let (content, _) = encoding_rs::UTF_8.decode_without_bom_handling(v);
300                    acc.push_str(content.as_ref());
301                }
302                QuotedChunk::Space => acc.push(' '),
303                QuotedChunk::Encoded(v) => {
304                    let d = self.enc.decode(v.as_slice());
305                    acc.push_str(d.as_ref());
306                }
307            };
308            acc
309        })
310    }
311}
312
313impl<'a> Print for QuotedWord<'a> {
314    fn print(&self, fmt: &mut impl Formatter) {
315        fmt.write_bytes(b"=?");
316        fmt.write_bytes(self.enc.as_bytes());
317        fmt.write_bytes(b"?Q?");
318        print_seq(fmt, &self.chunks, |_| ());
319        fmt.write_bytes(b"?=");
320    }
321}
322
/// Two quoted words are fuzz-equal when their charsets are fuzz-equal and
/// their chunk lists are equal once normalized by `normalize_quoted_chunks`.
#[cfg(feature = "arbitrary")]
impl<'a> FuzzEq for QuotedWord<'a> {
    fn fuzz_eq(&self, other: &Self) -> bool {
        if !self.enc.fuzz_eq(&other.enc) {
            return false;
        }
        let ours = normalize_quoted_chunks(&self.chunks);
        let theirs = normalize_quoted_chunks(&other.chunks);
        ours == theirs
    }
}
330
331#[derive(PartialEq, Clone, ToStatic)]
332pub enum QuotedChunk<'a> {
333    Safe(Cow<'a, [u8]>), // must satisfy `is_safe_char2`
334    Encoded(Vec<u8>),
335    Space,
336}
337impl<'a> fmt::Debug for QuotedChunk<'a> {
338    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
339        match self {
340            QuotedChunk::Safe(b) => fmt
341                .debug_tuple("QuotedChunk::Safe")
342                .field(&String::from_utf8_lossy(b))
343                .finish(),
344            QuotedChunk::Encoded(e) => fmt.debug_tuple("QuotedChunk::Encoded").field(e).finish(),
345            QuotedChunk::Space => fmt.debug_tuple("QuotedChunk::Space").finish(),
346        }
347    }
348}
349
350impl<'a> Print for QuotedChunk<'a> {
351    fn print(&self, fmt: &mut impl Formatter) {
352        match self {
353            QuotedChunk::Safe(b) => fmt.write_bytes(b),
354            QuotedChunk::Encoded(e) => {
355                for c in e {
356                    fmt.write_bytes(format!("={:02X}", c).as_bytes());
357                }
358            }
359            QuotedChunk::Space => fmt.write_bytes(b"_"),
360        }
361    }
362}
363
/// Generates an arbitrary chunk; `Safe` chunks only hold bytes accepted by
/// `is_safe_char2`.
#[cfg(feature = "arbitrary")]
impl<'a> Arbitrary<'a> for QuotedChunk<'a> {
    fn arbitrary(u: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result<Self> {
        // pick one of the three variants
        let chunk = match u.int_in_range(0..=2)? {
            0 => QuotedChunk::Safe(Cow::Owned(arbitrary_vec_where(u, |c| {
                is_safe_char2(*c)
            })?)),
            1 => QuotedChunk::Encoded(u.arbitrary()?),
            2 => QuotedChunk::Space,
            _ => unreachable!(),
        };
        Ok(chunk)
    }
}
381
382//quoted_printable
383pub fn ptext(input: &[u8]) -> IResult<&[u8], Vec<QuotedChunk<'_>>> {
384    many0(alt((safe_char2, encoded_space, many_hex_octet)))(input)
385}
386
387fn safe_char2(input: &[u8]) -> IResult<&[u8], QuotedChunk<'_>> {
388    map(take_while1(is_safe_char2), |b| {
389        QuotedChunk::Safe(Cow::Borrowed(b))
390    })(input)
391}
392
393/// RFC2047 section 4.2
394/// 8-bit values which correspond to printable ASCII characters other
395/// than "=", "?", and "_" (underscore), MAY be represented as those
396/// characters.
397fn is_safe_char2(c: u8) -> bool {
398    words::is_vchar(c.into()) && c != ascii::UNDERSCORE && c != ascii::QUESTION && c != ascii::EQ
399}
400
401fn encoded_space(input: &[u8]) -> IResult<&[u8], QuotedChunk<'_>> {
402    map(tag("_"), |_| QuotedChunk::Space)(input)
403}
404
405fn hex_octet(input: &[u8]) -> IResult<&[u8], u8> {
406    let (rest, hbytes) = preceded(tag("="), take(2usize))(input)?;
407
408    let hstr = String::from_utf8_lossy(hbytes);
409    let parsed = u8::from_str_radix(hstr.as_ref(), 16).map_err(|_| {
410        nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Verify))
411    })?;
412
413    Ok((rest, parsed))
414}
415
416fn many_hex_octet(input: &[u8]) -> IResult<&[u8], QuotedChunk<'_>> {
417    map(many1(hex_octet), QuotedChunk::Encoded)(input)
418}
419
420//base64 (maybe use a crate)
421//TODO: this strips off padding chars (final '='s). is this ok?
422pub fn btext(input: &[u8]) -> IResult<&[u8], &[u8]> {
423    terminated(take_while(is_bchar), many0(tag("=")))(input)
424}
425
426fn is_bchar(c: u8) -> bool {
427    is_alphanumeric(c) || c == ascii::PLUS || c == ascii::SLASH
428}
429
430// Returns whether ASCII char `b` is safe to display as-is in the
431// encoded-text of an encoded-word.
432// As per RFC2047, in general this depends on the context in which
433// this encoded-word occurs. Because this function is used for
434// printing, it returns the most conservative answer, i.e. it only
435// returns `true` if the character is safe to use in any context.
436fn is_qchar_safe_strict(b: u8) -> bool {
437    // General restrictions for the Q encoding (RFC2047, 4.2, (3)),
438    // + restrictions when inside a comment (RFC2047, 5, (2)),
439    // + restrictions when inside a phrase (RFC2047, 5, (3)).
440    is_alphanumeric(b)
441        || b == ascii::EXCLAMATION
442        || b == ascii::ASTERISK
443        || b == ascii::PLUS
444        || b == ascii::MINUS
445        || b == ascii::SLASH
446}
447
/// Returns an owned copy of `chunks` in which adjacent `Safe` chunks are
/// merged into one, and likewise for adjacent `Encoded` chunks.
///
/// `Space` chunks are never merged.
#[cfg(feature = "arbitrary")]
fn normalize_quoted_chunks(chunks: &[QuotedChunk<'_>]) -> Vec<QuotedChunk<'static>> {
    use bounded_static::ToBoundedStatic;
    let mut merged: Vec<QuotedChunk<'static>> = Vec::with_capacity(chunks.len());
    for chunk in chunks {
        match (merged.last_mut(), chunk) {
            (Some(QuotedChunk::Safe(prev)), QuotedChunk::Safe(next)) => {
                prev.to_mut().extend_from_slice(next)
            }
            (Some(QuotedChunk::Encoded(prev)), QuotedChunk::Encoded(next)) => {
                prev.extend_from_slice(next)
            }
            // different kinds, a `Space`, or the very first chunk
            (_, _) => merged.push(chunk.to_static()),
        }
    }
    merged
}
461
#[cfg(test)]
mod tests {
    use super::*;
    use crate::print::tests::print_to_vec_with;

    /// Parses `raw` as an encoded word in context `ctx` and returns its
    /// decoded text. Panics if parsing fails.
    fn decode(ctx: Context, raw: &[u8]) -> String {
        encoded_word(ctx)(raw).unwrap().1.data()
    }

    // =?iso8859-1?Q?Accus=E9_de_r=E9ception_(affich=E9)?=
    #[test]
    fn test_ptext() {
        fn safe(bytes: &'static [u8]) -> QuotedChunk<'static> {
            QuotedChunk::Safe(bytes.into())
        }
        fn e_acute() -> QuotedChunk<'static> {
            QuotedChunk::Encoded(vec![0xe9])
        }

        let expected = vec![
            safe(b"Accus"),
            e_acute(),
            QuotedChunk::Space,
            safe(b"de"),
            QuotedChunk::Space,
            safe(b"r"),
            e_acute(),
            safe(b"ception"),
            QuotedChunk::Space,
            safe(b"(affich"),
            e_acute(),
            safe(b")"),
        ];
        assert_eq!(
            ptext(b"Accus=E9_de_r=E9ception_(affich=E9)"),
            Ok((&b""[..], expected))
        );
    }

    #[test]
    fn test_invalid_space() {
        // Context::Unstructured is the most lenient
        let parsed =
            encoded_word(Context::Unstructured)(b"=?iso8859-1?Q?Accus=E9 de r=E9ception?=");
        assert!(parsed.is_err());
    }

    #[test]
    fn test_decode_word() {
        let raw = b"=?iso8859-1?Q?Accus=E9_de_r=E9ception_(affich=E9)?=";

        // This is only parsable in the Unstructured context, because of the naked parenthesis
        assert_eq!(
            decode(Context::Unstructured, raw),
            "Accusé de réception (affiché)"
        );

        assert_eq!(
            decode(Context::Unstructured, b"=?iso-8859-1?Q?=805.4bn?="),
            "€5.4bn"
        );

        assert!(encoded_word(Context::Phrase)(raw).is_err());
    }

    #[test]
    fn test_decode_word_ast() {
        let (_, parsed) =
            encoded_word(Context::Phrase)(b"=?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=")
                .unwrap();
        let expected = EncodedWord(vec![EncodedWordToken::Base64(Base64Word {
            enc: EmailCharset::from(b"iso-8859-1"),
            content: b"SWYgeW91IGNhbiByZWFkIHRoaXMgeW8"[..].into(),
        })]);
        assert_eq!(parsed, expected);
    }

    // =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=
    #[test]
    fn test_decode_word_b64() {
        assert_eq!(
            decode(
                Context::Phrase,
                b"=?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?="
            ),
            "If you can read this yo"
        );
    }

    #[test]
    fn test_strange_quoted() {
        assert_eq!(
            decode(Context::Phrase, b"=?UTF-8?Q?John_Sm=C3=AEth?="),
            "John Smîth"
        );
    }

    #[test]
    fn test_multiple() {
        // white space between adjacent encoded word is not displayed
        let inputs: [&[u8]; 2] = [
            b"=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?=",
            b"=?ISO-8859-1?Q?a?=  \r\n   =?ISO-8859-1?Q?b?=",
        ];
        for raw in inputs {
            assert_eq!(decode(Context::Phrase, raw), "ab");
        }
    }

    #[test]
    fn test_encode() {
        let accuse = print_to_vec_with(|f| {
            EncodedWord::from_chars("Accusé de réception (affiché)".chars()).print(f);
        });
        assert_eq!(
            String::from_utf8_lossy(&accuse),
            "=?UTF-8?Q?Accus=C3=A9_de_r=C3=A9ception_=28affich=C3=A9=29?="
        );

        let john = print_to_vec_with(|f| {
            EncodedWord::from_chars("John Smîth".chars()).print(f);
        });
        assert_eq!(john, b"=?UTF-8?Q?John_Sm=C3=AEth?=");
    }

    #[test]
    fn test_encode_folding() {
        let folded = print_to_vec_with(|f| {
            f.begin_line_folding();
            EncodedWord::from_chars(
                "Accusé de réception (affiché) Accusé de réception (affiché)".chars(),
            )
            .print(f);
        });
        assert_eq!(
            String::from_utf8_lossy(&folded),
            "=?UTF-8?Q?Accus=C3=A9_de_r=C3=A9ception_=28affich=C3=A9=29_Accus=C3=A9_?=\r\n =?UTF-8?Q?de_r=C3=A9ception_=28affich=C3=A9=29?="
        );
    }

    #[test]
    fn test_encode_empty() {
        let empty = print_to_vec_with(|f| {
            EncodedWord::from_chars("".chars()).print(f);
        });
        assert_eq!(empty, b"=?UTF-8?Q??=");
    }
}