Skip to main content

eml_codec/imf/
identification.rs

1#[cfg(feature = "arbitrary")]
2use arbitrary::Arbitrary;
3use bounded_static::ToStatic;
4use nom::{
5    branch::alt,
6    bytes::complete::tag,
7    combinator::{eof, map, opt, recognize},
8    multi::many0,
9    sequence::{delimited, pair, preceded, tuple},
10    IResult,
11};
12use std::borrow::Cow;
13#[cfg(any(feature = "tracing-recover", feature = "tracing-unsupported"))]
14use tracing::warn;
15
16use crate::i18n::ContainsUtf8;
17use crate::imf::mailbox::{domain, dtext, local_part, Domain, Dtext, LocalPart};
18use crate::print::{print_seq, Formatter, Print, ToStringFromPrint};
19use crate::text::recovery::{take_quoted_encoded_or_until1, take_quoted_or_until};
20use crate::text::utf8::{is_nonascii_or, take_utf8_while1};
21use crate::text::whitespace::cfws;
22#[cfg(any(feature = "tracing-recover", feature = "tracing-unsupported"))]
23use crate::utils::bytes_to_trace_string;
24#[cfg(feature = "arbitrary")]
25use crate::{arbitrary_utils::arbitrary_string_nonempty_where, fuzz_eq::FuzzEq};
26use eml_codec_derives::instrument_input;
27
/// A parsed message identifier (`msg-id`).
///
/// NOTE: `MessageID` is not strictly RFC-compliant, printing it may use
/// obsolete or non-compliant syntax.
#[derive(Clone, ContainsUtf8, Debug, PartialEq, ToStatic, ToStringFromPrint)]
#[cfg_attr(feature = "arbitrary", derive(FuzzEq))]
pub enum MessageID<'a> {
    /// The compliant (but possibly obsolete) syntax: `left "@" right`.
    ObsLeftRight {
        /// Part found before the `@`.
        left: LocalPart<'a>,
        /// Part found after the `@`.
        right: Domain<'a>,
    },
    /// Non-compliant char sequence. It must be non-empty and made only of
    /// chars accepted by `is_invalid_msgid_text`; it is printed verbatim
    /// between `<` and `>`.
    #[cfg_attr(feature = "arbitrary", fuzz_eq(use_eq))]
    Invalid(Cow<'a, str>),
}
42impl<'a> Print for MessageID<'a> {
43    fn print(&self, fmt: &mut impl Formatter) {
44        fmt.write_bytes(b"<");
45        match &self {
46            MessageID::ObsLeftRight { left, right } => {
47                left.print(fmt);
48                fmt.write_bytes(b"@");
49                right.print(fmt);
50            }
51            MessageID::Invalid(txt) => fmt.write_bytes(txt.as_bytes()),
52        }
53        fmt.write_bytes(b">");
54    }
55}
#[cfg(feature = "arbitrary")]
impl<'a> Arbitrary<'a> for MessageID<'a> {
    /// Generates either variant of `MessageID`.
    ///
    /// An integer drawn from `0..=1` selects the variant: `0` yields an
    /// `ObsLeftRight` built from arbitrary parts, `1` yields an `Invalid`
    /// whose text is produced by `arbitrary_string_nonempty_where` with the
    /// `is_invalid_msgid_text` predicate.
    fn arbitrary(u: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result<Self> {
        let variant = u.int_in_range(0..=1)?;
        if variant == 0 {
            let left = u.arbitrary()?;
            let right = u.arbitrary()?;
            return Ok(MessageID::ObsLeftRight { left, right });
        }

        // `int_in_range(0..=1)` only yields 0 or 1, so this is the `1` case.
        let text = arbitrary_string_nonempty_where(u, is_invalid_msgid_text, 'X')?;
        Ok(MessageID::Invalid(text.into()))
    }
}
72
/// A list of message identifiers.
///
/// Must be non-empty.
///
/// NOTE(review): `nullable_msg_list` in this file builds this type from
/// `many0` and can therefore return an empty list; confirm where the
/// non-empty requirement is enforced.
pub type MessageIDList<'a> = Vec<MessageID<'a>>;
75
impl<'a> Print for MessageIDList<'a> {
    /// Prints the identifiers of the list in order by delegating to
    /// `print_seq`, which is handed `Formatter::write_fws` -- presumably the
    /// callback that emits the separator between two consecutive items
    /// (TODO confirm against `print_seq`).
    fn print(&self, fmt: &mut impl Formatter) {
        print_seq(fmt, self, Formatter::write_fws)
    }
}
81
82/// Message identifier
83///
84/// The RFC gives the following syntax:
85/// ```abnf
86///    msg-id          =   [CFWS] "<" id-left "@" id-right ">" [CFWS]
87/// ```
88///
89/// but we also handle invalid syntax found in the real-world:
90/// ```abnf
91///    our-msg-id        = our-msg-id-angle / our-msg-id-bare
92///    our-msg-id-angle  = "<" our-msg-id-bare ">"
93///    our-msg-id-bare   = id-left "@" id-right / 1*(not <>")
94/// ```
95/// The grammar above is ambiguous since "id-left @ id-right" and "1*(not <>")"
96/// intersect. To work around this problem, our parsers for our-msg-id and
97/// our-msg-id-bare assume that they consume all of their input. If this is not
98/// the case, our-msg-id-angle should be used instead (as it is properly
99/// delimited).
100#[instrument_input("tracing")]
101pub fn msg_id(input: &[u8]) -> IResult<&[u8], MessageID<'_>> {
102    alt((
103        msg_id_angle,
104        map(msg_id_bare(|i: &[u8]| eof(i)), |msg| {
105            #[cfg(feature = "tracing-recover")]
106            warn!("message-id: bare msg-id without <>");
107            msg
108        }),
109    ))(input)
110}
111pub fn msg_id_angle(input: &[u8]) -> IResult<&[u8], MessageID<'_>> {
112    preceded(
113        pair(opt(cfws), tag("<")),
114        msg_id_bare(|i: &[u8]| recognize(pair(tag(">"), opt(cfws)))(i)),
115    )(input)
116}
117pub fn msg_id_bare<F>(terminator: F) -> impl FnMut(&[u8]) -> IResult<&[u8], MessageID<'_>>
118where
119    F: for<'a> Fn(&'a [u8]) -> IResult<&'a [u8], &'a [u8]>,
120{
121    move |input: &[u8]| {
122        alt((
123            map(
124                tuple((id_left, tag("@"), id_right, &terminator)),
125                |(left, _, right, _)| MessageID::ObsLeftRight { left, right },
126            ),
127            map(
128                tuple((
129                    opt(cfws),
130                    take_utf8_while1(is_invalid_msgid_text),
131                    opt(cfws),
132                    &terminator,
133                )),
134                |(_, s, _, _)| {
135                    #[cfg(feature = "tracing-recover")]
136                    warn!("message-id: bare string instead of id-left@id-right");
137                    MessageID::Invalid(s)
138                },
139            ),
140        ))(input)
141    }
142}
143
144// This is VERY lenient
145fn is_invalid_msgid_text(c: char) -> bool {
146    is_nonascii_or(|c| c.is_ascii_graphic() && c != b'<' && c != b'>' && c != b'"')(c)
147}
148
149/// A *very* lenient parser for lists of msg_id as used by In-Reply-To and References
150///
151/// The RFC definition is:
152/// ```abnf
153///       in-reply-to    =    1*msg-id
154///   obs-in-reply-to    =    *(phrase / msg-id)
155/// ```
156/// In the obs- syntax, the phrase tokens must be ignored.
157///
158/// However, historical emails seem to contain a lot of nonsense in between
159/// msg-id, and a lot of it is not part of the "phrase" syntax. We implement a
160/// more lenient parser that skips "everything" in-between msg-ids: quoted
161/// strings, encoded words (both part of the phrase syntax), and as a last
162/// resort, any bytes until encountering something that could be the start of
163/// one of the more "structured" tokens (msg-id, encoded word, quoted string).
164///
165/// Additionally, we try to recover from broken msg-ids: after reading a '<', if
166/// we can't parse a valid msg-id, we skip to the next '>' and continue parsing.
167#[instrument_input("tracing")]
168pub fn nullable_msg_list(input: &[u8]) -> IResult<&[u8], MessageIDList<'_>> {
169    let (input, tokens) = many0(alt((
170        map(msg_id_angle, Some),
171        // recovery: recognize a broken msg-id, skipping to the next >
172        map(
173            recognize(tuple((
174                tag("<"),
175                take_quoted_or_until(|c| c == b'>'),
176                // use opt() since we might also be at end of input...
177                opt(tag(">")),
178            ))),
179            |_i| {
180                #[cfg(feature = "tracing-unsupported")]
181                warn!(input = %bytes_to_trace_string(_i),
182                      "unsupported msg-id in msg-list");
183                None
184            },
185        ),
186        // compliant CFWS in between msg-ids
187        map(cfws, |_| None),
188        // recovery: recognize junk in between msg-ids, skipping to the next <
189        map(take_quoted_encoded_or_until1(|c| c == b'<'), |_i| {
190            #[cfg(feature = "tracing-recover")]
191            warn!(input = %bytes_to_trace_string(_i),
192                  "non-compliant text between msg-ids");
193            None
194        }),
195    )))(input)?;
196
197    Ok((input, tokens.into_iter().flatten().collect()))
198}
199
/// Parses the left-hand side (before the `@`) of a message identifier.
///
/// Implements obs-id-left, which is a superset of id-left:
/// ```abnf
///     id-left     =   dot-atom-text / obs-id-left
/// obs-id-left     =   local-part
/// ```
///
/// This is a thin wrapper that delegates to `local_part`.
///
/// NOTE: this directly returns the AST corresponding to *possibly obsolete*
/// syntax; we do not attempt to "strictify" it
#[instrument_input("tracing")]
fn id_left(input: &[u8]) -> IResult<&[u8], LocalPart<'_>> {
    local_part(input)
}
212
/// Parses the right-hand side (after the `@`) of a message identifier.
///
/// Implements obs-id-right, which is a superset of id-right:
/// ```abnf
///     id-right     =   dot-atom-text / no-fold-literal / obs-id-right
/// obs-id-right     =   domain
/// ```
///
/// This is a thin wrapper that delegates to `domain`.
///
/// NOTE: this directly returns the AST corresponding to *possibly obsolete*
/// syntax; we do not attempt to "strictify" it
#[instrument_input("tracing")]
fn id_right(input: &[u8]) -> IResult<&[u8], Domain<'_>> {
    domain(input)
}
225
226#[allow(dead_code)]
227#[instrument_input("tracing")]
228fn no_fold_literal(input: &[u8]) -> IResult<&[u8], Dtext<'_>> {
229    delimited(tag("["), dtext, tag("]"))(input)
230}
231
#[cfg(test)]
mod tests {
    use super::*;
    use crate::imf::mailbox::{Domain, LocalPart, LocalPartToken};
    use crate::print::tests::print_to_vec;
    use crate::text::misc_token::Word;
    use crate::text::quoted::QuotedString;
    use crate::text::words::Atom;

    /// Checks that `msg_id` consumes all of `txt` and yields `expected`.
    #[track_caller]
    fn assert_msg_id_parsed<'a>(txt: &'a [u8], expected: MessageID<'a>) {
        assert_eq!(msg_id(txt), Ok((&b""[..], expected)));
    }

    /// Checks that `nullable_msg_list` consumes all of `txt` and that
    /// printing the resulting list gives `printed`.
    fn assert_msg_list_reprinted(txt: &[u8], printed: &[u8]) {
        let (rest, parsed) = nullable_msg_list(txt).unwrap();
        assert_eq!(rest, b"");
        let reprinted = print_to_vec(parsed);
        assert_eq!(
            String::from_utf8_lossy(&reprinted),
            String::from_utf8_lossy(printed)
        );
    }

    #[test]
    fn test_msg_id() {
        let expected = MessageID::ObsLeftRight {
            left: LocalPart(vec![
                LocalPartToken::Word(Word::Atom(Atom("5678".into()))),
                LocalPartToken::Dot,
                LocalPartToken::Word(Word::Atom(Atom("21-Nov-1997".into()))),
            ]),
            right: Domain::Atoms(vec![Atom("example".into()), Atom("com".into())]),
        };

        assert_msg_id_parsed(b"<5678.21-Nov-1997@example.com>", expected);
    }

    #[test]
    fn test_obsolete_msg_id() {
        // Whitespace around the tokens (obsolete syntax).
        let spaced = MessageID::ObsLeftRight {
            left: LocalPart(vec![
                LocalPartToken::Word(Word::Atom(Atom("foo".into()))),
                LocalPartToken::Dot,
                LocalPartToken::Word(Word::Atom(Atom("bar".into()))),
            ]),
            right: Domain::Atoms(vec![
                Atom("univ-valenciennes".into()),
                Atom("fr".into()),
            ]),
        };
        assert_msg_id_parsed(b" < foo . bar@univ-valenciennes  .fr >", spaced);

        // Quoted string as the left part.
        let quoted = MessageID::ObsLeftRight {
            left: LocalPart(vec![LocalPartToken::Word(Word::Quoted(QuotedString(
                vec![
                    "24806".into(),
                    " ".into(),
                    "Tue".into(),
                    " ".into(),
                    "Sep".into(),
                    " ".into(),
                    "19".into(),
                    " ".into(),
                    "11:05:34".into(),
                    " ".into(),
                    "1995".into(),
                ],
            )))]),
            right: Domain::Atoms(vec![Atom("bnr".into()), Atom("ca".into())]),
        };
        assert_msg_id_parsed(b"<\"24806 Tue Sep 19 11:05:34 1995\"@bnr.ca>", quoted);
    }

    #[test]
    fn test_noncompliant_msg_id() {
        // No `@` at all.
        assert_msg_id_parsed(
            b" <523C50DA-160C-4550-A44E-7E192513CF91> ",
            MessageID::Invalid("523C50DA-160C-4550-A44E-7E192513CF91".into()),
        );

        // No angle brackets and no `@`.
        assert_msg_id_parsed(b" foo ", MessageID::Invalid("foo".into()));

        // No angle brackets, but otherwise a left@right identifier.
        let bare = MessageID::ObsLeftRight {
            left: LocalPart(vec![
                LocalPartToken::Word(Word::Atom(Atom("text/plain".into()))),
                LocalPartToken::Dot,
                LocalPartToken::Word(Word::Atom(Atom(
                    "RKLqBQUAAZl1yPGCYOHKDjrj_nwwBg".into(),
                ))),
                LocalPartToken::Dot,
                LocalPartToken::Word(Word::Atom(Atom("1758617731".into()))),
            ]),
            right: Domain::Atoms(vec![Atom("alan".into()), Atom("eu".into())]),
        };
        assert_msg_id_parsed(
            b"text/plain.RKLqBQUAAZl1yPGCYOHKDjrj_nwwBg.1758617731@alan.eu",
            bare,
        );

        // Two `@`.
        assert_msg_id_parsed(
            b" <aAdGYiJBX0VZF2TI@millmess@rouba.net> ",
            MessageID::Invalid("aAdGYiJBX0VZF2TI@millmess@rouba.net".into()),
        );

        assert_msg_id_parsed(
            b"<md5:xqmIG/sV8WoSG9UzafBCGw==>",
            MessageID::Invalid("md5:xqmIG/sV8WoSG9UzafBCGw==".into()),
        );
    }

    #[test]
    fn test_comma_separated_msg_list() {
        // This is not RFC-valid syntax but was encountered in real-world emails
        let first = MessageID::ObsLeftRight {
            left: LocalPart(vec![LocalPartToken::Word(Word::Atom(Atom(
                "8d9bb189354d4804bcc2fd1d1a5398b5".into(),
            )))]),
            right: Domain::Atoms(vec![Atom("cnrs".into()), Atom("fr".into())]),
        };
        let second = MessageID::ObsLeftRight {
            left: LocalPart(vec![LocalPartToken::Word(Word::Atom(Atom(
                "ef8fac8b36834864bae895571064565c".into(),
            )))]),
            right: Domain::Atoms(vec![Atom("cnrs".into()), Atom("fr".into())]),
        };

        assert_eq!(
            nullable_msg_list(b" <8d9bb189354d4804bcc2fd1d1a5398b5@cnrs.fr>,<ef8fac8b36834864bae895571064565c@cnrs.fr>"),
            Ok((&b""[..], vec![first, second]))
        );
    }

    #[test]
    fn test_msg_list_weird() {
        assert_msg_list_reprinted(
            b"<3AB624F9.5B6C6680@example.com>; from foo@example.com on Mon, Mar 19, 2001 at 04:25:45PM +0100",
            b"<3AB624F9.5B6C6680@example.com>",
        );

        assert_msg_list_reprinted(
            b"<3AB624F9.5B6C6680@example.com> from \"Foo bar\" on Mon, Mar 19, 2001 at 04:25:45 AM",
            b"<3AB624F9.5B6C6680@example.com>",
        );
    }

    #[test]
    fn test_msg_list_recover() {
        // The second msg-id is broken (incorrect line folding). (Found in
        // URSSAF emails.) It cannot be parsed as a msg-id, so it is skipped
        // (it does not appear in the output) and parsing continues with the
        // next one.
        assert_msg_list_reprinted(
            b"<abc@def>,<foo\n\tbar@outlook.com>,<baz@outlook.com>",
            b"<abc@def> <baz@outlook.com>",
        );

        // worse offenders, not found IRL but demonstrate the behavior of our
        // recovery strategy
        assert_msg_list_reprinted(b"<abc@def>,<foo\n\tbar@outlook.com ", b"<abc@def>");

        assert_msg_list_reprinted(
            b"<abc@def>,random\"garbage=?utf-8?q?aabb?= <uuu@jjj>",
            b"<abc@def> <uuu@jjj>",
        );

        assert_msg_list_reprinted(b"<abc@def>,<randomgarbage\">\" <uuu@jjj>", b"<abc@def>");
    }
}