// mail_parser/parsers/message.rs

/*
 * SPDX-FileCopyrightText: 2020 Stalwart Labs LLC <hello@stalw.art>
 *
 * SPDX-License-Identifier: Apache-2.0 OR MIT
 */
6
7use std::borrow::Cow;
8
9use crate::{
10    decoders::{charsets::map::charset_decoder, DecodeFnc},
11    ContentType, Encoding, GetHeader, HeaderName, HeaderValue, Message, MessageParser, MessagePart,
12    MessagePartId, PartType,
13};
14
15use super::MessageStream;
16
/// Recursion budget handed to `parse_` by the public entry points.
///
/// Each transfer-encoded nested message that is parsed recursively consumes
/// one unit; once the budget reaches zero such parts are kept as binary data.
const MAX_NESTED_ENCODED: usize = 3;
18
/// Coarse classification of a part's `Content-Type`, used by the parser to
/// decide how a part is decoded and where it is listed in the message.
#[derive(Debug, Default, PartialEq)]
enum MimeType {
    /// `multipart/mixed`.
    MultipartMixed,
    /// `multipart/alternative`.
    MultipartAlternative,
    /// `multipart/related`.
    MultipartRelated,
    /// `multipart/digest`; children without a `Content-Type` are treated as
    /// messages.
    MultipartDigest,
    /// `text/plain`, also the fallback for parts without a `Content-Type`.
    TextPlain,
    /// `text/html`.
    TextHtml,
    /// Any other `text/*` subtype; also used by the parser as a recovery type.
    TextOther,
    /// `image/*`, `audio/*` or `video/*`.
    Inline,
    /// `message/rfc822` or `message/global`. This is the default because the
    /// top-level parser state represents a message.
    #[default]
    Message,
    /// Everything else, including unrecognised `multipart/*` subtypes.
    Other,
}
33
34#[inline(always)]
35fn mime_type(
36    content_type: Option<&ContentType<'_>>,
37    parent_content_type: &MimeType,
38) -> (bool, bool, bool, MimeType) {
39    if let Some(content_type) = content_type {
40        match content_type.ctype() {
41            "multipart" => (
42                true,
43                false,
44                false,
45                match content_type.subtype() {
46                    Some("mixed") => MimeType::MultipartMixed,
47                    Some("alternative") => MimeType::MultipartAlternative,
48                    Some("related") => MimeType::MultipartRelated,
49                    Some("digest") => MimeType::MultipartDigest,
50                    _ => MimeType::Other,
51                },
52            ),
53            "text" => match content_type.subtype() {
54                Some("plain") => (false, true, true, MimeType::TextPlain),
55                Some("html") => (false, true, true, MimeType::TextHtml),
56                _ => (false, false, true, MimeType::TextOther),
57            },
58            "image" | "audio" | "video" => (false, true, false, MimeType::Inline),
59            "message" if [Some("rfc822"), Some("global")].contains(&content_type.subtype()) => {
60                (false, false, false, MimeType::Message)
61            }
62            _ => (false, false, false, MimeType::Other),
63        }
64    } else if let MimeType::MultipartDigest = parent_content_type {
65        (false, false, false, MimeType::Message)
66    } else {
67        (false, true, true, MimeType::TextPlain)
68    }
69}
70
71#[derive(Default, Debug)]
72struct MessageParserState {
73    mime_type: MimeType,
74    mime_boundary: Option<Vec<u8>>,
75    in_alternative: bool,
76    parts: usize,
77    html_parts: usize,
78    text_parts: usize,
79    need_html_body: bool,
80    need_text_body: bool,
81    part_id: MessagePartId,
82    sub_part_ids: Vec<MessagePartId>,
83    offset_header: usize,
84    offset_body: usize,
85    offset_end: usize,
86}
87
88impl MessageParserState {
89    fn new() -> MessageParserState {
90        MessageParserState {
91            mime_type: MimeType::Message,
92            mime_boundary: None,
93            in_alternative: false,
94            parts: 0,
95            html_parts: 0,
96            text_parts: 0,
97            need_text_body: true,
98            need_html_body: true,
99            ..Default::default()
100        }
101    }
102}
103
104impl MessageParser {
105    /// Parses a byte slice containing the RFC5322 raw message and returns a
106    /// `Message` struct.
107    ///
108    /// This function never panics, a best-effort is made to parse the message and
109    /// if no headers are found None is returned.
110    ///
111    pub fn parse<'x>(&self, raw_message: &'x (impl AsRef<[u8]> + ?Sized)) -> Option<Message<'x>> {
112        self.parse_(raw_message.as_ref(), MAX_NESTED_ENCODED, false)
113    }
114
115    /// Parses a byte slice containing the RFC5322 raw message and returns a
116    /// `Message` struct containing only the headers.
117    pub fn parse_headers<'x>(
118        &self,
119        raw_message: &'x (impl AsRef<[u8]> + ?Sized),
120    ) -> Option<Message<'x>> {
121        self.parse_(raw_message.as_ref(), MAX_NESTED_ENCODED, true)
122    }
123
124    fn parse_<'x>(
125        &self,
126        raw_message: &'x [u8],
127        depth: usize,
128        skip_body: bool,
129    ) -> Option<Message<'x>> {
130        let mut stream = MessageStream::new(raw_message);
131
132        let mut message = Message::new();
133
134        let mut state = MessageParserState::new();
135        let mut state_stack = Vec::with_capacity(4);
136
137        let mut part_headers = Vec::new();
138
139        'outer: loop {
140            // Parse headers
141            state.offset_header = stream.offset();
142            if !stream.parse_headers(self, &mut part_headers) {
143                break;
144            }
145            state.offset_body = stream.offset();
146            if skip_body {
147                break;
148            }
149
150            state.parts += 1;
151            state.sub_part_ids.push(message.parts.len() as u32);
152
153            let content_type = part_headers
154                .header_value(&HeaderName::ContentType)
155                .and_then(|c| c.as_content_type());
156
157            let (is_multipart, mut is_inline, mut is_text, mut mime_type) =
158                mime_type(content_type, &state.mime_type);
159
160            if is_multipart {
161                if let Some(mime_boundary) = content_type.and_then(|f| f.attribute("boundary")) {
162                    if stream.seek_next_part(mime_boundary.as_bytes()) {
163                        let part_id = message.parts.len();
164                        let new_state = MessageParserState {
165                            in_alternative: state.in_alternative
166                                || mime_type == MimeType::MultipartAlternative,
167                            mime_type,
168                            mime_boundary: mime_boundary.as_bytes().to_vec().into(),
169                            html_parts: message.html_body.len(),
170                            text_parts: message.text_body.len(),
171                            need_html_body: state.need_html_body,
172                            need_text_body: state.need_text_body,
173                            part_id: part_id as u32,
174                            ..Default::default()
175                        };
176                        //add_missing_type(&mut part_header, "text".into(), "plain".into());
177                        message.parts.push(MessagePart {
178                            headers: std::mem::take(&mut part_headers),
179                            offset_header: state.offset_header as u32,
180                            offset_body: state.offset_body as u32,
181                            offset_end: 0,
182                            is_encoding_problem: false,
183                            encoding: Encoding::None,
184                            body: PartType::default(),
185                        });
186                        state_stack.push((state, None));
187                        state = new_state;
188                        stream.skip_crlf();
189                        continue;
190                    } else {
191                        mime_type = MimeType::TextOther;
192                        is_text = true;
193                    }
194                }
195            }
196
197            let (mut encoding, decode_fnc): (Encoding, DecodeFnc<'_>) = match part_headers
198                .header_value(&HeaderName::ContentTransferEncoding)
199            {
200                Some(HeaderValue::Text(encoding)) if encoding.eq_ignore_ascii_case("base64") => {
201                    (Encoding::Base64, MessageStream::decode_base64_mime)
202                }
203                Some(HeaderValue::Text(encoding))
204                    if encoding.eq_ignore_ascii_case("quoted-printable") =>
205                {
206                    (
207                        Encoding::QuotedPrintable,
208                        MessageStream::decode_quoted_printable_mime,
209                    )
210                }
211                _ => (Encoding::None, MessageStream::mime_part),
212            };
213
214            if mime_type == MimeType::Message && encoding == Encoding::None {
215                let new_state = MessageParserState {
216                    mime_type: MimeType::Message,
217                    mime_boundary: state.mime_boundary.take(),
218                    need_html_body: true,
219                    need_text_body: true,
220                    part_id: message.parts.len() as u32,
221                    ..Default::default()
222                };
223                message.attachments.push(message.parts.len() as u32);
224                message.parts.push(MessagePart {
225                    headers: std::mem::take(&mut part_headers),
226                    encoding,
227                    is_encoding_problem: false,
228                    offset_header: state.offset_header as u32,
229                    offset_body: state.offset_body as u32,
230                    offset_end: 0,
231                    body: PartType::default(), // Temp value, will be replaced later.
232                });
233                state_stack.push((state, message.into()));
234                message = Message::new();
235                state = new_state;
236                continue;
237            }
238
239            let (offset_end, mut bytes) = decode_fnc(
240                &mut stream,
241                state.mime_boundary.as_deref().unwrap_or(&b""[..]),
242            );
243
244            // Attempt to recover contents of an invalid message
245            let mut is_encoding_problem = offset_end == usize::MAX;
246            if is_encoding_problem {
247                encoding = Encoding::None;
248                if mime_type != MimeType::TextPlain {
249                    mime_type = MimeType::TextOther;
250                }
251                is_inline = false;
252                is_text = true;
253
254                let (offset_end, boundary_found) =
255                    stream.seek_part_end(state.mime_boundary.as_deref());
256                state.offset_end = offset_end;
257                bytes = stream.data[state.offset_body..state.offset_end].into();
258
259                if !boundary_found {
260                    state.mime_boundary = None;
261                }
262            } else {
263                state.offset_end = offset_end;
264            }
265
266            let body_part = if mime_type != MimeType::Message {
267                let is_inline = is_inline
268                    && part_headers
269                        .header_value(&HeaderName::ContentDisposition)
270                        .is_none_or(|d| !d.as_content_type().is_some_and(|ct| ct.is_attachment()))
271                    && (state.parts == 1
272                        || state.mime_type != MimeType::MultipartRelated
273                            && (mime_type == MimeType::Inline
274                                || content_type.is_none_or(|c| !c.has_attribute("name"))));
275
276                // if message consists of single text/plain part, classify as text regardless
277                // of encoding issues: see malformed/018.eml
278                let is_inline = is_inline
279                    || state.parts == 1
280                        && state.mime_type == MimeType::Message
281                        && mime_type == MimeType::TextPlain
282                        && is_encoding_problem;
283
284                let (add_to_html, add_to_text) =
285                    if let MimeType::MultipartAlternative = state.mime_type {
286                        match mime_type {
287                            MimeType::TextHtml => (true, false),
288                            MimeType::TextPlain => (false, true),
289                            _ => (false, false),
290                        }
291                    } else if is_inline {
292                        if state.in_alternative && (state.need_text_body || state.need_html_body) {
293                            match mime_type {
294                                MimeType::TextHtml => {
295                                    state.need_text_body = false;
296                                }
297                                MimeType::TextPlain => {
298                                    state.need_html_body = false;
299                                }
300                                _ => (),
301                            }
302                        }
303                        (state.need_html_body, state.need_text_body)
304                    } else {
305                        (false, false)
306                    };
307
308                if add_to_html {
309                    message.html_body.push(message.parts.len() as u32);
310                }
311                if add_to_text {
312                    message.text_body.push(message.parts.len() as u32);
313                }
314
315                if is_text {
316                    let text = match (
317                        bytes,
318                        content_type.and_then(|ct| {
319                            ct.attribute("charset")
320                                .and_then(|c| charset_decoder(c.as_bytes()))
321                        }),
322                    ) {
323                        (Cow::Owned(vec), Some(charset_decoder)) => charset_decoder(&vec).into(),
324                        (Cow::Owned(vec), None) => String::from_utf8(vec)
325                            .unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned())
326                            .into(),
327                        (Cow::Borrowed(bytes), Some(charset_decoder)) => {
328                            charset_decoder(bytes).into()
329                        }
330                        (Cow::Borrowed(bytes), None) => String::from_utf8_lossy(bytes),
331                    };
332
333                    let is_html = mime_type == MimeType::TextHtml;
334
335                    if !add_to_html && is_html || !add_to_text && !is_html {
336                        message.attachments.push(message.parts.len() as u32);
337                    }
338
339                    if is_html {
340                        PartType::Html(text)
341                    } else {
342                        PartType::Text(text)
343                    }
344                } else {
345                    message.attachments.push(message.parts.len() as u32);
346
347                    if !is_inline {
348                        PartType::Binary(bytes)
349                    } else {
350                        PartType::InlineBinary(bytes)
351                    }
352                }
353            } else {
354                message.attachments.push(message.parts.len() as u32);
355
356                if depth != 0 {
357                    if let Some(nested_message) = self.parse_(bytes.as_ref(), depth - 1, false) {
358                        PartType::Message(Message {
359                            html_body: nested_message.html_body,
360                            text_body: nested_message.text_body,
361                            attachments: nested_message.attachments,
362                            parts: nested_message
363                                .parts
364                                .into_iter()
365                                .map(|p| p.into_owned())
366                                .collect(),
367                            raw_message: bytes.into_owned().into(),
368                        })
369                    } else {
370                        is_encoding_problem = true;
371                        PartType::Binary(bytes)
372                    }
373                } else {
374                    is_encoding_problem = true;
375                    PartType::Binary(bytes)
376                }
377            };
378
379            // Add part
380            message.parts.push(MessagePart {
381                headers: std::mem::take(&mut part_headers),
382                encoding,
383                is_encoding_problem,
384                body: body_part,
385                offset_header: state.offset_header as u32,
386                offset_body: state.offset_body as u32,
387                offset_end: state.offset_end as u32,
388            });
389
390            if state.mime_boundary.is_some() {
391                // Currently processing a MIME part
392                'inner: loop {
393                    if let MimeType::Message = state.mime_type {
394                        // Finished processing a nested message, restore parent message from stack
395                        if let Some((mut prev_state, Some(mut prev_message))) = state_stack.pop() {
396                            let offset_end = state
397                                .mime_boundary
398                                .as_ref()
399                                .map(|b| {
400                                    let pos = stream.offset().saturating_sub(b.len() + 2);
401                                    stream.data.get(pos - 2).map_or(pos - 1, |&ch| {
402                                        if ch == b'\r' {
403                                            pos - 2
404                                        } else {
405                                            pos - 1
406                                        }
407                                    })
408                                })
409                                .unwrap_or_else(|| stream.offset());
410                            message.raw_message = raw_message.into();
411                            //raw_message[state.offset_header..offset_end].as_ref().into();
412
413                            if let Some(part) = prev_message.parts.get_mut(state.part_id as usize) {
414                                part.body = PartType::Message(message);
415                                part.offset_end = offset_end as u32;
416                            } else {
417                                debug_assert!(false, "Invalid part ID, could not find message.");
418                            }
419
420                            message = prev_message;
421                            prev_state.mime_boundary = state.mime_boundary;
422                            state = prev_state;
423                        } else {
424                            debug_assert!(false, "Failed to restore parent message. Aborting.");
425                            break 'outer;
426                        }
427                    }
428
429                    if stream.is_multipart_end() {
430                        // End of MIME part reached
431
432                        if MimeType::MultipartAlternative == state.mime_type
433                            && state.need_html_body
434                            && state.need_text_body
435                        {
436                            // Found HTML part only
437                            if state.text_parts == message.text_body.len()
438                                && state.html_parts != message.html_body.len()
439                            {
440                                for &part_id in &message.html_body[state.html_parts..] {
441                                    message.text_body.push(part_id);
442                                }
443                            }
444
445                            // Found text part only
446                            if state.html_parts == message.html_body.len()
447                                && state.text_parts != message.text_body.len()
448                            {
449                                for &part_id in &message.text_body[state.html_parts..] {
450                                    message.html_body.push(part_id);
451                                }
452                            }
453                        }
454
455                        if let Some(part) = message.parts.get_mut(state.part_id as usize) {
456                            // Add headers and substructure to parent part
457                            part.body =
458                                PartType::Multipart(std::mem::take(&mut state.sub_part_ids));
459
460                            // Restore ancestor's state
461                            if let Some((prev_state, _)) = state_stack.pop() {
462                                state = prev_state;
463
464                                if let Some(ref mime_boundary) = state.mime_boundary {
465                                    // Ancestor has a MIME boundary, seek it.
466                                    if let Some(offset) =
467                                        stream.seek_next_part_offset(mime_boundary)
468                                    {
469                                        part.offset_end = offset as u32;
470                                        continue 'inner;
471                                    }
472                                }
473                            }
474
475                            // This part has no boundary, update end offset
476                            part.offset_end = stream.offset() as u32;
477                        } else {
478                            debug_assert!(false, "Invalid part ID, could not find multipart.");
479                        }
480
481                        break 'outer;
482                    } else {
483                        // Headers of next part expected next, break inner look.
484                        break 'inner;
485                    }
486                }
487            } else if stream.offset() >= stream.data.len() {
488                break 'outer;
489            }
490        }
491
492        // Corrupted MIME message, try to recover whatever is possible.
493        while let Some((prev_state, prev_message)) = state_stack.pop() {
494            if let Some(mut prev_message) = prev_message {
495                message.raw_message = raw_message.into(); //raw_message[state.offset_header..stream.offset()].as_ref().into();
496
497                if let Some(part) = prev_message.parts.get_mut(state.part_id as usize) {
498                    part.body = PartType::Message(message);
499                    part.offset_end = stream.offset() as u32;
500                } else {
501                    debug_assert!(false, "Invalid part ID, could not find message.");
502                }
503
504                message = prev_message;
505            } else if let Some(part) = message.parts.get_mut(state.part_id as usize) {
506                part.offset_end = stream.offset() as u32;
507                part.body = PartType::Multipart(state.sub_part_ids);
508            } else {
509                debug_assert!(false, "This should not have happened.");
510            }
511            state = prev_state;
512        }
513
514        message.raw_message = raw_message.into();
515
516        if !message.is_empty() {
517            message.parts[0].offset_end = message.raw_message.len() as u32;
518            Some(message)
519        } else if !part_headers.is_empty() {
520            // Message without a body
521            message.parts.push(MessagePart {
522                headers: part_headers,
523                encoding: Encoding::None,
524                is_encoding_problem: true,
525                body: PartType::Text("".into()),
526                offset_header: 0,
527                offset_body: message.raw_message.len() as u32,
528                offset_end: message.raw_message.len() as u32,
529            });
530            Some(message)
531        } else {
532            None
533        }
534    }
535}
536
537impl<'x> Message<'x> {
538    fn new() -> Message<'x> {
539        Message {
540            ..Default::default()
541        }
542    }
543
544    /// Returns `false` if at least one header field was successfully parsed.
545    pub fn is_empty(&self) -> bool {
546        self.parts.is_empty()
547    }
548}
549
550#[cfg(test)]
551mod tests {
552    use std::{fs, path::PathBuf};
553
554    use crate::MessageParser;
555
556    #[test]
557    fn parse_full_messages() {
558        for test_suite in ["rfc", "legacy", "thirdparty", "malformed"] {
559            let test_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
560                .join("resources")
561                .join("eml")
562                .join(test_suite);
563
564            let mut tests_run = 0;
565
566            for file_name in fs::read_dir(&test_dir).unwrap() {
567                let mut file_name = file_name.unwrap().path();
568                if file_name.extension().is_some_and(|e| e == "eml") {
569                    let raw_original = fs::read(&file_name).unwrap();
570                    tests_run += 1;
571
572                    // Test without CRs
573                    let raw_message = strip_crlf(&raw_original);
574                    file_name.set_extension("json");
575                    let expected_result = fs::read(&file_name).unwrap();
576
577                    let message = MessageParser::default().parse(&raw_message).unwrap();
578                    let json_message = serde_json::to_string_pretty(&message).unwrap();
579
580                    // Test that deserialization also works
581                    let _: super::Message<'_> = serde_json::from_str(&json_message).unwrap();
582
583                    if json_message.as_bytes() != expected_result {
584                        file_name.set_extension("failed");
585                        fs::write(&file_name, json_message.as_bytes()).unwrap();
586                        panic!(
587                            "Test failed, parsed message saved to {}",
588                            file_name.display()
589                        );
590                    }
591
592                    // Test with CRs
593                    let raw_message = add_crlf(&raw_original);
594                    file_name.set_extension("crlf.json");
595                    let expected_result = fs::read(&file_name).unwrap();
596
597                    let message = MessageParser::default().parse(&raw_message).unwrap();
598                    let json_message = serde_json::to_string_pretty(&message).unwrap();
599
600                    if json_message.as_bytes() != expected_result {
601                        file_name.set_extension("failed");
602                        fs::write(&file_name, json_message.as_bytes()).unwrap();
603                        panic!(
604                            "Test failed, parsed message saved to {}",
605                            file_name.display()
606                        );
607                    }
608                }
609            }
610
611            assert!(
612                tests_run > 0,
613                "Did not find any tests to run in folder {}.",
614                test_dir.display()
615            );
616        }
617    }
618
619    fn add_crlf(bytes: &[u8]) -> Vec<u8> {
620        let mut result = Vec::with_capacity(bytes.len());
621        let mut last_ch = 0;
622        for &ch in bytes {
623            if ch == b'\n' && last_ch != b'\r' {
624                result.push(b'\r');
625            }
626            result.push(ch);
627            last_ch = ch;
628        }
629
630        result
631    }
632
633    fn strip_crlf(bytes: &[u8]) -> Vec<u8> {
634        let mut result = Vec::with_capacity(bytes.len());
635        for &ch in bytes {
636            if !ch != b'\r' {
637                result.push(ch);
638            }
639        }
640
641        result
642    }
643}