Skip to main content

mime_tree/
parse.rs

1use mail_parser::{
2    Encoding, HeaderValue, Message, MessageParser, MessagePart, MimeHeaders, PartType,
3};
4
5use crate::{
6    error::ParseError,
7    message::{DecodedBodyValue, ParsedMessage},
8    part::{ParsedHeader, ParsedPart, TransferEncoding},
9    walk,
10};
11
12/// Parse raw RFC 5322 bytes into a `ParsedMessage`.
13///
14/// Returns `Err(ParseError::EmptyInput)` for empty input and
15/// `Err(ParseError::NoHeaders)` when mail-parser cannot find any headers.
16/// All other malformed input produces a best-effort `ParsedMessage` with
17/// `warnings` populated.
18#[must_use = "the parsed message must be used"]
19pub fn parse(raw: &[u8]) -> Result<ParsedMessage, ParseError> {
20    if raw.is_empty() {
21        return Err(ParseError::EmptyInput);
22    }
23
24    let message = MessageParser::default()
25        .parse(raw)
26        .ok_or(ParseError::NoHeaders)?;
27
28    let mut warnings: Vec<String> = Vec::new();
29
30    // Extract top-level headers from parts[0].
31    let headers = message
32        .parts
33        .first()
34        .map(|p| extract_headers(p, raw))
35        .unwrap_or_default();
36
37    // Build the part tree.
38    let part_index = build_root(&message, 0, &mut warnings).ok_or(ParseError::NoHeaders)?;
39
40    // Compute RFC 8621 §4.1.4 body-view lists from the parsed part tree.
41    let body = walk::compute_body_structure(&part_index);
42
43    // Compute preview: first 256 decoded characters from the first text_body part.
44    // We decode up to 1024 bytes (not 256) because worst-case UTF-8 uses 4 bytes
45    // per character, so 1024 bytes guarantees at least 256 characters.
46    let preview = body.text_body.first().and_then(|id| {
47        let part = part_index.find_by_id(id)?;
48        let decoded = crate::decode::decode_body_value(raw, part, Some(1024)).ok()?;
49        let s: String = decoded.value.chars().take(256).collect();
50        if s.is_empty() {
51            None
52        } else {
53            Some(s)
54        }
55    });
56
57    Ok(ParsedMessage {
58        part_index,
59        text_body: body.text_body,
60        html_body: body.html_body,
61        attachments: body.attachments,
62        headers,
63        preview,
64        warnings,
65    })
66}
67
68/// Decode the body of a parsed part.
69///
70/// Slices `raw[part.body_range]`, applies transfer-encoding decode
71/// (Base64, QP, UUencode, or identity), then charset-converts the
72/// result to UTF-8.
73///
74/// # `max_bytes`
75///
76/// * `None` — decode the full body.  There is no implicit size limit.
77/// * `Some(n)` — decode at most `n` bytes of the **transfer-decoded**
78///   output.  [`DecodedBodyValue::is_truncated`] is set to `true` when
79///   the full body exceeds this limit.  The truncation point may fall
80///   mid-codepoint after charset conversion, in which case
81///   [`DecodedBodyValue::is_encoding_problem`] is also set.
82///
83/// # Errors
84///
85/// Returns [`ParseError::InvalidRange`] when `part.body_range` is out
86/// of bounds for `raw`.
87#[must_use = "the decoded body value must be used"]
88pub fn decode_body_value(
89    raw: &[u8],
90    part: &ParsedPart,
91    max_bytes: Option<usize>,
92) -> Result<DecodedBodyValue, ParseError> {
93    crate::decode::decode_body_value(raw, part, max_bytes)
94}
95
96// ---------------------------------------------------------------------------
97// Internal helpers
98// ---------------------------------------------------------------------------
99
100/// Extract `ParsedHeader` values from a part's header list.
101///
102/// For headers whose value mail-parser parses as plain text (Subject,
103/// Comments, Content-Description, and any unstructured header), the decoded
104/// string from `h.value` is used directly.  mail-parser decodes RFC 2047
105/// encoded-words during its own parse phase, so the `Text` variant already
106/// contains the final Unicode string.
107///
108/// For headers whose value is a `TextList` (e.g. References, Keywords), the
109/// list elements are joined with ", " after trimming each item.
110///
111/// For all other header types (Address, DateTime, ContentType, Received) the
112/// raw bytes are sliced from `raw` as before, because those values are not
113/// encoded-word fields and the structured `HeaderValue` variants would require
114/// lossy reconstruction.
115fn extract_headers(part: &MessagePart<'_>, raw: &[u8]) -> Vec<ParsedHeader> {
116    part.headers
117        .iter()
118        .map(|h| {
119            let name = h.name.as_str().to_owned();
120
121            // Always capture the raw bytes of the header field value from the
122            // original message. These bytes are faithful to the wire format
123            // and preserve non-UTF-8 bytes that `from_utf8_lossy` would
124            // replace with U+FFFD.
125            let raw_value = raw
126                .get(h.offset_start as usize..h.offset_end as usize)
127                .unwrap_or_default()
128                .to_vec();
129
130            let value = match &h.value {
131                // mail-parser has already decoded any RFC 2047 encoded-words
132                // into this Cow<str>; use it directly.
133                HeaderValue::Text(s) => s.as_ref().trim().to_owned(),
134                // TextList: join with comma+space (e.g. References, Keywords).
135                HeaderValue::TextList(list) => list
136                    .iter()
137                    .map(|s| s.as_ref().trim())
138                    .collect::<Vec<_>>()
139                    .join(", "),
140                // All other variants (Address, DateTime, ContentType, Received,
141                // Empty): fall back to lossy UTF-8 for the `value` string.
142                _ => String::from_utf8_lossy(raw_value.trim_ascii()).into_owned(),
143            };
144            ParsedHeader {
145                name,
146                value,
147                raw_value,
148            }
149        })
150        .collect()
151}
152
153/// Build a `ParsedPart` for `parts[part_idx]`, assigning it the given `part_id`.
154///
155/// Returns `None` when `part_idx` is out of range in `message.parts`; the
156/// caller logs a warning and skips the missing child.
157///
158/// For the root call (part_idx = 0) we use the dedicated `build_root` entry
159/// point which handles the special IMAP ID assignment for the root part.
160fn build_part(
161    message: &Message<'_>,
162    part_idx: u32,
163    part_id: String,
164    warnings: &mut Vec<String>,
165) -> Option<ParsedPart> {
166    let part = match message.parts.get(part_idx as usize) {
167        Some(p) => p,
168        None => {
169            warnings.push(format!("part {part_id}: index {part_idx} out of range"));
170            return None;
171        }
172    };
173
174    if part.is_encoding_problem {
175        warnings.push(format!("part {part_id}: encoding problem"));
176    }
177
178    let header_range = (
179        part.offset_header,
180        part.offset_body.saturating_sub(part.offset_header),
181    );
182    let body_range = (
183        part.offset_body,
184        part.offset_end.saturating_sub(part.offset_body),
185    );
186
187    let raw_ct = part.content_type();
188    let content_type = raw_ct
189        .map(|ct| {
190            let subtype = ct.subtype().unwrap_or("plain");
191            format!("{}/{}", ct.ctype(), subtype)
192        })
193        .unwrap_or_else(|| "text/plain".to_owned());
194
195    let charset = raw_ct
196        .and_then(|ct| ct.attribute("charset"))
197        .map(str::to_owned)
198        .or_else(|| {
199            if raw_ct.is_none() {
200                Some("us-ascii".to_owned())
201            } else {
202                None
203            }
204        });
205
206    let transfer_encoding = map_encoding(part, warnings);
207
208    let disposition = part.content_disposition().map(|cd| cd.ctype().to_owned());
209
210    let filename = part.attachment_name().map(str::to_owned);
211
212    let cid = part.content_id().map(str::to_owned);
213
214    let children = match &part.body {
215        PartType::Multipart(child_ids) => child_ids
216            .iter()
217            .enumerate()
218            .filter_map(|(n, &child_idx)| {
219                let child_id = if part_id.is_empty() {
220                    (n + 1).to_string()
221                } else {
222                    format!("{}.{}", part_id, n + 1)
223                };
224                build_part(message, child_idx, child_id, warnings)
225            })
226            .collect(),
227        PartType::Message(_nested) => {
228            // message/rfc822 is intentionally treated as an opaque leaf.
229            // Its raw bytes are accessible via body_range; callers that need
230            // the inner structure should pass those bytes to parse() themselves.
231            // See crate invariant: callers handle recursion, not this crate.
232            vec![]
233        }
234        _ => vec![],
235    };
236
237    Some(ParsedPart {
238        part_id,
239        content_type,
240        charset,
241        transfer_encoding,
242        disposition,
243        filename,
244        cid,
245        header_range,
246        body_range,
247        children,
248        is_encoding_problem: part.is_encoding_problem,
249    })
250}
251
252/// Entry point for the root part (parts[0]).
253///
254/// Returns `None` when `parts[part_idx]` does not exist.
255///
256/// IMAP part-ID rules:
257/// - If the root is multipart, it acts as an envelope container; its body
258///   children receive IDs `"1"`, `"2"`, ... and the root itself gets `""`.
259/// - If the root is a single-part leaf (or a nested `message/rfc822`), the
260///   body is accessible as `"1"`.
261fn build_root(
262    message: &Message<'_>,
263    part_idx: u32,
264    warnings: &mut Vec<String>,
265) -> Option<ParsedPart> {
266    let is_multipart = message
267        .parts
268        .get(part_idx as usize)
269        .is_some_and(|p| matches!(p.body, PartType::Multipart(_)));
270
271    let root_id = if is_multipart {
272        String::new()
273    } else {
274        "1".to_owned()
275    };
276
277    build_part(message, part_idx, root_id, warnings)
278}
279
280/// Map a mail-parser `Encoding` (and optional CTE string) to `TransferEncoding`.
281///
282/// Pushes a warning to `warnings` when the CTE token is non-empty and not one
283/// of the values recognised by this crate.  RFC 2045 §6.4 permits x-token
284/// CTE values; the conventional UUencode spellings are handled explicitly and
285/// do not produce a warning.
286fn map_encoding(part: &MessagePart<'_>, warnings: &mut Vec<String>) -> TransferEncoding {
287    match part.encoding {
288        Encoding::Base64 => TransferEncoding::Base64,
289        Encoding::QuotedPrintable => TransferEncoding::QuotedPrintable,
290        Encoding::None => {
291            // Check the raw CTE header string for well-known values.
292            match part.content_transfer_encoding() {
293                Some(s) if s.eq_ignore_ascii_case("7bit") => TransferEncoding::SevenBit,
294                Some(s) if s.eq_ignore_ascii_case("8bit") => TransferEncoding::EightBit,
295                Some(s) if s.eq_ignore_ascii_case("binary") => TransferEncoding::Binary,
296                Some(s)
297                    if s.eq_ignore_ascii_case("x-uuencode")
298                        || s.eq_ignore_ascii_case("x-uue")
299                        || s.eq_ignore_ascii_case("uuencode") =>
300                {
301                    TransferEncoding::UUEncode
302                }
303                Some(s) if !s.is_empty() => {
304                    warnings.push(format!("Unknown Content-Transfer-Encoding: {s}"));
305                    TransferEncoding::Identity
306                }
307                _ => TransferEncoding::Identity,
308            }
309        }
310    }
311}