mime_tree/parse.rs
1use mail_parser::{
2 Encoding, HeaderValue, Message, MessageParser, MessagePart, MimeHeaders, PartType,
3};
4
5use crate::{
6 error::ParseError,
7 message::{DecodedBodyValue, ParsedMessage},
8 part::{ParsedHeader, ParsedPart, TransferEncoding},
9 walk,
10};
11
12/// Parse raw RFC 5322 bytes into a `ParsedMessage`.
13///
14/// Returns `Err(ParseError::EmptyInput)` for empty input and
15/// `Err(ParseError::NoHeaders)` when mail-parser cannot find any headers.
16/// All other malformed input produces a best-effort `ParsedMessage` with
17/// `warnings` populated.
18#[must_use = "the parsed message must be used"]
19pub fn parse(raw: &[u8]) -> Result<ParsedMessage, ParseError> {
20 if raw.is_empty() {
21 return Err(ParseError::EmptyInput);
22 }
23
24 let message = MessageParser::default()
25 .parse(raw)
26 .ok_or(ParseError::NoHeaders)?;
27
28 let mut warnings: Vec<String> = Vec::new();
29
30 // Extract top-level headers from parts[0].
31 let headers = message
32 .parts
33 .first()
34 .map(|p| extract_headers(p, raw))
35 .unwrap_or_default();
36
37 // Build the part tree.
38 let part_index = build_root(&message, 0, &mut warnings).ok_or(ParseError::NoHeaders)?;
39
40 // Compute RFC 8621 §4.1.4 body-view lists from the parsed part tree.
41 let body = walk::compute_body_structure(&part_index);
42
43 // Compute preview: first 256 decoded characters from the first text_body part.
44 // We decode up to 1024 bytes (not 256) because worst-case UTF-8 uses 4 bytes
45 // per character, so 1024 bytes guarantees at least 256 characters.
46 let preview = body.text_body.first().and_then(|id| {
47 let part = part_index.find_by_id(id)?;
48 let decoded = crate::decode::decode_body_value(raw, part, Some(1024)).ok()?;
49 let s: String = decoded.value.chars().take(256).collect();
50 if s.is_empty() {
51 None
52 } else {
53 Some(s)
54 }
55 });
56
57 Ok(ParsedMessage {
58 part_index,
59 text_body: body.text_body,
60 html_body: body.html_body,
61 attachments: body.attachments,
62 headers,
63 preview,
64 warnings,
65 })
66}
67
68/// Decode the body of a parsed part.
69///
70/// Slices `raw[part.body_range]`, applies transfer-encoding decode
71/// (Base64, QP, UUencode, or identity), then charset-converts the
72/// result to UTF-8.
73///
74/// # `max_bytes`
75///
76/// * `None` — decode the full body. There is no implicit size limit.
77/// * `Some(n)` — decode at most `n` bytes of the **transfer-decoded**
78/// output. [`DecodedBodyValue::is_truncated`] is set to `true` when
79/// the full body exceeds this limit. The truncation point may fall
80/// mid-codepoint after charset conversion, in which case
81/// [`DecodedBodyValue::is_encoding_problem`] is also set.
82///
83/// # Errors
84///
85/// Returns [`ParseError::InvalidRange`] when `part.body_range` is out
86/// of bounds for `raw`.
87#[must_use = "the decoded body value must be used"]
88pub fn decode_body_value(
89 raw: &[u8],
90 part: &ParsedPart,
91 max_bytes: Option<usize>,
92) -> Result<DecodedBodyValue, ParseError> {
93 crate::decode::decode_body_value(raw, part, max_bytes)
94}
95
96// ---------------------------------------------------------------------------
97// Internal helpers
98// ---------------------------------------------------------------------------
99
100/// Extract `ParsedHeader` values from a part's header list.
101///
102/// For headers whose value mail-parser parses as plain text (Subject,
103/// Comments, Content-Description, and any unstructured header), the decoded
104/// string from `h.value` is used directly. mail-parser decodes RFC 2047
105/// encoded-words during its own parse phase, so the `Text` variant already
106/// contains the final Unicode string.
107///
108/// For headers whose value is a `TextList` (e.g. References, Keywords), the
109/// list elements are joined with ", " after trimming each item.
110///
111/// For all other header types (Address, DateTime, ContentType, Received) the
112/// raw bytes are sliced from `raw` as before, because those values are not
113/// encoded-word fields and the structured `HeaderValue` variants would require
114/// lossy reconstruction.
115fn extract_headers(part: &MessagePart<'_>, raw: &[u8]) -> Vec<ParsedHeader> {
116 part.headers
117 .iter()
118 .map(|h| {
119 let name = h.name.as_str().to_owned();
120
121 // Always capture the raw bytes of the header field value from the
122 // original message. These bytes are faithful to the wire format
123 // and preserve non-UTF-8 bytes that `from_utf8_lossy` would
124 // replace with U+FFFD.
125 let raw_value = raw
126 .get(h.offset_start as usize..h.offset_end as usize)
127 .unwrap_or_default()
128 .to_vec();
129
130 let value = match &h.value {
131 // mail-parser has already decoded any RFC 2047 encoded-words
132 // into this Cow<str>; use it directly.
133 HeaderValue::Text(s) => s.as_ref().trim().to_owned(),
134 // TextList: join with comma+space (e.g. References, Keywords).
135 HeaderValue::TextList(list) => list
136 .iter()
137 .map(|s| s.as_ref().trim())
138 .collect::<Vec<_>>()
139 .join(", "),
140 // All other variants (Address, DateTime, ContentType, Received,
141 // Empty): fall back to lossy UTF-8 for the `value` string.
142 _ => String::from_utf8_lossy(raw_value.trim_ascii()).into_owned(),
143 };
144 ParsedHeader {
145 name,
146 value,
147 raw_value,
148 }
149 })
150 .collect()
151}
152
153/// Build a `ParsedPart` for `parts[part_idx]`, assigning it the given `part_id`.
154///
155/// Returns `None` when `part_idx` is out of range in `message.parts`; the
156/// caller logs a warning and skips the missing child.
157///
158/// For the root call (part_idx = 0) we use the dedicated `build_root` entry
159/// point which handles the special IMAP ID assignment for the root part.
160fn build_part(
161 message: &Message<'_>,
162 part_idx: u32,
163 part_id: String,
164 warnings: &mut Vec<String>,
165) -> Option<ParsedPart> {
166 let part = match message.parts.get(part_idx as usize) {
167 Some(p) => p,
168 None => {
169 warnings.push(format!("part {part_id}: index {part_idx} out of range"));
170 return None;
171 }
172 };
173
174 if part.is_encoding_problem {
175 warnings.push(format!("part {part_id}: encoding problem"));
176 }
177
178 let header_range = (
179 part.offset_header,
180 part.offset_body.saturating_sub(part.offset_header),
181 );
182 let body_range = (
183 part.offset_body,
184 part.offset_end.saturating_sub(part.offset_body),
185 );
186
187 let raw_ct = part.content_type();
188 let content_type = raw_ct
189 .map(|ct| {
190 let subtype = ct.subtype().unwrap_or("plain");
191 format!("{}/{}", ct.ctype(), subtype)
192 })
193 .unwrap_or_else(|| "text/plain".to_owned());
194
195 let charset = raw_ct
196 .and_then(|ct| ct.attribute("charset"))
197 .map(str::to_owned)
198 .or_else(|| {
199 if raw_ct.is_none() {
200 Some("us-ascii".to_owned())
201 } else {
202 None
203 }
204 });
205
206 let transfer_encoding = map_encoding(part, warnings);
207
208 let disposition = part.content_disposition().map(|cd| cd.ctype().to_owned());
209
210 let filename = part.attachment_name().map(str::to_owned);
211
212 let cid = part.content_id().map(str::to_owned);
213
214 let children = match &part.body {
215 PartType::Multipart(child_ids) => child_ids
216 .iter()
217 .enumerate()
218 .filter_map(|(n, &child_idx)| {
219 let child_id = if part_id.is_empty() {
220 (n + 1).to_string()
221 } else {
222 format!("{}.{}", part_id, n + 1)
223 };
224 build_part(message, child_idx, child_id, warnings)
225 })
226 .collect(),
227 PartType::Message(_nested) => {
228 // message/rfc822 is intentionally treated as an opaque leaf.
229 // Its raw bytes are accessible via body_range; callers that need
230 // the inner structure should pass those bytes to parse() themselves.
231 // See crate invariant: callers handle recursion, not this crate.
232 vec![]
233 }
234 _ => vec![],
235 };
236
237 Some(ParsedPart {
238 part_id,
239 content_type,
240 charset,
241 transfer_encoding,
242 disposition,
243 filename,
244 cid,
245 header_range,
246 body_range,
247 children,
248 is_encoding_problem: part.is_encoding_problem,
249 })
250}
251
252/// Entry point for the root part (parts[0]).
253///
254/// Returns `None` when `parts[part_idx]` does not exist.
255///
256/// IMAP part-ID rules:
257/// - If the root is multipart, it acts as an envelope container; its body
258/// children receive IDs `"1"`, `"2"`, ... and the root itself gets `""`.
259/// - If the root is a single-part leaf (or a nested `message/rfc822`), the
260/// body is accessible as `"1"`.
261fn build_root(
262 message: &Message<'_>,
263 part_idx: u32,
264 warnings: &mut Vec<String>,
265) -> Option<ParsedPart> {
266 let is_multipart = message
267 .parts
268 .get(part_idx as usize)
269 .is_some_and(|p| matches!(p.body, PartType::Multipart(_)));
270
271 let root_id = if is_multipart {
272 String::new()
273 } else {
274 "1".to_owned()
275 };
276
277 build_part(message, part_idx, root_id, warnings)
278}
279
280/// Map a mail-parser `Encoding` (and optional CTE string) to `TransferEncoding`.
281///
282/// Pushes a warning to `warnings` when the CTE token is non-empty and not one
283/// of the values recognised by this crate. RFC 2045 §6.4 permits x-token
284/// CTE values; the conventional UUencode spellings are handled explicitly and
285/// do not produce a warning.
286fn map_encoding(part: &MessagePart<'_>, warnings: &mut Vec<String>) -> TransferEncoding {
287 match part.encoding {
288 Encoding::Base64 => TransferEncoding::Base64,
289 Encoding::QuotedPrintable => TransferEncoding::QuotedPrintable,
290 Encoding::None => {
291 // Check the raw CTE header string for well-known values.
292 match part.content_transfer_encoding() {
293 Some(s) if s.eq_ignore_ascii_case("7bit") => TransferEncoding::SevenBit,
294 Some(s) if s.eq_ignore_ascii_case("8bit") => TransferEncoding::EightBit,
295 Some(s) if s.eq_ignore_ascii_case("binary") => TransferEncoding::Binary,
296 Some(s)
297 if s.eq_ignore_ascii_case("x-uuencode")
298 || s.eq_ignore_ascii_case("x-uue")
299 || s.eq_ignore_ascii_case("uuencode") =>
300 {
301 TransferEncoding::UUEncode
302 }
303 Some(s) if !s.is_empty() => {
304 warnings.push(format!("Unknown Content-Transfer-Encoding: {s}"));
305 TransferEncoding::Identity
306 }
307 _ => TransferEncoding::Identity,
308 }
309 }
310 }
311}