daaki-message 0.2.0

RFC 5322 email message parser and builder
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
//! Semantic interpreter for wire-parsed email messages.
//!
//! Converts a [`WireMessage`] (raw headers + body bytes) into a fully
//! interpreted [`ParsedEmail`] by performing RFC 2047 decoding, address
//! parsing, date parsing, MIME tree walking, charset conversion, and
//! content-transfer-encoding decoding.
//!
//! # References
//! - RFC 5322 (Internet Message Format)
//! - RFC 2045 (MIME Part One — body format, Content-Transfer-Encoding)
//! - RFC 2046 (MIME Part Two — media types, multipart boundaries)
//! - RFC 2047 (MIME Part Three — encoded words in headers)
//! - RFC 2183 (Content-Disposition)
//! - RFC 2231 (MIME parameter encoding)
//! - RFC 6532 (Internationalized email headers)

mod address;
mod date;
mod encoded_words;
mod message_id;
mod mime;
mod params;

use crate::error::Error;
use crate::types::{Address, DateTime, ParsedEmail};

use super::wire::WireMessage;

// Re-export pub(crate) items so parser/mod.rs can reach them.
pub(crate) use address::find_paren_outside_quotes;
pub(crate) use address::normalize_display_name_phrase;
pub use address::parse_address_list;
pub(crate) use address::strip_comments;
pub(crate) use date::parse_rfc5322_date;
pub(crate) use encoded_words::decode_encoded_words;

// Re-export items used by parser/tests.rs (only needed under cfg(test)
// since no non-test code in the parent module references them directly).
#[cfg(test)]
pub(super) use address::contains_at_outside_quotes;
#[cfg(test)]
pub(super) use address::extract_comment_text;
#[cfg(test)]
pub(super) use address::parse_single_address;
#[cfg(test)]
pub(super) use address::unescape_quoted_string;
#[cfg(test)]
pub(super) use date::parse_timezone;
#[cfg(test)]
pub(super) use date::parse_year;
#[cfg(test)]
pub(super) use encoded_words::decode_q_encoding;
#[cfg(test)]
pub(super) use mime::decode_body;
#[cfg(test)]
pub(super) use mime::decode_quoted_printable;
#[cfg(test)]
pub(super) use mime::decode_transfer_encoding;
#[cfg(test)]
pub(super) use params::decode_hex_pair;
#[cfg(test)]
pub(super) use params::extract_filename;
#[cfg(test)]
pub(super) use params::extract_mime_type;
#[cfg(test)]
pub(super) use params::extract_param;
#[cfg(test)]
pub(super) use params::extract_rfc2231_continuation;
#[cfg(test)]
pub(super) use params::extract_rfc2231_param;
#[cfg(test)]
pub(super) use params::find_closing_quote;
#[cfg(test)]
pub(super) use params::find_param_value;
#[cfg(test)]
pub(super) use params::hex_digit;
#[cfg(test)]
pub(super) use params::is_disposition_type;
#[cfg(test)]
pub(super) use params::is_inside_quotes;
#[cfg(test)]
pub(super) use params::percent_decode;
#[cfg(test)]
pub(super) use params::strip_outer_quotes;

/// Maximum MIME nesting depth to prevent stack overflow on pathological input.
/// RFC 2046 does not specify a limit; 64 is generous for real-world messages.
///
/// NOTE(review): the enforcement site is not in this module. Presumably the
/// MIME tree walker in `mime` compares its depth argument (which `interpret`
/// seeds with `0`) against this bound — confirm there.
pub(super) const MAX_MIME_DEPTH: u32 = 64;

/// Lenient base64 engine that accepts both padded and unpadded input
/// (RFC 2045 Section 6.8).
///
/// Built from the standard base64 alphabet with the decode padding mode set
/// to `Indifferent`; every other engine setting is left at the
/// `GeneralPurposeConfig::new()` defaults.
pub(super) const LENIENT_BASE64: base64::engine::GeneralPurpose =
    base64::engine::GeneralPurpose::new(
        &base64::alphabet::STANDARD,
        base64::engine::GeneralPurposeConfig::new()
            .with_decode_padding_mode(base64::engine::DecodePaddingMode::Indifferent),
    );

/// Structured header fields extracted from an RFC 5322 message.
///
/// Used internally to deduplicate the shared header extraction logic
/// between [`parse_email`] and [`parse_headers_only`].
///
/// Produced by `extract_header_fields`; `interpret` falls back to the derived
/// `Default` (every field `None`/empty) when the wire message is flagged as
/// headerless.
///
/// # References
/// - RFC 5322 Section 3.6 (field definitions)
#[derive(Default)]
struct HeaderFields {
    /// Result of `message_id::extract_message_id`.
    message_id: Option<String>,
    /// Result of `message_id::extract_in_reply_to`.
    in_reply_to: Vec<String>,
    /// Result of `message_id::extract_references`.
    references: Vec<String>,
    /// Value of the first `subject` header after RFC 2047 encoded-word
    /// decoding; one leading structural SP/HTAB is removed first when the
    /// field body began on a folded continuation line.
    subject: Option<String>,
    /// Result of `address::extract_from`.
    from: Vec<Address>,
    /// RFC 5322 Section 3.6.2: `sender = "Sender:" mailbox`.
    sender: Option<Address>,
    /// Result of `address::extract_address_list` for the `to` header.
    to: Vec<Address>,
    /// Result of `address::extract_address_list` for the `cc` header.
    cc: Vec<Address>,
    /// Result of `address::extract_address_list` for the `bcc` header.
    bcc: Vec<Address>,
    /// Result of `address::extract_address_list` for the `reply-to` header.
    reply_to: Vec<Address>,
    /// Result of `date::extract_date`.
    date: Option<DateTime>,
    /// Optional fields (RFC 5322 Section 3.6.8) — headers not in the
    /// well-known set, stored as `(lowercase-name, value)` pairs.
    /// Values are RFC 2047 decoded unless the name is listed in
    /// `STRUCTURED_HEADERS`, in which case they are kept verbatim.
    extra_headers: Vec<(String, String)>,
}

/// Well-known header names that are extracted into dedicated fields.
///
/// Headers not in this set are collected into `extra_headers`
/// (RFC 5322 Section 3.6.8: optional fields). `Content-Disposition` is kept
/// in `extra_headers` as well as being interpreted for top-level body
/// classification, and `Content-ID` is kept as well as being consulted for
/// top-level inline classification, so header-only consumers can still inspect
/// RFC 2183 Section 2.10 and RFC 2045 Section 7 metadata.
///
/// Entries are lowercase because `extract_header_fields` tests membership
/// with an exact (case-sensitive) string comparison against the parsed
/// header name.
const WELL_KNOWN_HEADERS: &[&str] = &[
    "from",
    "to",
    "cc",
    "bcc",
    "reply-to",
    "sender",
    "subject",
    "date",
    "message-id",
    "in-reply-to",
    "references",
    "content-type",
    "content-transfer-encoding",
    "mime-version",
];

/// Structured header fields where RFC 2047 encoded-words MUST NOT appear
/// (RFC 2047 Section 5). These headers have their own syntax rules and
/// `=?charset?encoding?text?=` sequences must be treated as literal text.
///
/// Includes trace fields (RFC 5321), authentication results
/// (RFC 8601), and DKIM/ARC signature headers (RFC 6376, RFC 8617).
/// The MIME fields `content-disposition` and `content-id` are listed too, so
/// their values reach `extra_headers` without encoded-word decoding.
///
/// NOTE: Resent address fields (`Resent-From`, `Resent-Sender`, `Resent-To`,
/// `Resent-Cc`, `Resent-Bcc`, `Resent-Reply-To`) are intentionally excluded.
/// RFC 5322 Section 3.6.6 says each uses the same syntax as its non-Resent
/// counterpart (mailbox / address-list), whose `phrase` production permits
/// encoded-words per RFC 2047 Section 5 rule (3). Only `Resent-Date` and
/// `Resent-Message-ID` remain because they contain no `phrase` production.
///
/// Entries are lowercase; membership is tested with an exact string
/// comparison in `extract_header_fields`.
const STRUCTURED_HEADERS: &[&str] = &[
    "content-disposition",
    "content-id",
    "received",
    "return-path",
    "resent-date",
    "resent-message-id",
    "dkim-signature",
    "domainkey-signature",
    "arc-seal",
    "arc-message-signature",
    "arc-authentication-results",
    "authentication-results",
];

/// Turns a wire-parsed message into a fully structured [`ParsedEmail`].
///
/// Header-derived fields come from `extract_header_fields`, unless the wire
/// message is flagged `headerless`, in which case they all keep their
/// `Default` value. `raw_headers` and `size` are copied from `wire_msg`.
///
/// When `headers_only` is true, body/MIME processing is skipped entirely and
/// the body-related fields are `None`/empty.
///
/// # Errors
/// Propagates any error returned by `extract_header_fields`.
///
/// # References
/// - RFC 5322 (Internet Message Format)
/// - RFC 2045–2047 (MIME)
/// - RFC 2183 (Content-Disposition)
/// - RFC 2231 (MIME parameter encoding)
pub(crate) fn interpret(wire_msg: &WireMessage, headers_only: bool) -> Result<ParsedEmail, Error> {
    let fields = if wire_msg.headerless {
        HeaderFields::default()
    } else {
        extract_header_fields(&wire_msg.headers, &wire_msg.raw_headers)?
    };

    let (body_text, body_html, attachments) = if headers_only {
        // Header-only callers never pay for body decoding.
        (None, None, Vec::new())
    } else {
        let headers = &wire_msg.headers;
        let body = &wire_msg.body;

        // Absent Content-Type falls back to plain US-ASCII text.
        let content_type = get_header_value(headers, "content-type")
            .unwrap_or_else(|| String::from("text/plain; charset=us-ascii"));
        // RFC 2045 Section 6.1: default Content-Transfer-Encoding is "7bit".
        let transfer_encoding = get_header_value(headers, "content-transfer-encoding")
            .unwrap_or_else(|| String::from("7bit"));
        let content_disposition =
            get_header_value(headers, "content-disposition").unwrap_or_default();
        let content_id = get_header_value(headers, "content-id");

        if !params::is_multipart(&content_type) {
            mime::extract_simple_body(
                body,
                &content_type,
                &transfer_encoding,
                &content_disposition,
                content_id.as_deref(),
            )
        } else if let Some(boundary) = params::extract_boundary_for_body(&content_type, body) {
            let mime_type = params::extract_mime_type(&content_type);
            let is_digest = mime_type == "multipart/digest";
            // RFC 2046 Section 5.1.4: multipart/alternative lists parts
            // in order of increasing faithfulness — prefer the last match.
            let is_alternative = mime_type == "multipart/alternative";
            mime::walk_mime_tree(body, &boundary, "", 0, is_digest, is_alternative)
        } else {
            // Multipart without a usable boundary cannot be split into parts
            // (RFC 2046 Section 5.1.1 requires one), so degrade gracefully
            // and treat the whole body as text/plain.
            mime::extract_simple_body(
                body,
                "text/plain; charset=us-ascii",
                &transfer_encoding,
                &content_disposition,
                content_id.as_deref(),
            )
        }
    };

    Ok(ParsedEmail {
        message_id: fields.message_id,
        in_reply_to: fields.in_reply_to,
        references: fields.references,
        subject: fields.subject,
        from: fields.from,
        sender: fields.sender,
        to: fields.to,
        cc: fields.cc,
        bcc: fields.bcc,
        reply_to: fields.reply_to,
        date: fields.date,
        body_text,
        body_html,
        attachments,
        raw_headers: wire_msg.raw_headers.clone(),
        extra_headers: fields.extra_headers,
        size: wire_msg.size,
    })
}

// ---------------------------------------------------------------------------
// Header field extraction
// ---------------------------------------------------------------------------

/// Extracts all structured header fields from parsed header pairs.
///
/// Well-known headers are mapped to dedicated fields; all remaining
/// headers are collected into `extra_headers` (RFC 5322 Section 3.6.8).
///
/// # References
/// - RFC 5322 (Internet Message Format — address, date-time, identification)
/// - RFC 2047 (MIME encoded words in headers)
fn extract_header_fields(
    headers: &[(String, String)],
    raw_headers: &str,
) -> Result<HeaderFields, Error> {
    // RFC 5322 Section 3.6.8 permits optional fields outside the well-known
    // set. Keep rejecting inputs with no syntactically valid header fields at
    // all, but allow header blocks made entirely of optional/custom fields so
    // callers can still inspect partial or malformed messages.
    if headers.is_empty() {
        return Err(Error::MissingFrom);
    }

    let continuation_flags = header_body_starts_on_continuation_flags(raw_headers);

    // RFC 5322 Section 3.6.8: collect optional fields — any header not in
    // the well-known set. Values are decoded for RFC 2047 encoded words
    // (RFC 2047 Section 5) so callers get human-readable text, EXCEPT for
    // structured headers where encoded-words MUST NOT appear
    // (RFC 2047 Section 5).
    let extra_headers: Vec<(String, String)> = headers
        .iter()
        .zip(
            continuation_flags
                .iter()
                .copied()
                .chain(std::iter::repeat(false)),
        )
        .filter(|((k, _), _)| !WELL_KNOWN_HEADERS.contains(&k.as_str()))
        .map(|((k, v), starts_on_continuation)| {
            let normalized = if starts_on_continuation {
                strip_leading_structural_wsp(v)
            } else {
                v.as_str()
            };
            // RFC 2047 Section 5: encoded-words MUST NOT appear in
            // structured header fields. Only decode unstructured fields.
            let decoded = if STRUCTURED_HEADERS.contains(&k.as_str()) {
                normalized.to_string()
            } else {
                decode_encoded_words(normalized)
            };
            (k.clone(), decoded)
        })
        .collect();

    Ok(HeaderFields {
        message_id: message_id::extract_message_id(headers),
        in_reply_to: message_id::extract_in_reply_to(headers),
        references: message_id::extract_references(headers),
        // RFC 5322 Section 2.2.3: if the field body begins on the first
        // continuation line, one leading SP/HTAB is only a structural
        // separator before the body and should not survive into the
        // consumer-facing unstructured value.
        subject: get_header_value_with_continuation_flag(headers, &continuation_flags, "subject")
            .map(|(v, starts_on_continuation)| {
                let normalized = if starts_on_continuation {
                    strip_leading_structural_wsp(&v)
                } else {
                    v.as_str()
                };
                decode_encoded_words(normalized)
            }),
        from: address::extract_from(headers),
        sender: address::extract_sender(headers),
        to: address::extract_address_list(headers, "to"),
        cc: address::extract_address_list(headers, "cc"),
        bcc: address::extract_address_list(headers, "bcc"),
        reply_to: address::extract_address_list(headers, "reply-to"),
        date: date::extract_date(headers),
        extra_headers,
    })
}

/// Returns the value of the first header matching `name` (case-insensitive).
///
/// Header names are compared ASCII case-insensitively, so the lookup works
/// whether or not the stored names or `name` itself have been lowercased.
/// When several headers share the name, the one that appears first in
/// `headers` wins. The value is returned as an owned copy.
///
/// Returns `None` when no header matches.
///
/// # References
/// - RFC 5322 Section 2.2 (header fields)
pub(super) fn get_header_value(headers: &[(String, String)], name: &str) -> Option<String> {
    headers
        .iter()
        .find(|(k, _)| k.eq_ignore_ascii_case(name))
        .map(|(_, v)| v.clone())
}

/// Looks up the first header named `name` and pairs its value with the flag
/// saying whether that field body began on a folded continuation line.
///
/// `continuation_flags` is indexed by header position, so it must be in the
/// same parse order as `headers`; a position without a flag counts as
/// `false`. Returns `None` when no header has exactly this name.
fn get_header_value_with_continuation_flag(
    headers: &[(String, String)],
    continuation_flags: &[bool],
    name: &str,
) -> Option<(String, bool)> {
    let idx = headers.iter().position(|(key, _)| key == name)?;
    let starts_on_continuation = continuation_flags.get(idx).copied().unwrap_or(false);
    // `idx` came from `position`, so it is always within bounds.
    Some((headers[idx].1.clone(), starts_on_continuation))
}

/// Removes the structural separator from the front of an unfolded field body.
///
/// RFC 5322 Section 2.2 lets optional WSP follow `field-name:` before the
/// body. If the body only starts on the first folded continuation line,
/// unfolding (RFC 5322 Section 2.2.3) keeps that line's leading WSP byte even
/// though it plays the same separator role. At most one leading SP or HTAB is
/// dropped here; any further leading whitespace is returned untouched because
/// it is treated as data.
fn strip_leading_structural_wsp(value: &str) -> &str {
    let mut rest = value.chars();
    match rest.next() {
        Some(' ' | '\t') => rest.as_str(),
        _ => value,
    }
}

/// Computes, one entry per recognised header line and in parse order, whether
/// that header's field body begins only on its first folded continuation
/// line.
///
/// A physical line opens a header when it is non-empty, does not begin with
/// SP/HTAB, contains a `:`, and the trimmed text before the colon is accepted
/// by `HeaderName::new`; every other line is skipped without producing a
/// flag. The flag is `true` when the remainder of that opening line is empty
/// or consists solely of SP/HTAB and the next physical line begins with
/// SP/HTAB. In that situation the continuation's first WSP byte, which RFC
/// 5322 Section 2.2.3 unfolding preserves, is merely a structural separator
/// rather than semantic content.
///
/// Lines are split on `\n`, with one trailing `\r` removed from each.
fn header_body_starts_on_continuation_flags(raw_headers: &str) -> Vec<bool> {
    let physical_lines: Vec<&str> = raw_headers
        .split('\n')
        .map(|raw| raw.strip_suffix('\r').unwrap_or(raw))
        .collect();

    let mut flags = Vec::new();
    for (idx, current) in physical_lines.iter().enumerate() {
        // Blank lines and continuation lines never open a header.
        if current.is_empty() || current.starts_with([' ', '\t']) {
            continue;
        }

        let Some((name_part, body_part)) = current.split_once(':') else {
            continue;
        };
        if crate::types::HeaderName::new(name_part.trim()).is_err() {
            continue;
        }

        let first_line_is_blank = body_part.chars().all(|ch| ch == ' ' || ch == '\t');
        let next_is_continuation = physical_lines
            .get(idx + 1)
            .is_some_and(|next| next.starts_with([' ', '\t']));
        flags.push(first_line_is_blank && next_is_continuation);
    }

    flags
}