mailrs-rfc5322 1.0.1

Pull-based RFC 5322 message parser — lazy header lookup, byte-level scan, zero-alloc borrowed slices. Skip-ahead to the header you want without building a full Message tree.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
//! [`Message`] — the top-level type. Holds a `&[u8]` slice of the raw
//! message and provides lazy header / body access.

use crate::header::{find_unfolded_line_end, Header, HeaderIter};

/// A parsed-on-demand RFC 5322 message backed by the caller's bytes.
///
/// Construction is `O(1)` — `Message::new(bytes)` just stores the
/// slice. Header lookup is `O(header-region-size)`, not `O(message-size)`:
/// the scanner stops at the empty-line boundary, so finding "Subject"
/// in a 5 MB attachment-bearing message takes the same time as finding
/// it in a 5 KB one (modulo cache effects).
///
/// Body access (`body()` / `body_offset()`) caches the offset on first
/// call, so repeated body accesses are `O(1)`.
///
/// Zero allocation in the hot path: header values come back as
/// `&[u8]` borrowing from the input. The caller decides whether to
/// allocate.
///
/// The offset cache lives in a [`core::cell::Cell`], which is what lets
/// the accessors take `&self`. A consequence is that `Message` is not
/// `Sync`: a `&Message` cannot be shared between threads. Cloning
/// copies the slice reference together with the current cache state.
#[derive(Debug, Clone)]
pub struct Message<'a> {
    /// The complete raw message (header block plus body) exactly as
    /// supplied to `Message::new`. Never copied or modified.
    bytes: &'a [u8],
    /// Cached offset of the start of the body (the byte after the
    /// empty CRLF/LF that terminates the header block). `None` means
    /// "not yet computed"; `Some(usize::MAX)` means "no body in this
    /// message".
    body_offset: core::cell::Cell<Option<usize>>,
}

impl<'a> Message<'a> {
    /// Wrap a raw message byte slice. No parsing is performed yet.
    ///
    /// Only the slice and an empty body-offset cache are stored, so
    /// construction is `O(1)`.
    pub fn new(bytes: &'a [u8]) -> Self {
        Self {
            bytes,
            body_offset: core::cell::Cell::new(None),
        }
    }

    /// The original message bytes the parser was constructed with.
    pub fn raw(&self) -> &'a [u8] {
        self.bytes
    }

    /// If an empty line (the header/body separator) starts at `at`,
    /// return the length of its terminator: `1` for a bare LF, `2` for
    /// CRLF. Returns `None` otherwise, including when `at` is at or
    /// past the end of `bytes`.
    fn empty_line_len(bytes: &[u8], at: usize) -> Option<usize> {
        let rest = bytes.get(at..)?;
        if rest.starts_with(b"\n") {
            Some(1)
        } else if rest.starts_with(b"\r\n") {
            Some(2)
        } else {
            None
        }
    }

    /// Look up the first header with name `wanted` (ASCII
    /// case-insensitive). Returns the value bytes, or `None` if no
    /// such header exists.
    ///
    /// The returned value starts after the colon and at most one
    /// leading space or tab, and runs to the end of the logical
    /// (possibly folded) line as reported by `find_unfolded_line_end`.
    /// Fold sequences inside the value are returned verbatim; no
    /// unfolding is performed.
    ///
    /// An empty `wanted` never matches: RFC 5322 field names contain at
    /// least one character.
    ///
    /// This is `O(header-region-size)` — the scanner stops at the
    /// empty line that separates headers from the body, so it never
    /// reads the body even on multi-MB attachments.
    ///
    /// Fast-path optimization: a non-matching header is rejected by
    /// comparing only its first `wanted.len()` bytes + the next byte
    /// (must be `:`), avoiding the full per-header `from_utf8` +
    /// colon-find work that the generic [`Message::headers`] iterator
    /// does. On a worst-case 50-header lookup this is ~30% faster than
    /// the iterate-and-filter equivalent.
    ///
    /// To get an iterator of *all* occurrences (Received: chains, etc),
    /// use [`Message::header_all`] or [`Message::headers`].
    pub fn header(&self, wanted: &str) -> Option<&'a [u8]> {
        let wanted_bytes = wanted.as_bytes();
        if wanted_bytes.is_empty() {
            // Without this guard a malformed line starting with ':'
            // would be reported as a header with an empty name.
            return None;
        }

        let bytes = self.bytes;
        let mut cursor = 0usize;
        while cursor < bytes.len() {
            // Empty line → end of header block.
            if Self::empty_line_len(bytes, cursor).is_some() {
                return None;
            }

            // Fast-path: the byte right after the candidate name must be
            // ':' and the name region must equal `wanted` ignoring ASCII
            // case. `get` doubles as the bounds check for the slice.
            let name_end = cursor + wanted_bytes.len();
            let has_match = bytes.get(name_end) == Some(&b':')
                && bytes[cursor..name_end].eq_ignore_ascii_case(wanted_bytes);

            // Find end-of-logical-line (handles folding). `None` ends
            // the scan. NOTE(review): presumably this means the final
            // line is unterminated — confirm against `crate::header`.
            let (line_end, after_crlf) = find_unfolded_line_end(bytes, cursor)?;

            // `name_end < line_end` ensures the colon belongs to this
            // logical line. It only fails when `wanted` itself contains
            // a line break, which previously caused an out-of-range
            // slice panic; such a "match" is now skipped.
            if has_match && name_end < line_end {
                let value = &bytes[name_end + 1..line_end];
                // Strip a single optional WSP after the colon; any
                // further whitespace is part of the value.
                return Some(match value.first().copied() {
                    Some(b' ') | Some(b'\t') => &value[1..],
                    _ => value,
                });
            }

            cursor = after_crlf;
        }
        None
    }

    /// Look up the first header with name `wanted` (case-insensitive)
    /// as a `&str`. Convenience wrapper over [`Message::header`] +
    /// UTF-8 check.
    ///
    /// Returns `None` for missing OR for non-UTF-8 header values
    /// (RFC 6532 says they should be UTF-8; pre-6532 they're 7-bit
    /// ASCII; real malformed messages exist).
    pub fn header_str(&self, wanted: &str) -> Option<&'a str> {
        core::str::from_utf8(self.header(wanted)?).ok()
    }

    /// Iterate over every header in document order. Stops at the
    /// empty line that separates headers from the body.
    ///
    /// Each call returns a fresh iterator positioned at the start of
    /// the message.
    pub fn headers(&self) -> HeaderIter<'a> {
        HeaderIter {
            bytes: self.bytes,
            cursor: 0,
        }
    }

    /// Iterate over every occurrence of a given header name
    /// (case-insensitive). For headers that can appear multiple times
    /// (Received, Authentication-Results, …), use this instead of
    /// `header()`.
    ///
    /// This is a filter over [`Message::headers`], so matching uses the
    /// names produced by that iterator rather than the fast path in
    /// [`Message::header`].
    pub fn header_all(&self, wanted: &'a str) -> impl Iterator<Item = Header<'a>> + 'a {
        let wanted_bytes = wanted.as_bytes();
        self.headers()
            .filter(move |h| eq_ignore_ascii_case(h.name.as_bytes(), wanted_bytes))
    }

    /// Locate the start of the body, i.e. the offset just past the
    /// empty line separating headers from body. Memoized on the
    /// message — first call is `O(header-region-size)`, subsequent
    /// calls are `O(1)`.
    ///
    /// Returns `Some(offset)` where `&bytes[offset..]` is the body. The
    /// offset equals `bytes.len()` when the separator is the last thing
    /// in the message (empty body). Returns `None` if the message has
    /// no body separator (no empty line found — entirely a header
    /// block).
    pub fn body_offset(&self) -> Option<usize> {
        if let Some(cached) = self.body_offset.get() {
            return (cached != usize::MAX).then_some(cached);
        }

        let found = self.scan_body_offset();
        // `usize::MAX` is the "scanned, no body" sentinel; a real offset
        // can never reach it because it is bounded by the slice length.
        self.body_offset.set(Some(found.unwrap_or(usize::MAX)));
        found
    }

    /// Uncached scan behind [`Message::body_offset`]: walk logical
    /// header lines from the start until an empty line is found.
    fn scan_body_offset(&self) -> Option<usize> {
        let mut cursor = 0;
        while cursor < self.bytes.len() {
            if let Some(terminator_len) = Self::empty_line_len(self.bytes, cursor) {
                return Some(cursor + terminator_len);
            }
            // Walk past this header line (handles folding); `None`
            // means there is nothing further to scan.
            let (_, after) = find_unfolded_line_end(self.bytes, cursor)?;
            cursor = after;
        }
        None
    }

    /// The message body — bytes after the empty-line CRLF/LF that
    /// terminates the header block. Returns `None` if no header
    /// terminator was found, and `Some(b"")` when the terminator is
    /// the final bytes of the message.
    pub fn body(&self) -> Option<&'a [u8]> {
        let offset = self.body_offset()?;
        Some(&self.bytes[offset..])
    }
}

/// Byte-wise equality that treats ASCII letters differing only in case
/// as equal. Header names in RFC 5322 are ASCII and case-insensitive,
/// so this is the comparison used when filtering headers by name.
/// Nothing is allocated: the slices are walked pairwise, and bytes
/// outside the ASCII letter range must match exactly.
#[inline]
fn eq_ignore_ascii_case(a: &[u8], b: &[u8]) -> bool {
    if a.len() != b.len() {
        return false;
    }
    a.iter()
        .zip(b.iter())
        .all(|(lhs, rhs)| lhs.to_ascii_lowercase() == rhs.to_ascii_lowercase())
}

// Unit tests for `Message`: header lookup (single and repeated),
// header iteration, body location and its memoization, and tolerance
// of LF-only, mixed and folded line endings.
#[cfg(test)]
mod tests {
    use super::*;

    /// Canonical three-header CRLF message shared by the basic tests.
    const SIMPLE: &[u8] = b"\
From: alice@example.com\r\n\
To: bob@example.com\r\n\
Subject: hi\r\n\
\r\n\
Hello, world!\r\n";

    #[test]
    fn raw_round_trips_bytes() {
        let msg = Message::new(SIMPLE);
        assert_eq!(msg.raw(), SIMPLE);
    }

    #[test]
    fn header_finds_first_occurrence() {
        let msg = Message::new(SIMPLE);
        assert_eq!(msg.header("Subject"), Some(b"hi" as &[u8]));
        assert_eq!(msg.header("From"), Some(b"alice@example.com" as &[u8]));
    }

    #[test]
    fn header_is_case_insensitive() {
        let msg = Message::new(SIMPLE);
        assert_eq!(msg.header("subject"), Some(b"hi" as &[u8]));
        assert_eq!(msg.header("SUBJECT"), Some(b"hi" as &[u8]));
        assert_eq!(msg.header("SuBjEcT"), Some(b"hi" as &[u8]));
    }

    #[test]
    fn header_str_returns_utf8() {
        let msg = Message::new(SIMPLE);
        assert_eq!(msg.header_str("Subject"), Some("hi"));
    }

    #[test]
    fn header_missing_returns_none() {
        let msg = Message::new(SIMPLE);
        assert_eq!(msg.header("X-Missing"), None);
    }

    #[test]
    fn body_extracts_correct_bytes() {
        let msg = Message::new(SIMPLE);
        assert_eq!(msg.body(), Some(b"Hello, world!\r\n" as &[u8]));
    }

    #[test]
    fn body_offset_is_memoized() {
        let msg = Message::new(SIMPLE);
        let first = msg.body_offset();
        let second = msg.body_offset();
        assert_eq!(first, second);
        // Pin the actual value, not just `is_some()`: the body is the
        // trailing "Hello, world!\r\n", so the offset is everything
        // before it.
        let expected = SIMPLE.len() - b"Hello, world!\r\n".len();
        assert_eq!(first, Some(expected));
    }

    #[test]
    fn lf_only_line_endings_work() {
        let bytes = b"Subject: hi\nFrom: x\n\nbody";
        let msg = Message::new(bytes);
        assert_eq!(msg.header("Subject"), Some(b"hi" as &[u8]));
        assert_eq!(msg.header("From"), Some(b"x" as &[u8]));
        assert_eq!(msg.body(), Some(b"body" as &[u8]));
    }

    #[test]
    fn folded_header_value_includes_continuation() {
        let bytes = b"Subject: first\r\n second\r\nFrom: x\r\n\r\nbody";
        let msg = Message::new(bytes);
        let subj = msg.header("Subject").unwrap();
        // The raw value carries the CRLF + WSP unfolding the caller can
        // resolve. The "first" comes before the CRLF, "second" after.
        // RFC 5322 §3.2.2: WSP after CRLF means continuation.
        assert!(subj.starts_with(b"first"));
        // The continuation line must actually be part of the value.
        assert!(std::str::from_utf8(subj).unwrap().contains("second"));
        // And the next header is still findable after folding.
        assert_eq!(msg.header("From"), Some(b"x" as &[u8]));
    }

    #[test]
    fn header_all_finds_multiple_received_chains() {
        let bytes = b"\
Received: from a.example.com\r\n\
Received: from b.example.com\r\n\
Received: from c.example.com\r\n\
\r\n\
body";
        let msg = Message::new(bytes);
        let received: Vec<_> = msg.header_all("Received").collect();
        assert_eq!(received.len(), 3);
        assert!(received[0].value.starts_with(b"from a"));
        assert!(received[1].value.starts_with(b"from b"));
        assert!(received[2].value.starts_with(b"from c"));
    }

    #[test]
    fn no_body_returns_none() {
        let bytes = b"Subject: header-only\r\n";
        let msg = Message::new(bytes);
        assert_eq!(msg.body(), None);
    }

    #[test]
    fn headers_iterator_returns_all_in_order() {
        let msg = Message::new(SIMPLE);
        let names: Vec<&str> = msg.headers().map(|h| h.name).collect();
        assert_eq!(names, vec!["From", "To", "Subject"]);
    }

    #[test]
    fn header_value_with_no_wsp_after_colon() {
        let bytes = b"X-Custom:value\r\n\r\n";
        let msg = Message::new(bytes);
        assert_eq!(msg.header("X-Custom"), Some(b"value" as &[u8]));
    }

    #[test]
    fn header_value_with_tab_after_colon() {
        let bytes = b"X-Custom:\tvalue\r\n\r\n";
        let msg = Message::new(bytes);
        assert_eq!(msg.header("X-Custom"), Some(b"value" as &[u8]));
    }

    #[test]
    fn empty_message_returns_no_body() {
        let msg = Message::new(b"");
        assert_eq!(msg.body(), None);
        assert_eq!(msg.headers().count(), 0);
    }

    #[test]
    fn body_only_message_works() {
        // header-then-empty-line-then-body is the canonical shape;
        // an empty leading CRLF means "no headers, all body".
        let bytes = b"\r\nbody here";
        let msg = Message::new(bytes);
        assert_eq!(msg.body(), Some(b"body here" as &[u8]));
        assert_eq!(msg.headers().count(), 0);
    }

    // ===== edge cases =====

    #[test]
    fn folded_header_three_continuation_lines() {
        let bytes = b"Subject: line1\r\n line2\r\n\tline3\r\n line4\r\nFrom: x\r\n\r\nbody";
        let msg = Message::new(bytes);
        let subj = msg.header("Subject").unwrap();
        // All four lines (including the original) should be in the value,
        // with the CRLF + WSP intact between them.
        assert!(subj.starts_with(b"line1"));
        assert!(std::str::from_utf8(subj).unwrap().contains("line2"));
        assert!(std::str::from_utf8(subj).unwrap().contains("line3"));
        assert!(std::str::from_utf8(subj).unwrap().contains("line4"));
        // And From: still findable after the long fold
        assert_eq!(msg.header("From"), Some(b"x" as &[u8]));
    }

    #[test]
    fn folded_header_tab_continuation() {
        // Tab is also a valid WSP continuation per RFC 5322 §3.2.2
        let bytes = b"X-Long-Header: first\r\n\tsecond\r\n\r\nbody";
        let msg = Message::new(bytes);
        let val = msg.header("X-Long-Header").unwrap();
        assert!(val.starts_with(b"first"));
        assert!(std::str::from_utf8(val).unwrap().contains("second"));
    }

    #[test]
    fn header_value_contains_colon_received_chain() {
        // Real-world Received headers have many colons in the value
        // (timestamps, port numbers, etc.). Parser should split only on
        // the FIRST colon.
        //
        // The space before the trailing `\` is deliberate: a string
        // continuation swallows the newline AND the next line's leading
        // whitespace, so without it the fixture would read "])by".
        let bytes = b"Received: from mta.example.com (mta.example.com [203.0.113.42:25]) \
                       by mx.golia.jp with ESMTP id 12345; Fri, 22 May 2026 10:00:00 +0900\r\n\r\nbody";
        let msg = Message::new(bytes);
        let received = msg.header("Received").unwrap();
        // Value should start with "from mta…", NOT some sub-split
        assert!(received.starts_with(b"from mta.example.com"));
        // And the colons inside should be preserved
        assert!(std::str::from_utf8(received).unwrap().contains("203.0.113.42:25"));
    }

    #[test]
    fn empty_header_value() {
        let bytes = b"X-Empty:\r\nFrom: alice\r\n\r\nbody";
        let msg = Message::new(bytes);
        assert_eq!(msg.header("X-Empty"), Some(b"" as &[u8]));
        // Subsequent headers still parse correctly
        assert_eq!(msg.header("From"), Some(b"alice" as &[u8]));
    }

    #[test]
    fn whitespace_only_header_value() {
        let bytes = b"X-Tabbed: \t \r\nFrom: alice\r\n\r\nbody";
        let msg = Message::new(bytes);
        // After stripping one leading WSP, whitespace remainder kept verbatim
        let val = msg.header("X-Tabbed").unwrap();
        assert_eq!(val, b"\t ");
        assert_eq!(msg.header("From"), Some(b"alice" as &[u8]));
    }

    #[test]
    fn header_name_case_preserved_in_iter() {
        let bytes = b"X-Custom-Header: v\r\n\r\nbody";
        let msg = Message::new(bytes);
        // Iterator preserves the literal name as it appears
        let names: Vec<&str> = msg.headers().map(|h| h.name).collect();
        assert_eq!(names, vec!["X-Custom-Header"]);
        // But lookup is case-insensitive
        assert!(msg.header("x-custom-header").is_some());
        assert!(msg.header("X-CUSTOM-HEADER").is_some());
    }

    #[test]
    fn header_name_with_digits_and_hyphens() {
        let bytes = b"X-MS-Has-Attach: yes\r\n\
                       Content-Type-1: text/plain\r\n\r\nbody";
        let msg = Message::new(bytes);
        assert_eq!(msg.header("X-MS-Has-Attach"), Some(b"yes" as &[u8]));
        assert_eq!(msg.header("Content-Type-1"), Some(b"text/plain" as &[u8]));
    }

    #[test]
    fn iter_can_be_called_twice_independently() {
        let bytes = b"A: 1\r\nB: 2\r\n\r\nbody";
        let msg = Message::new(bytes);
        let names1: Vec<_> = msg.headers().map(|h| h.name).collect();
        let names2: Vec<_> = msg.headers().map(|h| h.name).collect();
        assert_eq!(names1, names2);
    }

    #[test]
    fn body_offset_cached_call_returns_same_value() {
        let bytes = b"From: x\r\n\r\nbody";
        let msg = Message::new(bytes);
        let first = msg.body_offset();
        let second = msg.body_offset();
        let third = msg.body_offset();
        assert_eq!(first, second);
        assert_eq!(second, third);
    }

    #[test]
    fn message_with_no_body_separator_returns_no_body() {
        let bytes = b"From: x\r\nSubject: hi\r\n";
        let msg = Message::new(bytes);
        assert!(msg.body().is_none());
        assert!(msg.body_offset().is_none());
        // But headers still iterate
        assert_eq!(msg.headers().count(), 2);
    }

    #[test]
    fn just_crlf_message() {
        // Empty headers + empty body
        let bytes = b"\r\n";
        let msg = Message::new(bytes);
        assert_eq!(msg.body(), Some(b"" as &[u8]));
        assert_eq!(msg.headers().count(), 0);
    }

    #[test]
    fn lone_lf_terminator_works() {
        // Some implementations send LF-only line endings
        let bytes = b"From: x\nSubject: hi\n\nbody";
        let msg = Message::new(bytes);
        assert_eq!(msg.header("From"), Some(b"x" as &[u8]));
        assert_eq!(msg.body(), Some(b"body" as &[u8]));
    }

    #[test]
    fn mixed_crlf_and_lf_line_endings() {
        // Some buggy clients mix them — should still parse
        let bytes = b"From: x\r\nSubject: hi\n\r\nbody";
        let msg = Message::new(bytes);
        assert_eq!(msg.header("From"), Some(b"x" as &[u8]));
        // Subject line terminates with \n (no preceding \r). Acceptable.
        assert_eq!(msg.header("Subject"), Some(b"hi" as &[u8]));
    }

    #[test]
    fn long_header_value_2kb() {
        // Stress test: 2 KB header value (e.g. very long DKIM signature)
        let long: String = "x".repeat(2000);
        let raw = format!("DKIM-Signature: {long}\r\n\r\nbody");
        let msg = Message::new(raw.as_bytes());
        let val = msg.header("DKIM-Signature").unwrap();
        assert_eq!(val.len(), 2000);
        assert!(val.iter().all(|&b| b == b'x'));
    }

    #[test]
    fn find_target_at_end_of_many_headers() {
        // Worst case for header(): target is the last of 50 headers.
        let mut raw = Vec::new();
        for i in 0..49 {
            raw.extend_from_slice(format!("X-Filler-{i}: value{i}\r\n").as_bytes());
        }
        raw.extend_from_slice(b"X-Target: bingo\r\n\r\nbody");
        let msg = Message::new(&raw);
        assert_eq!(msg.header("X-Target"), Some(b"bingo" as &[u8]));
    }

    #[test]
    fn header_not_found_traverses_full_block() {
        // Ensure missing-header lookup doesn't accidentally return Some
        // from a partial match.
        let raw = b"X-AAA: 1\r\nX-BBB: 2\r\nX-CCC: 3\r\n\r\nbody";
        let msg = Message::new(raw);
        assert_eq!(msg.header("X-DDD"), None);
        assert_eq!(msg.header("X-AAAA"), None); // prefix-only must not match
        assert_eq!(msg.header("X-AA"), None); // shorter must not match
    }

    #[test]
    fn header_all_returns_zero_when_no_match() {
        let raw = b"From: x\r\nTo: y\r\n\r\nbody";
        let msg = Message::new(raw);
        assert_eq!(msg.header_all("Received").count(), 0);
    }

    #[test]
    fn utf8_in_header_value_via_str_helper() {
        // RFC 6532 says headers can be UTF-8. header_str returns Some
        // when the bytes are valid UTF-8.
        let raw = "From: アリス\r\n\r\nbody".as_bytes();
        let msg = Message::new(raw);
        assert_eq!(msg.header_str("From"), Some("アリス"));
    }

    #[test]
    fn invalid_utf8_in_header_value_returns_none_via_str() {
        // bytes 0xFF 0xFE are not valid UTF-8
        let mut raw = b"X-Bin: ".to_vec();
        raw.extend_from_slice(&[0xFF, 0xFE]);
        raw.extend_from_slice(b"\r\n\r\nbody");
        let msg = Message::new(&raw);
        // header() returns raw bytes (still works)
        assert!(msg.header("X-Bin").is_some());
        // header_str returns None because not UTF-8
        assert!(msg.header_str("X-Bin").is_none());
    }

    #[test]
    fn body_with_internal_empty_lines_kept_intact() {
        // The header/body boundary is the FIRST empty line. Any empty
        // lines inside the body are part of the body verbatim.
        let raw = b"From: x\r\n\r\nfirst\r\n\r\nsecond\r\n";
        let msg = Message::new(raw);
        let body = msg.body().unwrap();
        assert!(body.starts_with(b"first"));
        // The second empty line is INSIDE the body
        assert_eq!(body, b"first\r\n\r\nsecond\r\n");
    }
}