mime_tree/header_typed.rs
1//! Typed header value parsing for RFC 8621 JMAP `As*` header forms.
2//!
3//! RFC 8621 §4.1.2 defines several parsed-form selectors that a JMAP server
4//! may apply to a single header field's raw bytes:
5//!
6//! | RFC 8621 form | Section | mime-tree result variant |
7//! |----------------------|------------|------------------------------------------|
8//! | `asRaw` | §4.1.2.1 | [`HeaderValueTyped::Raw`] |
9//! | `asText` | §4.1.2.2 | [`HeaderValueTyped::Text`] |
10//! | `asAddresses` | §4.1.2.3 | [`HeaderValueTyped::Addresses`] |
11//! | `asGroupedAddresses` | §4.1.2.4 | [`HeaderValueTyped::GroupedAddresses`] |
12//! | `asMessageIds` | §4.1.2.5 | [`HeaderValueTyped::MessageIds`] |
13//! | `asDate` | §4.1.2.6 | [`HeaderValueTyped::DateTime`] |
14//! | `asURLs` | §4.1.2.7 | [`HeaderValueTyped::URLs`] |
15//!
16//! The entry point is [`parse_header_typed`]. It takes the [`HeaderForm`]
17//! selector and the raw bytes of the header field value (the portion to the
18//! right of the `:` in the header line, including any folded continuation
19//! lines but excluding the header name and the trailing CRLF).
20//!
//! Parsing is best-effort. On failure the function returns the appropriate
//! empty value (an empty `Vec`, an empty `Raw` or `Text` string, or
//! `DateTime(None)` for an unparseable date) — it never panics and never
//! returns an error.
24//!
25//! These types are independent of the [`crate::ParsedHeader`] surface,
26//! which exposes both a decoded `value` string and the original wire
27//! bytes in `raw_value`. To layer a typed view on top of an existing
28//! `ParsedHeader`, use [`parse_header_typed_from`] or feed its
29//! `raw_value` bytes to [`parse_header_typed`]:
30//!
31//! ```ignore
32//! let msg = mime_tree::parse(raw)?;
33//! if let Some(h) = msg.headers.iter().find(|h| h.name.eq_ignore_ascii_case("From")) {
34//! // Option A: convenience wrapper
35//! let typed = mime_tree::parse_header_typed_from(h, mime_tree::HeaderForm::Addresses);
36//! // Option B: direct call with raw bytes (equivalent)
37//! let addrs = mime_tree::parse_addresses(&h.raw_value);
38//! }
39//! ```
40//!
41//! Always use `raw_value` — not `value.as_bytes()` — when calling
42//! [`parse_header_typed`], because `value` undergoes lossy UTF-8
43//! conversion for structured headers and non-UTF-8 bytes would be
44//! silently corrupted.
45
46use std::borrow::Cow;
47use std::fmt;
48
49use mail_parser::{parsers::MessageStream, Address, HeaderValue};
50use serde::{Deserialize, Serialize};
51use unicode_normalization::UnicodeNormalization;
52
53use crate::ParsedHeader;
54
55/// A single RFC 5322 `mailbox` parsed from an `address-list`.
56///
57/// Mirrors the JMAP `EmailAddress` object defined in RFC 8621 §4.1.2.3.
58///
59/// `name` is the optional display name. `address` is the `addr-spec`. Both
60/// are populated best-effort; either may be `None` if the original header
61/// is malformed.
62///
63/// # Equality semantics
64///
65/// The derived `PartialEq`/`Eq`/`Hash` is byte-exact on both fields. In
66/// particular, `address` comparison is case-sensitive across the entire
67/// addr-spec, even though RFC 5321 §2.4 defines the *domain* part of an
68/// addr-spec as case-insensitive — so `alice@example.com` and
69/// `alice@EXAMPLE.COM` compare as not equal and hash differently. Callers
70/// that need RFC-5321-conformant equality (HashSet dedup of recipient
71/// lists, etc.) MUST canonicalise the domain part themselves before
72/// comparing or hashing.
73#[derive(Debug, Clone, Default, PartialEq, Eq, Hash, Serialize, Deserialize)]
74#[non_exhaustive]
75pub struct EmailAddress {
76 /// Display name from the `mailbox`, RFC 2047 encoded-words already
77 /// decoded.
78 ///
79 /// Parser-produced values are RFC 8621 §4.1.2.3 normalised:
80 /// surrounding ASCII whitespace is trimmed from the decoded
81 /// display name. A name that is empty after trimming is mapped to
82 /// `None` (a lone empty quoted-string never surfaces as
83 /// `Some(String::new())`).
84 pub name: Option<String>,
85 /// `addr-spec` of the `mailbox`.
86 pub address: Option<String>,
87}
88
89impl EmailAddress {
90 /// Construct an `EmailAddress` from optional display name and
91 /// addr-spec.
92 ///
93 /// `EmailAddress` is `#[non_exhaustive]` so external callers cannot
94 /// use struct expression syntax. Use this constructor — or
95 /// `Default::default()` followed by field assignment — instead.
96 #[must_use]
97 pub fn new(name: Option<String>, address: Option<String>) -> Self {
98 Self { name, address }
99 }
100
101 /// Whether this `EmailAddress` carries an `addr-spec`.
102 ///
103 /// `parse_header_typed` produces `EmailAddress` values with
104 /// `address == None` for malformed mailboxes (most commonly,
105 /// display-name-only mailboxes from non-spec-conformant clients —
106 /// e.g. a draft saved with just a typed-but-incomplete `To:`).
107 /// Such entries are unusable for sending mail, address comparison,
108 /// or addr-spec-keyed lookup.
109 ///
110 /// Use this helper to filter parsed address lists down to the
111 /// usable subset:
112 ///
113 /// ```
114 /// use mime_tree::EmailAddress;
115 ///
116 /// let parsed = vec![
117 /// EmailAddress::new(
118 /// Some("Alice".to_owned()),
119 /// Some("alice@example.com".to_owned()),
120 /// ),
121 /// EmailAddress::new(Some("Display-Name Only".to_owned()), None),
122 /// ];
123 /// let usable: Vec<EmailAddress> = parsed
124 /// .into_iter()
125 /// .filter(EmailAddress::is_addressable)
126 /// .collect();
127 /// assert_eq!(usable.len(), 1);
128 /// assert_eq!(usable[0].address.as_deref(), Some("alice@example.com"));
129 /// ```
130 #[must_use]
131 pub fn is_addressable(&self) -> bool {
132 self.address.is_some()
133 }
134}
135
136impl fmt::Display for EmailAddress {
137 /// Render in RFC 5322 §3.4 mailbox-ish form.
138 ///
139 /// * Both `name` and `address` present: `Display Name <addr@host>`.
140 /// * `address` only: `addr@host` (bare addr-spec, no angle brackets).
141 /// * `name` only: `Display Name` (degenerate; not a valid RFC 5322
142 /// mailbox, but the best a Display impl can do).
143 /// * Neither present: the empty string.
144 ///
145 /// Names are emitted verbatim. This Display impl prioritises human
146 /// readability over RFC 5322 round-trippability — names containing
147 /// `<`, `>`, `,`, or other RFC 5322 specials are not quoted. Callers
148 /// that need byte-stable round-trip into a header field MUST roll
149 /// their own serializer with proper quoting.
150 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
151 match (self.name.as_deref(), self.address.as_deref()) {
152 (Some(n), Some(a)) => write!(f, "{n} <{a}>"),
153 (None, Some(a)) => f.write_str(a),
154 (Some(n), None) => f.write_str(n),
155 (None, None) => Ok(()),
156 }
157 }
158}
159
160/// A group of `EmailAddress` values, optionally named.
161///
162/// Mirrors the JMAP `EmailAddressGroup` object defined in RFC 8621 §4.1.2.4.
163///
164/// Per RFC 8621 §4.1.2.4, consecutive mailboxes that are not part of a
165/// declared RFC 5322 `group` are still collected under an `AddressGroup`
166/// whose `name` is `None`, "to provide a uniform type".
167#[derive(Debug, Clone, Default, PartialEq, Eq, Hash, Serialize, Deserialize)]
168#[non_exhaustive]
169pub struct AddressGroup {
170 /// Display name of the group, or `None` for ungrouped mailboxes.
171 ///
172 /// Parser-produced values follow the same normalisation as
173 /// [`EmailAddress::name`]: surrounding ASCII whitespace trimmed; an
174 /// empty result is mapped to `None`, not `Some(String::new())`.
175 pub name: Option<String>,
176 /// Mailboxes belonging to this group.
177 pub addresses: Vec<EmailAddress>,
178}
179
180impl AddressGroup {
181 /// Construct an `AddressGroup` from an optional group name and a
182 /// vector of mailboxes.
183 ///
184 /// `AddressGroup` is `#[non_exhaustive]` so external callers cannot
185 /// use struct expression syntax. Use this constructor — or
186 /// `Default::default()` followed by field assignment — instead.
187 #[must_use]
188 pub fn new(name: Option<String>, addresses: Vec<EmailAddress>) -> Self {
189 Self { name, addresses }
190 }
191}
192
193impl fmt::Display for AddressGroup {
194 /// Render in RFC 5322 §3.4 group form: `name: mb1, mb2;`.
195 ///
196 /// * Named group: `Friends: alice@example.com, bob@example.com;`.
197 /// * Anonymous group (`name == None`): just the comma-joined
198 /// mailbox list, no `:` and no terminating `;`.
199 /// * Empty group: just `name:;` (or empty string for an anonymous
200 /// empty group).
201 ///
202 /// Same caveat as `EmailAddress` Display: prioritises human
203 /// readability; not guaranteed RFC 5322 round-trippable.
204 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
205 if let Some(n) = &self.name {
206 write!(f, "{n}:")?;
207 if !self.addresses.is_empty() {
208 f.write_str(" ")?;
209 }
210 }
211 for (i, addr) in self.addresses.iter().enumerate() {
212 if i > 0 {
213 f.write_str(", ")?;
214 }
215 write!(f, "{addr}")?;
216 }
217 if self.name.is_some() {
218 f.write_str(";")?;
219 }
220 Ok(())
221 }
222}
223
224/// Sign of a `date-time` timezone offset from GMT (RFC 5322 §3.3).
225///
226/// East of GMT corresponds to positive `+HHMM` offsets (e.g. `+0100`).
227/// West of GMT corresponds to negative `-HHMM` offsets (e.g. `-0600`).
228#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
229#[non_exhaustive]
230pub enum TzSign {
231 /// Offset is east of GMT (`+HHMM`).
232 East,
233 /// Offset is west of GMT (`-HHMM`).
234 West,
235}
236
237/// An RFC 5322 §3.3 `date-time` value parsed from a header.
238///
239/// Public fields permit serde transparency and direct field access from
240/// JMAP-shaped code. The fields mirror `mail_parser::DateTime` 1-to-1
241/// **except** for `tz_sign`, which is an explicit enum rather than a
242/// bool. This is a deliberate API choice — see `TzSign` — and means
243/// `HeaderDateTime` and `mail_parser::DateTime` are not bit-identical
244/// even though they round-trip via [`HeaderDateTime::from_mail_parser`]
245/// / [`HeaderDateTime::to_mail_parser`].
246///
247/// # Wire-format dependency on mail-parser
248///
249/// [`Self::to_rfc3339`] and [`Self::to_timestamp`] delegate to
250/// `mail_parser::DateTime`'s formatters. The exact strings produced by
251/// `to_rfc3339`, and the exact value produced by `to_timestamp` for
252/// edge-case input, are therefore defined by the pinned mail-parser
253/// version. mime-tree's Cargo.toml uses a caret range (`mail-parser =
254/// "0.11"`) so 0.11.x patch updates can in principle change the output
255/// without a mime-tree version bump. Downstream callers that persist
256/// these strings (database keys, JMAP wire responses, indexed columns)
257/// SHOULD pin mail-parser tightly if they require byte-stable output
258/// across mime-tree patch bumps.
259///
260/// # Field invariants
261///
262/// `parse_header_typed` only constructs `HeaderDateTime` values that
263/// passed mail-parser's validation: `year >= 1900`, `month ∈ 1..=12`,
264/// `day ∈ 1..=31` (calendar-validated), `hour ∈ 0..=23`,
265/// `minute ∈ 0..=59`, `second ∈ 0..=60` (RFC 5322 §4.3 leap second),
266/// `tz_hour ∈ 0..=23`, `tz_minute ∈ 0..=59`.
267///
268/// Direct construction with public fields can produce out-of-range
269/// values. The behaviour of `to_rfc3339` and `to_timestamp` on such
270/// values is unspecified — output may be syntactically malformed
271/// RFC 3339 or a meaningless `i64`. Callers that build `HeaderDateTime`
272/// from external sources should validate ranges themselves.
273///
274/// # Equality semantics
275///
276/// The derived `PartialEq`/`Eq`/`Hash` is **field-wise**, not
277/// **instant-wise**. Two `HeaderDateTime` values representing the same
278/// moment in time at different offsets compare as not-equal and hash
279/// differently. For example:
280///
281/// ```text
282/// 2024-01-01T12:00:00+00:00 (12:00 UTC)
283/// 2024-01-01T13:00:00+01:00 (12:00 UTC, expressed +01:00)
284/// ```
285///
286/// are the same instant but compare `!=`. Callers needing
287/// instant-equality (deduping timestamps across clients in different
288/// time zones, time-series bucketing) MUST compare
289/// [`Self::to_timestamp`] values rather than relying on the derived
290/// `PartialEq`.
291#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
292#[non_exhaustive]
293pub struct HeaderDateTime {
294 /// Four-digit calendar year. Parser-produced values: `1900..=3000`.
295 pub year: u16,
296 /// Month of the year, `1..=12` for parser-produced values.
297 pub month: u8,
298 /// Day of the month, `1..=31` (calendar-validated against
299 /// `year`/`month`) for parser-produced values.
300 pub day: u8,
301 /// Hour of the day, `0..=23` for parser-produced values.
302 pub hour: u8,
303 /// Minute, `0..=59` for parser-produced values.
304 pub minute: u8,
305 /// Second, `0..=60` for parser-produced values (RFC 5322 §4.3
306 /// allows 60 to represent a leap second).
307 pub second: u8,
308 /// Sign of the timezone offset from GMT.
309 pub tz_sign: TzSign,
310 /// Hours component of the timezone offset, `0..=23` for
311 /// parser-produced values.
312 pub tz_hour: u8,
313 /// Minutes component of the timezone offset, `0..=59` for
314 /// parser-produced values.
315 pub tz_minute: u8,
316}
317
318impl HeaderDateTime {
319 /// Render as an RFC 3339 / ISO 8601 §5.6 date-time string.
320 ///
321 /// # Output format
322 ///
323 /// * Non-UTC offset (any of `tz_hour`, `tz_minute` non-zero):
324 /// `YYYY-MM-DDTHH:MM:SS±HH:MM`. Each component is zero-padded;
325 /// `±` is `-` for west-of-GMT, `+` otherwise.
326 /// * UTC (`tz_hour == 0 && tz_minute == 0`):
327 /// `YYYY-MM-DDTHH:MM:SSZ`. Zulu form, not `+00:00`.
328 ///
329 /// No subsecond fraction is emitted (the seconds-fraction extension
330 /// of RFC 3339 is not represented in `HeaderDateTime`).
331 ///
332 /// # Examples
333 ///
334 /// * `1997-11-21T09:55:06-06:00` for `21 Nov 1997 09:55:06 -0600`.
335 /// * `2024-01-15T12:34:56Z` for `15 Jan 2024 12:34:56 +0000`.
336 ///
337 /// # Behaviour on out-of-range input
338 ///
339 /// The exact string for out-of-range field values
340 /// (e.g. `month = 13`) is unspecified — it depends on the pinned
341 /// mail-parser version and may not be syntactically valid RFC 3339.
342 /// See the type-level docs.
343 #[must_use]
344 pub fn to_rfc3339(&self) -> String {
345 self.to_mail_parser().to_rfc3339()
346 }
347
348 /// Render as a Unix timestamp (seconds since 1970-01-01T00:00:00Z).
349 ///
350 /// Pre-epoch dates return negative values. The result is computed
351 /// linearly from the field values without validation; on
352 /// out-of-range or otherwise invalid input (e.g. `month = 0`,
353 /// `day = 99`, year overflowing the calendar arithmetic) the
354 /// returned `i64` is unspecified and SHOULD NOT be relied upon.
355 /// See the type-level docs.
356 #[must_use]
357 pub fn to_timestamp(&self) -> i64 {
358 self.to_mail_parser().to_timestamp()
359 }
360
361 fn to_mail_parser(&self) -> mail_parser::DateTime {
362 mail_parser::DateTime {
363 year: self.year,
364 month: self.month,
365 day: self.day,
366 hour: self.hour,
367 minute: self.minute,
368 second: self.second,
369 tz_before_gmt: matches!(self.tz_sign, TzSign::West),
370 tz_hour: self.tz_hour,
371 tz_minute: self.tz_minute,
372 }
373 }
374
375 fn from_mail_parser(dt: mail_parser::DateTime) -> Self {
376 Self {
377 year: dt.year,
378 month: dt.month,
379 day: dt.day,
380 hour: dt.hour,
381 minute: dt.minute,
382 second: dt.second,
383 tz_sign: if dt.tz_before_gmt {
384 TzSign::West
385 } else {
386 TzSign::East
387 },
388 tz_hour: dt.tz_hour,
389 tz_minute: dt.tz_minute,
390 }
391 }
392}
393
394impl fmt::Display for HeaderDateTime {
395 /// Render as an RFC 3339 / ISO 8601 §5.6 date-time string by
396 /// delegating to [`HeaderDateTime::to_rfc3339`]. See that method
397 /// for the exact output format and behaviour on out-of-range input.
398 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
399 f.write_str(&self.to_rfc3339())
400 }
401}
402
403/// Selector for the RFC 8621 parsed-form of a header value.
404///
405/// This is the form-token from a JMAP `header:<name>:as<form>` property
406/// selector, normalised to an enum.
407///
408/// [`Display`](fmt::Display) emits the canonical JMAP form-token string
409/// (`asRaw`, `asAddresses`, …, `asURLs`).
410/// [`FromStr`](std::str::FromStr) accepts exactly that set of strings;
411/// any other input yields [`UnknownHeaderForm`].
412#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
413#[non_exhaustive]
414pub enum HeaderForm {
415 /// Trim surrounding whitespace; return the bytes as a UTF-8 string.
416 /// (§4.1.2.1)
417 ///
418 /// Non-UTF-8 bytes — legal in raw RFC 5322 but not in JMAP wire
419 /// format — are replaced with U+FFFD REPLACEMENT CHARACTER
420 /// (lossy conversion). This preserves the position and rough shape
421 /// of malformed input so callers can flag a mojibake header
422 /// without losing the rest of the field body.
423 Raw,
424 /// RFC 8621 §4.1.2.2 Text form. Unfold whitespace, strip the
425 /// trailing CRLF and leading SP, decode all syntactically-correct
426 /// RFC 2047 encoded-words, then Unicode-normalise the result to
427 /// NFC.
428 ///
429 /// This is the form most commonly used by JMAP clients fetching
430 /// human-readable header fields like Subject, Comments, Keywords,
431 /// and List-Id.
432 Text,
433 /// Parse as an RFC 5322 `address-list`. Group structure is discarded;
434 /// only the flat list of mailboxes is returned. (§4.1.2.3)
435 Addresses,
436 /// Parse as an RFC 5322 `address-list`, preserving group structure.
437 /// (§4.1.2.4)
438 GroupedAddresses,
439 /// Parse as a list of RFC 5322 `msg-id` values. Surrounding angle
440 /// brackets and CFWS are stripped. (§4.1.2.5)
441 MessageIds,
442 /// Parse as an RFC 5322 §3.3 `date-time`. (§4.1.2.6)
443 Date,
444 /// Parse as an RFC 2369 list of URLs. Surrounding angle brackets and
445 /// comments are stripped. (§4.1.2.7)
446 URLs,
447}
448
449impl HeaderForm {
450 /// Return the canonical RFC 8621 §4.1.2 form-token string for this
451 /// variant.
452 ///
453 /// The token starts with `as` (the convention used in JMAP property
454 /// selectors such as `header:Subject:asText`). Inverse of
455 /// [`HeaderForm`]'s [`FromStr`](std::str::FromStr) impl.
456 #[must_use]
457 pub fn as_jmap_token(&self) -> &'static str {
458 match self {
459 HeaderForm::Raw => "asRaw",
460 HeaderForm::Text => "asText",
461 HeaderForm::Addresses => "asAddresses",
462 HeaderForm::GroupedAddresses => "asGroupedAddresses",
463 HeaderForm::MessageIds => "asMessageIds",
464 HeaderForm::Date => "asDate",
465 HeaderForm::URLs => "asURLs",
466 }
467 }
468}
469
470impl fmt::Display for HeaderForm {
471 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
472 f.write_str(self.as_jmap_token())
473 }
474}
475
476/// Error returned by [`HeaderForm`]'s [`FromStr`](std::str::FromStr) impl
477/// when the input is not a recognised JMAP form-token.
478///
479/// The wrapped string is the input as given (case-sensitive); JMAP
480/// form-tokens are case-sensitive per RFC 8621 §4.1.2.
481#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
482pub struct UnknownHeaderForm(pub String);
483
484impl fmt::Display for UnknownHeaderForm {
485 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
486 write!(f, "unknown JMAP header form token: {:?}", self.0)
487 }
488}
489
490impl std::error::Error for UnknownHeaderForm {}
491
492impl std::str::FromStr for HeaderForm {
493 type Err = UnknownHeaderForm;
494
495 fn from_str(s: &str) -> Result<Self, Self::Err> {
496 match s {
497 "asRaw" => Ok(HeaderForm::Raw),
498 "asText" => Ok(HeaderForm::Text),
499 "asAddresses" => Ok(HeaderForm::Addresses),
500 "asGroupedAddresses" => Ok(HeaderForm::GroupedAddresses),
501 "asMessageIds" => Ok(HeaderForm::MessageIds),
502 "asDate" => Ok(HeaderForm::Date),
503 "asURLs" => Ok(HeaderForm::URLs),
504 _ => Err(UnknownHeaderForm(s.to_owned())),
505 }
506 }
507}
508
509/// A header field value rendered in one of the RFC 8621 parsed forms.
510///
511/// # Serde wire format
512///
513/// `HeaderValueTyped` and [`HeaderForm`] use serde's *default*
514/// representation: externally-tagged for the enum, capitalised Rust
515/// variant names. Examples:
516///
517/// ```json
518/// {"Addresses": [{"name": "Alice", "address": "alice@example.com"}]}
519/// {"DateTime": null}
520/// {"Raw": "Subject line"}
521/// "Addresses" // serialized HeaderForm
522/// ```
523///
524/// This is a **deliberate** choice for in-crate serialization (between
525/// services, into databases). It is **not** the RFC 8621 wire format
526/// used over JMAP HTTP/JSON. RFC 8621 §4.1.2 uses property-selector
527/// strings such as `header:Subject:asAddresses` rather than serializing
528/// the form name as an enum tag. Callers exposing parsed headers to a
529/// JMAP client SHOULD map between this representation and the JMAP
530/// wire format at the API boundary; relying on the in-crate serde
531/// shape as the wire format will produce a non-conformant JMAP
532/// response.
533///
534/// Pre-1.0, this representation is subject to change. From 1.0 onward
535/// the in-crate serde format will be a stability surface and will be
536/// changed only with a major version bump.
537#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
538#[non_exhaustive]
539pub enum HeaderValueTyped {
540 /// Result of [`HeaderForm::Raw`]: the trimmed UTF-8 string.
541 Raw(String),
542 /// Result of [`HeaderForm::Text`]: whitespace-unfolded, RFC 2047
543 /// decoded, NFC-normalised UTF-8 string.
544 Text(String),
545 /// Result of [`HeaderForm::Addresses`].
546 Addresses(Vec<EmailAddress>),
547 /// Result of [`HeaderForm::GroupedAddresses`].
548 GroupedAddresses(Vec<AddressGroup>),
549 /// Result of [`HeaderForm::MessageIds`]: bare msg-id strings with no
550 /// angle brackets.
551 MessageIds(Vec<String>),
552 /// Result of [`HeaderForm::Date`], or `None` if the header value did
553 /// not parse as a `date-time`.
554 DateTime(Option<HeaderDateTime>),
555 /// Result of [`HeaderForm::URLs`]: bare URL strings with no angle
556 /// brackets.
557 URLs(Vec<String>),
558}
559
560/// Parse a header field value into the requested RFC 8621 parsed form.
561///
562/// `raw_value` is the bytes of the header field value — the portion to
563/// the right of the `:` in the header line, including any folded
564/// continuation lines.
565///
566/// # Trailing line ending
567///
568/// A trailing CRLF (or bare LF) is **permitted but not required**:
569///
570/// * Input ending in `\r\n` is used as-is.
571/// * Input ending in `\n` is converted to `\r\n` internally.
572/// * Input with no trailing line ending has `\r\n` appended internally.
573///
574/// All three shapes produce the same result. Callers may pass the field
575/// body with or without the line ending — pick whichever is easiest to
576/// extract from the source.
577///
578/// # Best-effort parsing
579///
580/// Malformed input yields the empty result for the requested form
581/// (empty `Vec`, empty string, or `DateTime(None)`). The function never
582/// panics and never returns an error.
583///
584/// # Empty-result ambiguity
585///
586/// The empty result is the **same value** regardless of cause:
587///
588/// | Form | Empty value | Triggered by |
589/// |---------------------|-------------------------|----------------------------------------------------|
590/// | `Raw` | `Raw("")` | empty, all-whitespace, or all-malformed-UTF-8 input |
591/// | `Addresses` | `Addresses(vec![])` | empty, malformed, or zero-mailbox input |
592/// | `GroupedAddresses` | `GroupedAddresses(vec![])` | same as above |
593/// | `MessageIds` | `MessageIds(vec![])` | empty, no `<...>` brackets, all garbage |
594/// | `Date` | `DateTime(None)` | empty, malformed, or all-zero day/month |
595/// | `URLs` | `URLs(vec![])` | empty, no `<...>` brackets, all garbage |
596///
597/// Callers cannot distinguish "header was present but empty" from
598/// "header was present but malformed" from "input was non-UTF-8 noise"
599/// using this API. Out-of-band signalling (e.g. recording a warning
600/// alongside the parse result) is the caller's responsibility. Future
601/// minor releases may add a parallel `parse_header_typed_strict`
602/// returning `Result` or a tuple `(HeaderValueTyped, Warnings)` —
603/// neither is exposed today.
604///
605/// # Examples
606///
607/// ## Addresses (RFC 8621 §4.1.2.3)
608///
609/// ```
610/// use mime_tree::{parse_header_typed, EmailAddress, HeaderForm, HeaderValueTyped};
611///
612/// // RFC 8621 §4.1.2.3 example (the "James Smythe" address-list, simplified).
613/// let raw = b" \"James Smythe\" <james@example.com>";
614/// let parsed = parse_header_typed(HeaderForm::Addresses, raw);
615/// assert_eq!(
616/// parsed,
617/// HeaderValueTyped::Addresses(vec![EmailAddress::new(
618/// Some("James Smythe".to_owned()),
619/// Some("james@example.com".to_owned()),
620/// )]),
621/// );
622/// ```
623///
624/// ## GroupedAddresses (RFC 8621 §4.1.2.4)
625///
626/// ```
627/// use mime_tree::{parse_header_typed, AddressGroup, EmailAddress, HeaderForm, HeaderValueTyped};
628///
629/// let raw = b"Friends: alice@example.com, bob@example.com;";
630/// let parsed = parse_header_typed(HeaderForm::GroupedAddresses, raw);
631/// assert_eq!(
632/// parsed,
633/// HeaderValueTyped::GroupedAddresses(vec![AddressGroup::new(
634/// Some("Friends".to_owned()),
635/// vec![
636/// EmailAddress::new(None, Some("alice@example.com".to_owned())),
637/// EmailAddress::new(None, Some("bob@example.com".to_owned())),
638/// ],
639/// )]),
640/// );
641/// ```
642///
643/// ## MessageIds (RFC 8621 §4.1.2.5)
644///
645/// ```
646/// use mime_tree::{parse_header_typed, HeaderForm, HeaderValueTyped};
647///
648/// let raw = b"<abc@example.com> <def@example.com>";
649/// let parsed = parse_header_typed(HeaderForm::MessageIds, raw);
650/// assert_eq!(
651/// parsed,
652/// HeaderValueTyped::MessageIds(vec![
653/// "abc@example.com".to_owned(),
654/// "def@example.com".to_owned(),
655/// ]),
656/// );
657/// ```
658///
659/// ## Date (RFC 5322 §3.3 / RFC 3339)
660///
661/// ```
662/// use mime_tree::{parse_header_typed, HeaderForm, HeaderValueTyped};
663///
664/// let raw = b" Fri, 21 Nov 1997 09:55:06 -0600";
665/// let parsed = parse_header_typed(HeaderForm::Date, raw);
666/// if let HeaderValueTyped::DateTime(Some(dt)) = parsed {
667/// assert_eq!(dt.to_rfc3339(), "1997-11-21T09:55:06-06:00");
668/// } else {
669/// panic!("expected DateTime");
670/// }
671/// ```
672///
673/// ## URLs (RFC 8621 §4.1.2.7 / RFC 2369)
674///
675/// ```
676/// use mime_tree::{parse_header_typed, HeaderForm, HeaderValueTyped};
677///
678/// // RFC 2369 List-Help with comment after the URL.
679/// let raw = b" <mailto:list@host.com?subject=help> (List Instructions)";
680/// let parsed = parse_header_typed(HeaderForm::URLs, raw);
681/// assert_eq!(
682/// parsed,
683/// HeaderValueTyped::URLs(vec!["mailto:list@host.com?subject=help".to_owned()]),
684/// );
685/// ```
686///
687/// ## Raw (RFC 8621 §4.1.2.1)
688///
689/// ```
690/// use mime_tree::{parse_header_typed, HeaderForm, HeaderValueTyped};
691///
692/// // Surrounding whitespace stripped; no other transformation.
693/// // Encoded-words survive verbatim.
694/// let raw = b" Subject line with =?UTF-8?Q?encoded?= words ";
695/// let parsed = parse_header_typed(HeaderForm::Raw, raw);
696/// assert_eq!(
697/// parsed,
698/// HeaderValueTyped::Raw("Subject line with =?UTF-8?Q?encoded?= words".to_owned()),
699/// );
700/// ```
701#[must_use]
702pub fn parse_header_typed(form: HeaderForm, raw_value: &[u8]) -> HeaderValueTyped {
703 // Why the crlf_terminated() round-trip below: mail-parser's typed
704 // parsing APIs (parse_address, parse_date, etc.) only accept raw
705 // bytes via MessageStream — there is no "parse from &str" path.
706 // The CRLF re-termination is also required because MessageStream
707 // expects RFC 5322 wire-format input. Do not remove either step.
708 match form {
709 HeaderForm::Raw => {
710 // RFC 8621 §4.1.2.1: the value is the header field value
711 // with surrounding white space removed. Non-UTF-8 bytes are
712 // replaced with U+FFFD via `from_utf8_lossy` so
713 // malformed-but-non-empty input does not collapse into an
714 // indistinguishable empty string. This matches mail-parser's
715 // own handling of non-UTF-8 header bytes (`HeaderValue::Text`
716 // is `Cow<str>` populated via `from_utf8_lossy`).
717 let s = String::from_utf8_lossy(raw_value);
718 HeaderValueTyped::Raw(s.trim().to_owned())
719 }
720 HeaderForm::Text => {
721 // RFC 8621 §4.1.2.2: unfold whitespace, strip trailing CRLF
722 // and leading SP, decode RFC 2047 encoded-words with known
723 // charsets, NFC-normalise the result.
724 //
725 // mail-parser's `parse_unstructured` handles unfolding,
726 // CRLF stripping, leading-SP removal, and RFC 2047 decoding
727 // in one pass — that's steps 1-4 of RFC 8621 §4.1.2.2.
728 // Step 5 (NFC) is applied here via the unicode-normalization
729 // crate.
730 let buf = crlf_terminated(raw_value);
731 let hv = MessageStream::new(&buf).parse_unstructured();
732 let decoded = match hv {
733 HeaderValue::Text(s) => s.into_owned(),
734 _ => String::new(),
735 };
736 HeaderValueTyped::Text(decoded.nfc().collect())
737 }
738 HeaderForm::URLs => {
739 // RFC 8621 §4.1.2.7 / RFC 2369 §2: each URL is wrapped in
740 // angle brackets. RFC 8621 §4.1.2.7 mandates that any
741 // value outside of the angle-bracket arguments MUST be
742 // ignored. mail-parser's address parser doesn't honour
743 // that contract (e.g. bare `https://example.com/u/abc`
744 // is treated as a malformed address with `https` as a
745 // group name), so we extract bracket contents directly
746 // rather than delegating.
747 HeaderValueTyped::URLs(extract_bracketed_urls(raw_value))
748 }
749 HeaderForm::Addresses => {
750 let buf = crlf_terminated(raw_value);
751 let hv = MessageStream::new(&buf).parse_address();
752 HeaderValueTyped::Addresses(flatten_addresses(&hv))
753 }
754 HeaderForm::GroupedAddresses => {
755 let buf = crlf_terminated(raw_value);
756 let hv = MessageStream::new(&buf).parse_address();
757 HeaderValueTyped::GroupedAddresses(group_addresses(&hv))
758 }
759 HeaderForm::MessageIds => {
760 // mail-parser's `parse_id` has a broken-client recovery
761 // branch (mail-parser-0.11/src/parsers/fields/id.rs) that
762 // returns `HeaderValue::Text` containing the unparsed bytes
763 // when no `<...>` tokens were found in the input. From the
764 // result type alone we cannot tell that case apart from the
765 // single-valid-msg-id case (`<x>` → `Text("x")`).
766 //
767 // Discriminator: a Text result is the result of bracket
768 // stripping iff the original input contained at least one
769 // `<` byte. mail-parser does not insert angle brackets that
770 // were not present in the input, so absence of `<` in the
771 // raw bytes is a sufficient signal that mail-parser cannot
772 // have produced Text via the stripping branch.
773 let buf = crlf_terminated(raw_value);
774 let hv = MessageStream::new(&buf).parse_id();
775 let had_angle_brackets = raw_value.contains(&b'<');
776 HeaderValueTyped::MessageIds(extract_msg_ids(&hv, had_angle_brackets))
777 }
778 HeaderForm::Date => {
779 let buf = crlf_terminated(raw_value);
780 let hv = MessageStream::new(&buf).parse_date();
781 // mail-parser's `parse_date` returns `HeaderValue::Empty`
782 // when it cannot recover 6 numeric components. Belt-and-
783 // braces: also reject zero month/day, which RFC 5322 §3.3
784 // does not permit and which mail-parser can emit when its
785 // position counter advances past slots that never got
786 // numeric digits. (A zero `year` is unreachable in
787 // mail-parser-0.11: years are remapped 0..=49 → +2000,
788 // 50..=99 → +1900, so `parts[2]` is never copied through
789 // as 0.)
790 let dt = match hv {
791 HeaderValue::DateTime(dt) if dt.month != 0 && dt.day != 0 => {
792 Some(HeaderDateTime::from_mail_parser(dt))
793 }
794 _ => None,
795 };
796 HeaderValueTyped::DateTime(dt)
797 }
798 }
799}
800
801// ---------------------------------------------------------------------------
802// Per-form entry points
803// ---------------------------------------------------------------------------
804//
805// Convenience wrappers over `parse_header_typed` for callers that know
806// the form statically. Each wrapper unwraps the `HeaderValueTyped`
807// variant that `parse_header_typed` is contractually required to return
808// for the matching `HeaderForm`, eliminating the boilerplate match at
809// every call site. If `parse_header_typed` ever broke that contract, the
810// wrapper returns the empty result for the form (defensive, not panic).
811
812/// Parse the header field value as an RFC 8621 §4.1.2.1 Raw form: trim
813/// surrounding whitespace, decode bytes as UTF-8 (lossy with U+FFFD on
814/// invalid sequences).
815///
816/// Convenience wrapper over [`parse_header_typed`] with
817/// [`HeaderForm::Raw`].
818#[must_use]
819pub fn parse_raw(raw_value: &[u8]) -> String {
820 match parse_header_typed(HeaderForm::Raw, raw_value) {
821 HeaderValueTyped::Raw(s) => s,
822 _ => String::new(),
823 }
824}
825
826/// Parse the header field value as an RFC 8621 §4.1.2.2 Text form:
827/// whitespace unfolded, RFC 2047 encoded-words decoded, Unicode
828/// normalised to NFC.
829///
830/// Convenience wrapper over [`parse_header_typed`] with
831/// [`HeaderForm::Text`].
832#[must_use]
833pub fn parse_text(raw_value: &[u8]) -> String {
834 match parse_header_typed(HeaderForm::Text, raw_value) {
835 HeaderValueTyped::Text(s) => s,
836 _ => String::new(),
837 }
838}
839
840/// Parse the header field value as an RFC 8621 §4.1.2.3 Addresses form:
841/// a flat list of mailboxes with group structure discarded.
842///
843/// Convenience wrapper over [`parse_header_typed`] with
844/// [`HeaderForm::Addresses`].
845#[must_use]
846pub fn parse_addresses(raw_value: &[u8]) -> Vec<EmailAddress> {
847 match parse_header_typed(HeaderForm::Addresses, raw_value) {
848 HeaderValueTyped::Addresses(v) => v,
849 _ => Vec::new(),
850 }
851}
852
853/// Parse the header field value as an RFC 8621 §4.1.2.4 GroupedAddresses
854/// form: preserves group structure; a flat mailbox-list is wrapped in a
855/// single anonymous group.
856///
857/// Convenience wrapper over [`parse_header_typed`] with
858/// [`HeaderForm::GroupedAddresses`].
859#[must_use]
860pub fn parse_grouped_addresses(raw_value: &[u8]) -> Vec<AddressGroup> {
861 match parse_header_typed(HeaderForm::GroupedAddresses, raw_value) {
862 HeaderValueTyped::GroupedAddresses(v) => v,
863 _ => Vec::new(),
864 }
865}
866
867/// Parse the header field value as an RFC 8621 §4.1.2.5 MessageIds form:
868/// `<...>`-stripped msg-id strings.
869///
870/// Convenience wrapper over [`parse_header_typed`] with
871/// [`HeaderForm::MessageIds`].
872#[must_use]
873pub fn parse_message_ids(raw_value: &[u8]) -> Vec<String> {
874 match parse_header_typed(HeaderForm::MessageIds, raw_value) {
875 HeaderValueTyped::MessageIds(v) => v,
876 _ => Vec::new(),
877 }
878}
879
880/// Parse the header field value as an RFC 8621 §4.1.2.6 Date form: an
881/// RFC 5322 §3.3 date-time, or `None` on parse failure.
882///
883/// Convenience wrapper over [`parse_header_typed`] with
884/// [`HeaderForm::Date`].
885#[must_use]
886pub fn parse_date(raw_value: &[u8]) -> Option<HeaderDateTime> {
887 match parse_header_typed(HeaderForm::Date, raw_value) {
888 HeaderValueTyped::DateTime(dt) => dt,
889 _ => None,
890 }
891}
892
893/// Parse the header field value as an RFC 8621 §4.1.2.7 URLs form: bare
894/// URL strings with surrounding angle brackets stripped (RFC 2369).
895///
896/// Convenience wrapper over [`parse_header_typed`] with
897/// [`HeaderForm::URLs`].
898#[must_use]
899pub fn parse_urls(raw_value: &[u8]) -> Vec<String> {
900 match parse_header_typed(HeaderForm::URLs, raw_value) {
901 HeaderValueTyped::URLs(v) => v,
902 _ => Vec::new(),
903 }
904}
905
906/// Parse an existing [`ParsedHeader`]'s value into the requested
907/// RFC 8621 parsed form.
908///
909/// Uses `header.raw_value` (the original wire bytes) rather than
910/// `header.value.as_bytes()`, so non-UTF-8 bytes in structured headers
911/// (From, To, Date, etc.) are preserved faithfully. This matters for
912/// display names containing ISO-8859-1 or other non-UTF-8 encodings,
913/// which would be silently corrupted by lossy UTF-8 round-tripping.
914#[must_use]
915pub fn parse_header_typed_from(header: &ParsedHeader, form: HeaderForm) -> HeaderValueTyped {
916 parse_header_typed(form, &header.raw_value)
917}
918
919// ---------------------------------------------------------------------------
920// Conversion helpers
921// ---------------------------------------------------------------------------
922
/// Strip leading and trailing whitespace from `s`, yielding
/// `Some(trimmed)` when anything is left and `None` when the input was
/// empty or whitespace-only.
///
/// Trimming is done with [`str::trim`], so it removes every character
/// with the Unicode `White_Space` property (including e.g. U+00A0), not
/// only ASCII space/tab/CR/LF.
///
/// Single home for the RFC 8621 §4.1.2.3 / §4.1.2.4 display-name rule:
/// surrounding white space is removed and an empty result is reported as
/// `None`.
fn trim_or_none(s: &str) -> Option<String> {
    Some(s.trim())
        .filter(|stripped| !stripped.is_empty())
        .map(str::to_owned)
}
937
/// Return `raw_value` guaranteed to end in CRLF.
///
/// * Input already ending in `\r\n` is borrowed unchanged (no
///   allocation).
/// * Input ending in a bare `\n` has that LF replaced by `\r\n` in a new
///   buffer.
/// * Any other input (including empty input) is copied with `\r\n`
///   appended.
///
/// mail-parser's `MessageStream` parsers expect header field bodies as
/// they appear in a real RFC 5322 stream, i.e. CRLF-terminated, whereas
/// callers hand over the bare field value. This helper bridges the two.
fn crlf_terminated(raw_value: &[u8]) -> Cow<'_, [u8]> {
    if raw_value.ends_with(b"\r\n") {
        return Cow::Borrowed(raw_value);
    }

    // Drop a lone trailing LF, if any, so it is rewritten as CRLF rather
    // than left in front of the terminator.
    let body = raw_value.strip_suffix(b"\n").unwrap_or(raw_value);

    let mut terminated = Vec::with_capacity(body.len() + 2);
    terminated.extend_from_slice(body);
    terminated.extend_from_slice(b"\r\n");
    Cow::Owned(terminated)
}
963
964fn convert_addr(addr: &mail_parser::Addr<'_>) -> EmailAddress {
965 // RFC 8621 §4.1.2.3 mandates that for a quoted-string display name,
966 // surrounding DQUOTE characters be removed, quoted-pairs decoded, and
967 // white space unfolded with leading/trailing white space removed.
968 // mail-parser already does the dequoting and quoted-pair decoding, but
969 // leaves surrounding white space inside the quoted-string in place
970 // (e.g. `" James Smythe"` parses to `Some(" James Smythe")`). Strip
971 // here. An empty trimmed result is mapped to `None` so a lone empty
972 // quoted-string does not surface as a phantom display name.
973 let name = addr.name.as_ref().and_then(|s| trim_or_none(s.as_ref()));
974 EmailAddress {
975 name,
976 address: addr.address.as_ref().map(|s| s.as_ref().to_owned()),
977 }
978}
979
980/// Flatten an `Address` (which is either a flat list of mailboxes or a
981/// list of groups) into a single `Vec<EmailAddress>`. Used for
982/// [`HeaderForm::Addresses`], which per RFC 8621 §4.1.2.3 discards group
983/// structure and produces one item per mailbox.
984fn flatten_addresses(hv: &HeaderValue<'_>) -> Vec<EmailAddress> {
985 match hv {
986 HeaderValue::Address(Address::List(list)) => list.iter().map(convert_addr).collect(),
987 HeaderValue::Address(Address::Group(groups)) => groups
988 .iter()
989 .flat_map(|g| g.addresses.iter().map(convert_addr))
990 .collect(),
991 _ => Vec::new(),
992 }
993}
994
995/// Convert an `Address` into a list of groups, per RFC 8621 §4.1.2.4. A
996/// flat list of mailboxes is wrapped in a single group with `name = None`.
997fn group_addresses(hv: &HeaderValue<'_>) -> Vec<AddressGroup> {
998 match hv {
999 HeaderValue::Address(Address::List(list)) if !list.is_empty() => {
1000 vec![AddressGroup {
1001 name: None,
1002 addresses: list.iter().map(convert_addr).collect(),
1003 }]
1004 }
1005 HeaderValue::Address(Address::Group(groups)) => groups
1006 .iter()
1007 .map(|g| AddressGroup {
1008 // RFC 8621 §4.1.2.4: the group `name` is "processed the
1009 // same as the name in the EmailAddress type" — trim white
1010 // space; empty after trimming becomes None.
1011 name: g.name.as_ref().and_then(|s| trim_or_none(s.as_ref())),
1012 addresses: g.addresses.iter().map(convert_addr).collect(),
1013 })
1014 .collect(),
1015 _ => Vec::new(),
1016 }
1017}
1018
1019/// Extract bare msg-id strings from a `HeaderValue` produced by
1020/// mail-parser's `parse_id`.
1021///
1022/// `had_angle_brackets` indicates whether the original raw input
1023/// contained at least one `<` byte. mail-parser's `parse_id` returns
1024/// `HeaderValue::Text` both for a single valid bracket-stripped msg-id
1025/// and for its broken-client recovery branch on input with no brackets at
1026/// all (returning the lossy UTF-8 of the unparsed bytes). Without this
1027/// discriminator, malformed input would leak the unparsed bytes into the
1028/// result vec, violating the RFC 8621 §4.1.2.5 empty-on-malformed
1029/// contract.
1030fn extract_msg_ids(hv: &HeaderValue<'_>, had_angle_brackets: bool) -> Vec<String> {
1031 match hv {
1032 HeaderValue::Text(s) if had_angle_brackets => vec![s.as_ref().to_owned()],
1033 HeaderValue::TextList(list) => list.iter().map(|s| s.as_ref().to_owned()).collect(),
1034 _ => Vec::new(),
1035 }
1036}
1037
/// Extract URL strings from RFC 2369 / RFC 8621 §4.1.2.7 bracketed
/// list-URL syntax.
///
/// Scans `raw_value` once and yields, in order, the contents of every
/// `<...>` pair. Bytes outside the brackets are ignored — including
/// comments (`(...)`), CFWS, commas, stray `>` bytes, and any malformed
/// framing — per RFC 8621 §4.1.2.7: "Any value outside of the angle
/// bracket arguments MUST be ignored."
///
/// Rules applied to bracket contents:
///
/// * ASCII space, tab, CR and LF are dropped. RFC 3986 URIs cannot
///   contain literal whitespace, so any seen is a folding artifact.
/// * A `<` encountered while a bracket is already open restarts
///   collection at that `<`. `<` can never appear literally in a URI, so
///   the earlier `<` must have been unclosed; restarting keeps a broken
///   entry from swallowing the well-formed one that follows it.
/// * An unclosed `<` (no `>` before end of input) is ignored.
/// * An empty `<>` is ignored.
/// * Contents that are not valid UTF-8 are dropped.
///
/// Returns the collected URLs; an empty vector when none are found.
fn extract_bracketed_urls(raw_value: &[u8]) -> Vec<String> {
    let mut urls = Vec::new();
    // `Some(buf)` while inside an open `<`, `None` while outside brackets.
    let mut current: Option<Vec<u8>> = None;

    for &byte in raw_value {
        match byte {
            // Open a bracket, discarding any unterminated one before it.
            b'<' => current = Some(Vec::new()),
            b'>' => {
                // A `>` with no open bracket is outside-bracket noise;
                // `take()` returns `None` then and nothing happens.
                if let Some(buf) = current.take() {
                    if !buf.is_empty() {
                        if let Ok(url) = String::from_utf8(buf) {
                            urls.push(url);
                        }
                    }
                }
            }
            // Folding whitespace never belongs to a URL.
            b' ' | b'\t' | b'\r' | b'\n' => {}
            _ => {
                if let Some(buf) = current.as_mut() {
                    buf.push(byte);
                }
            }
        }
    }

    urls
}