Skip to main content

daaki_message/parser/interpret/
address.rs

1//! RFC 5322 Section 3.4 address parsing.
2//!
3//! Parses comma-separated address lists, single addresses (name-addr and
4//! addr-spec), RFC 5322 group syntax, parenthesized comments, and
5//! display-name phrase normalization.
6//!
7//! # References
8//! - RFC 5322 Section 3.4 (address specification)
9//! - RFC 5322 Section 3.2.2 (comments)
10//! - RFC 5322 Section 3.2.4 (quoted-string)
11//! - RFC 5322 Section 3.2.5 (phrase / display-name)
12//! - RFC 2047 Section 5 (encoded-words in phrase context)
13
14use super::{encoded_words, get_header_value};
15
16use crate::types::Address;
17
18/// Extracts all `From` addresses.
19///
20/// RFC 5322 Section 3.6.2: `from = "From:" mailbox-list CRLF` — multiple
21/// originator mailboxes are valid and all must be preserved.
22///
23/// Address structure is parsed first on the raw header value, then RFC 2047
24/// encoded words are decoded in each address's display name. Decoding before
25/// parsing would break address splitting when an encoded-word display name
26/// contains address-significant characters (`,`, `<`, `>`, `:`, `;`).
27///
28/// Extracts addresses from ALL occurrences of the `From` header.
29///
30/// RFC 5322 Section 3.6 specifies that `From` SHOULD appear at most once.
31/// However, broken mailers sometimes produce duplicate headers. Per Postel's
32/// law ("be liberal in what you accept"), we concatenate addresses from every
33/// occurrence — consistent with how [`extract_address_list`] handles
34/// To/Cc/Bcc/Reply-To.
35///
36/// # References
37/// - RFC 5322 Section 3.6.2 — originator fields (from = mailbox-list)
38/// - RFC 2047 Section 5 rule (3) — encoded-words in phrase context
39/// - RFC 5322 Section 3.4 — address specification
40pub(crate) fn extract_from(headers: &[(String, String)]) -> Vec<Address> {
41    // Iterate ALL matching "from" headers, not just the first, so that
42    // duplicate From headers produced by broken mailers are concatenated
43    // rather than silently dropped.
44    headers
45        .iter()
46        .filter(|(k, _)| k == "from")
47        .flat_map(|(_, v)| decode_address_names(parse_address_list(v)))
48        .collect()
49}
50
51/// Extracts the Sender mailbox from the `Sender` header (RFC 5322 Section 3.6.2).
52///
53/// Unlike `From` (which is a `mailbox-list`), `Sender` contains exactly one
54/// `mailbox`. If the header contains multiple addresses, only the first is
55/// used (Postel's law — be liberal in what you accept).
56///
57/// Returns `None` when the `Sender` header is absent.
58///
59/// # References
60/// - RFC 5322 Section 3.6.2 (sender field)
61pub(crate) fn extract_sender(headers: &[(String, String)]) -> Option<Address> {
62    let value = get_header_value(headers, "sender")?;
63    // Reuse the same parse-then-decode pipeline as From/To/Cc.
64    // RFC 5322 Section 3.6.2: sender = "Sender:" mailbox CRLF
65    let addrs = decode_address_names(parse_address_list(&value));
66    addrs.into_iter().next()
67}
68
69/// Extracts an address list from ALL occurrences of the named header.
70///
71/// RFC 5322 Section 3.6 specifies that destination address fields (To, Cc,
72/// Bcc) and Reply-To SHOULD appear at most once. However, broken mailers
73/// sometimes produce duplicate headers. Per Postel's law ("be liberal in
74/// what you accept"), we concatenate addresses from every occurrence to
75/// avoid silently dropping recipients.
76///
77/// Parses address structure first, then decodes RFC 2047 encoded words in
78/// display names — see [`extract_from`] for rationale.
79///
80/// # References
81/// - RFC 5322 Section 3.6.3 (destination address fields)
82pub(crate) fn extract_address_list(headers: &[(String, String)], name: &str) -> Vec<Address> {
83    headers
84        .iter()
85        .filter(|(k, _)| k == name)
86        .flat_map(|(_, v)| decode_address_names(parse_address_list(v)))
87        .collect()
88}
89
90/// Returns addresses unchanged — RFC 2047 decoding is now performed inside
91/// [`parse_single_address`] where the quoted-string vs unquoted-phrase
92/// context is known.
93///
94/// RFC 2047 Section 5: encoded-words MUST NOT appear inside a quoted-string.
95/// By decoding only in the unquoted-phrase path (and comment path) within
96/// `parse_single_address`, we correctly preserve encoded-word literals that
97/// appear inside quoted-strings.
98///
99/// This function is retained as a pass-through to avoid churning callers.
100///
101/// # References
102/// - RFC 2047 Section 5 (encoded-words placement rules)
103fn decode_address_names(addrs: Vec<Address>) -> Vec<Address> {
104    addrs
105}
106
107/// Parses a comma-separated address list, respecting quoted strings, angle
108/// brackets, parenthesized comments, and RFC 5322 group syntax
109/// (RFC 5322 Section 3.4).
110///
111/// This is the **liberal** address parser used internally to interpret
112/// inbound `From`/`To`/`Cc`/`Bcc`/`Reply-To`/`Sender` headers, exposed
113/// publicly so consumers can apply the same Postel-compliant parsing to
114/// other "be liberal in what you accept" inputs — for example,
115/// user-typed recipient strings in a compose form, or addresses already
116/// extracted from an IMAP `ENVELOPE` response.
117///
118/// # Behavior
119///
120/// - Returns `Vec<Address>` with one entry per recognized mailbox. The
121///   parser never errors: malformed segments are best-effort-recovered
122///   or silently dropped (Postel's law, RFC 1122 Section 1.2.2).
123/// - Group syntax (`display-name ":" [group-list] ";"`) is unwrapped
124///   and member addresses are flattened into the result. Empty groups
125///   (e.g., `undisclosed-recipients:;`) contribute no addresses.
126/// - Parenthesized comments (RFC 5322 Section 3.2.2) may appear in
127///   addr-spec CFWS contexts and can contain commas, angle brackets,
128///   and other address-significant characters; these are not treated
129///   as separators.
130/// - Domain-literals (`[192.0.2.1]`, `[IPv6:...]`) are preserved
131///   intact per RFC 5321 Section 4.1.3.
132/// - Display names are normalized: quoted-strings are unescaped, CFWS
133///   comments are stripped, and RFC 2047 encoded-words are decoded
134///   only in unquoted phrase spans (RFC 2047 Section 5 rule (3)).
135///
136/// # No outgoing validation
137///
138/// The returned [`Address`] records are constructed via
139/// [`Address::new_unchecked`] and may contain syntax that is technically
140/// non-conformant but still meaningful — exactly what is needed when
141/// receiving from the network. **They have not been validated against
142/// the strict outgoing-mail rules in RFC 5322 Section 3.4.**
143///
144/// If you are about to send mail — or otherwise need to enforce strict
145/// validation — pass each result through [`Address::new`] or
146/// [`Address::with_name`] afterwards. Those constructors apply the same
147/// rules the message builder uses and will reject malformed input at
148/// construction time rather than at send time.
149///
150/// # Input expectations
151///
152/// The input is a single, already-decoded address-list string.
153/// This function does **not** perform RFC 5322 Section 2.2.3 header
154/// unfolding, charset detection, or transfer-encoding decoding.
155/// Feeding it raw header bytes with CRLF folds, 8-bit content from
156/// unknown charsets, or quoted-printable sequences will produce wrong
157/// results — use [`parse_email`](crate::parse_email) for raw message
158/// bytes, and use this function for text that has already crossed the
159/// wire/semantic boundary (user input in a UTF-8 terminal, a decoded
160/// header value, etc.).
161///
162/// # Example
163///
164/// ```
165/// use daaki_message::{parse_address_list, Address};
166///
167/// let raw = r#""Doe, Jane" <jane@example.com>, alice@example.com"#;
168/// let addrs = parse_address_list(raw);
169///
170/// assert_eq!(addrs.len(), 2);
171/// assert_eq!(addrs[0].name.as_deref(), Some("Doe, Jane"));
172/// assert_eq!(addrs[0].email, "jane@example.com");
173/// assert_eq!(addrs[1].name, None);
174/// assert_eq!(addrs[1].email, "alice@example.com");
175///
176/// // For outgoing mail, re-validate each result through the strict
177/// // constructors so malformed input is rejected before send time.
178/// let validated: Result<Vec<Address>, _> = addrs
179///     .into_iter()
180///     .map(|a| match a.name {
181///         Some(name) => Address::with_name(name, a.email),
182///         None => Address::new(a.email),
183///     })
184///     .collect();
185/// assert!(validated.is_ok());
186/// ```
187///
188/// # References
189/// - RFC 5322 Section 3.4 (address specification)
190/// - RFC 5322 Section 3.2.2 (comments)
191/// - RFC 5322 Section 3.2.4 (quoted-string)
192/// - RFC 5322 Section 3.2.5 (phrase / display-name)
193/// - RFC 5321 Section 4.1.3 (domain-literal)
194/// - RFC 2047 Section 5 (encoded-words in phrase context)
195/// - RFC 1122 Section 1.2.2 (robustness principle)
196pub fn parse_address_list(input: &str) -> Vec<Address> {
197    let mut addresses = Vec::new();
198    let mut current = String::new();
199    let mut in_quotes = false;
200    let mut escaped = false;
201    let mut angle_depth: i32 = 0;
202    // Track parenthesized comment depth (RFC 5322 Section 3.2.2).
203    // Commas and other structural characters inside comments must not
204    // be treated as address separators.
205    let mut paren_depth: i32 = 0;
206    // Track whether we're inside a group construct (after ':' but before ';').
207    // RFC 5322 Section 3.4: group = display-name ":" [group-list] ";"
208    let mut in_group = false;
209    // Track whether we're inside a domain-literal `[...]`
210    // (RFC 5321 Section 4.1.3: domain-literal = "[" *dtext "]").
211    // Characters inside brackets (e.g., commas in IPv6 or non-standard
212    // domain-literals) must not be treated as structural separators.
213    let mut in_brackets = false;
214
215    for ch in input.chars() {
216        // Inside a quoted-string, a backslash escapes the next character
217        // (RFC 5322 Section 3.2.4 quoted-pair).
218        if escaped {
219            current.push(ch);
220            escaped = false;
221            continue;
222        }
223        match ch {
224            '\\' if in_quotes || paren_depth > 0 => {
225                // Backslash escapes next character in quoted-strings
226                // (RFC 5322 Section 3.2.4) and inside comments
227                // (RFC 5322 Section 3.2.2 quoted-pair in ccontent).
228                escaped = true;
229                current.push(ch);
230            }
231            '"' if paren_depth == 0 => {
232                in_quotes = !in_quotes;
233                current.push(ch);
234            }
235            // RFC 5322 Section 3.2.2: parenthesized comments may be nested.
236            // Track depth so that commas inside comments are not treated as
237            // address separators.
238            '(' if !in_quotes => {
239                paren_depth += 1;
240                current.push(ch);
241            }
242            ')' if !in_quotes && paren_depth > 0 => {
243                paren_depth -= 1;
244                current.push(ch);
245            }
246            // RFC 5321 Section 4.1.3: domain-literal = "[" *dtext "]".
247            // Track bracket depth so that commas and other structural
248            // characters inside domain-literals are not misinterpreted.
249            '[' if !in_quotes && paren_depth == 0 => {
250                in_brackets = true;
251                current.push(ch);
252            }
253            ']' if !in_quotes && paren_depth == 0 && in_brackets => {
254                in_brackets = false;
255                current.push(ch);
256            }
257            '<' if !in_quotes && paren_depth == 0 => {
258                angle_depth += 1;
259                current.push(ch);
260            }
261            '>' if !in_quotes && paren_depth == 0 && angle_depth > 0 => {
262                angle_depth -= 1;
263                current.push(ch);
264            }
265            // RFC 5322 Section 3.4: ':' starts a group construct when
266            // we're not inside quotes, angle brackets, comments, or an
267            // existing group.
268            // Heuristic: only treat as group if the current token contains
269            // no '@' outside of quoted strings and parenthesized comments
270            // (i.e., it's a display-name, not a bare addr-spec).  An '@'
271            // inside a quoted display-name (e.g., `"user@host":`) or a
272            // comment (e.g., `Group (user@host):`) must not prevent
273            // group detection (RFC 5322 Sections 3.2.2, 3.2.4).
274            ':' if !in_quotes
275                && angle_depth == 0
276                && paren_depth == 0
277                && !in_group
278                && !in_brackets =>
279            {
280                if contains_at_outside_quotes(current.trim()) {
281                    current.push(ch);
282                } else {
283                    // Enter group: discard the display-name portion
284                    in_group = true;
285                    current.clear();
286                }
287            }
288            // RFC 5322 Section 3.4: ';' terminates the group construct.
289            ';' if !in_quotes
290                && angle_depth == 0
291                && paren_depth == 0
292                && in_group
293                && !in_brackets =>
294            {
295                // Emit any pending address inside the group
296                if let Some(addr) = parse_single_address(&current) {
297                    addresses.push(addr);
298                }
299                current.clear();
300                in_group = false;
301            }
302            ',' if !in_quotes && angle_depth == 0 && paren_depth == 0 && !in_brackets => {
303                if let Some(addr) = parse_single_address(&current) {
304                    addresses.push(addr);
305                }
306                current.clear();
307            }
308            _ => current.push(ch),
309        }
310    }
311    if let Some(addr) = parse_single_address(&current) {
312        addresses.push(addr);
313    }
314
315    addresses
316}
317
318/// Parses a single address: either `Display Name <email>` or bare `email`.
319///
320/// Handles RFC 5322 Section 3.2.2 comments (parenthesized text) that may
321/// appear before or after a bare addr-spec per Section 3.4.1 CFWS rules.
322/// A trailing comment like `(Display Name)` is used as the display name,
323/// following the common RFC 822 convention.
324///
325/// # References
326/// - RFC 5322 Section 3.4 (address specification)
327/// - RFC 5322 Section 3.4.1 (addr-spec)
328/// - RFC 5322 Section 3.2.2 (comments)
329pub(crate) fn parse_single_address(input: &str) -> Option<Address> {
330    let input = input.trim();
331    if input.is_empty() {
332        return None;
333    }
334
335    // Try "Display Name <email@domain>" form (RFC 5322 Section 3.4)
336    if let Some(angle_start) = input.rfind('<') {
337        if let Some(angle_end) = input.rfind('>') {
338            if angle_end > angle_start {
339                let mut email = input[angle_start + 1..angle_end].trim().to_string();
340                // RFC 5322 Section 4.4: strip obsolete source route
341                // (obs-route = obs-domain-list ":"). Example:
342                // `<@hop1,@hop2:user@domain>` → `user@domain`.
343                if email.starts_with('@') {
344                    if let Some(colon) = email.find(':') {
345                        email = email[colon + 1..].trim().to_string();
346                    }
347                }
348                let name_part = input[..angle_start].trim();
349                let name = normalize_display_name_phrase(name_part);
350                if !email.is_empty() {
351                    return Some(Address { name, email });
352                }
353            }
354        }
355    }
356
357    // Bare email address — may have RFC 5322 Section 3.2.2 comments
358    // (parenthesized text) before or after the addr-spec per Section 3.4.1.
359    //
360    // Use `contains_at_outside_quotes` instead of plain `contains('@')`
361    // so that a quoted local-part containing `@` (e.g., `"user@internal"`)
362    // is not mistaken for an addr-spec when there is no structural `@`
363    // outside the quoted-string (RFC 5322 Section 3.4.1).
364    if contains_at_outside_quotes(input) {
365        // Check for a trailing comment like "user@example.com (Display Name)".
366        // RFC 822 convention: trailing parenthesized comment is the display name.
367        //
368        // Use `find_paren_outside_quotes` instead of plain `find('(')` so
369        // that parentheses inside a quoted local-part (RFC 5322 Section 3.2.4)
370        // are not mistaken for comment delimiters.
371        if let Some(paren_start) = find_paren_outside_quotes(input) {
372            let email_part = input[..paren_start].trim();
373            let comment_and_rest = input[paren_start..].trim();
374            let name = if !email_part.is_empty() && contains_at_outside_quotes(email_part) {
375                // Trailing comment: extract text between parentheses
376                // as display name (RFC 822 convention, RFC 5322 Section 3.4.1 CFWS).
377                // Decode RFC 2047 encoded words in the comment text
378                // (RFC 2047 Section 5 rule (2): encoded-words may appear in comments).
379                extract_comment_text(comment_and_rest)
380                    .map(|n| encoded_words::decode_encoded_words(&n))
381            } else if email_part.is_empty() || !contains_at_outside_quotes(email_part) {
382                // Leading comment: the comment appears before the addr-spec.
383                // RFC 5322 Section 3.2.2 allows comments in CFWS positions,
384                // and the common RFC 822 convention uses a leading comment as
385                // the display name (e.g., `(John Doe) user@example.com`).
386                // Verify the text after the comment contains an addr-spec.
387                let after_comment = strip_comments(comment_and_rest);
388                if contains_at_outside_quotes(after_comment.trim()) {
389                    extract_comment_text(comment_and_rest)
390                        .map(|n| encoded_words::decode_encoded_words(&n))
391                } else {
392                    None
393                }
394            } else {
395                None
396            };
397            // Strip all comments to get the bare addr-spec
398            // (RFC 5322 Section 3.2.2)
399            let stripped = strip_comments(input);
400            let email = stripped.trim().to_string();
401            if !email.is_empty() && contains_at_outside_quotes(&email) {
402                return Some(Address { name, email });
403            }
404        }
405        return Some(Address {
406            name: None,
407            email: input.to_string(),
408        });
409    }
410
411    None
412}
413
/// Returns the content of the first parenthesized comment in `s`.
///
/// `s` is trimmed and must then begin with `(`; otherwise `None` is
/// returned. The content runs up to the matching `)`, or to the end of the
/// string when the comment is unterminated. Nested parentheses are kept
/// literally, and a backslash makes the following character literal (the
/// backslash itself is dropped), per RFC 5322 Section 3.2.2.
///
/// The result is trimmed; an empty or whitespace-only comment gives `None`.
/// For example `(Display Name)` yields `Some("Display Name")`.
///
/// # References
/// - RFC 5322 Section 3.2.2 (comment syntax)
pub(crate) fn extract_comment_text(s: &str) -> Option<String> {
    // The opening paren is consumed here, so nesting starts at one.
    let body = s.trim().strip_prefix('(')?;
    let mut nesting: u32 = 1;
    let mut text = String::new();
    let mut chars = body.chars();
    while let Some(ch) = chars.next() {
        match ch {
            // Quoted-pair: keep only the escaped character.
            '\\' => {
                if let Some(literal) = chars.next() {
                    text.push(literal);
                }
            }
            '(' => {
                nesting = nesting.saturating_add(1);
                text.push(ch);
            }
            ')' => {
                nesting -= 1;
                if nesting == 0 {
                    // Matching close of the outermost comment: done.
                    break;
                }
                text.push(ch);
            }
            _ => text.push(ch),
        }
    }
    let text = text.trim();
    if text.is_empty() {
        None
    } else {
        Some(text.to_string())
    }
}
472
/// Reports whether `s` has an `@` that lies outside every quoted-string
/// and every parenthesized comment.
///
/// The group-address heuristic relies on this: an `@` in a quoted
/// display-name (`"user@host"`) or in a comment (`Group (user@host):`) does
/// not indicate an addr-spec and must not block recognition of group
/// syntax (RFC 5322 Section 3.4).
///
/// A backslash escapes the next character only inside a quoted-string or a
/// comment; double quotes inside a comment do not open a quoted-string.
///
/// # References
/// - RFC 5322 Section 3.4 (group syntax)
/// - RFC 5322 Section 3.2.2 (comment, quoted-pair inside comments)
/// - RFC 5322 Section 3.2.4 (quoted-string, quoted-pair)
pub(crate) fn contains_at_outside_quotes(s: &str) -> bool {
    let mut quoted = false;
    let mut comment_depth: u32 = 0;
    let mut chars = s.chars();
    while let Some(ch) = chars.next() {
        let in_comment = comment_depth > 0;
        if ch == '\\' {
            // Quoted-pair: skip the escaped character entirely.
            if quoted || in_comment {
                let _ = chars.next();
            }
        } else if ch == '"' {
            if !in_comment {
                quoted = !quoted;
            }
        } else if quoted {
            // Everything else inside a quoted-string is literal.
        } else if ch == '(' {
            // Comments nest (RFC 5322 Section 3.2.2).
            comment_depth = comment_depth.saturating_add(1);
        } else if ch == ')' {
            if in_comment {
                comment_depth -= 1;
            }
        } else if ch == '@' && !in_comment {
            return true;
        }
    }
    false
}
510
/// Finds the byte offset of the first `(` that is not within a
/// quoted-string, or `None` when there is none.
///
/// Parentheses in a quoted local-part (e.g. `"user(foo)"@example.com`) are
/// literal characters per RFC 5322 Section 3.2.4 and must not be taken for
/// comment delimiters. Inside a quoted-string a backslash escapes the next
/// character, so an escaped `"` does not end the string.
///
/// # References
/// - RFC 5322 Section 3.2.2 (comment syntax)
/// - RFC 5322 Section 3.2.4 (quoted-string)
pub(crate) fn find_paren_outside_quotes(s: &str) -> Option<usize> {
    let mut quoted = false;
    let mut cursor = s.char_indices();
    while let Some((offset, ch)) = cursor.next() {
        match (ch, quoted) {
            // Quoted-pair inside a quoted-string: skip the escaped character.
            ('\\', true) => {
                let _ = cursor.next();
            }
            ('"', _) => quoted = !quoted,
            ('(', false) => return Some(offset),
            _ => {}
        }
    }
    None
}
540
/// Removes parenthesized comments from `input` and returns the rest.
///
/// RFC 5322 Section 3.2.2 comments are enclosed in parentheses and may
/// nest. A backslash and the character after it are treated as one unit:
/// both are copied when outside a comment and both are dropped when inside
/// one. Parentheses within a quoted-string (RFC 5322 Section 3.2.4) are
/// literal and neither open nor close a comment. A stray `)` with no open
/// comment is kept, and an unterminated comment swallows the remainder of
/// the input.
///
/// # References
/// - RFC 5322 Section 3.2.2 (comment syntax)
/// - RFC 5322 Section 3.2.4 (quoted-string: parens are literal inside quotes)
/// - RFC 5322 Section 4.3 (CFWS in obsolete date syntax)
pub(crate) fn strip_comments(input: &str) -> String {
    let mut kept = String::with_capacity(input.len());
    let mut nesting: u32 = 0;
    // Only ever true while `nesting == 0`: quotes inside comments are inert.
    let mut quoted = false;
    let mut chars = input.chars();
    while let Some(ch) = chars.next() {
        let visible = nesting == 0;
        match ch {
            '\\' => {
                // Backslash plus the escaped character travel together.
                if visible {
                    kept.push(ch);
                }
                if let Some(escaped) = chars.next() {
                    if visible {
                        kept.push(escaped);
                    }
                }
            }
            '"' if visible => {
                quoted = !quoted;
                kept.push(ch);
            }
            '(' if !quoted => nesting = nesting.saturating_add(1),
            ')' if nesting > 0 => nesting -= 1,
            _ if visible => kept.push(ch),
            _ => {}
        }
    }
    kept
}
600
601/// Normalizes a `display-name` phrase from a `name-addr`.
602///
603/// RFC 5322 Section 3.2.5 defines `display-name = phrase`, where each `word`
604/// may be either an atom or a quoted-string. RFC 5322 Section 3.2.2 allows
605/// CFWS comments between those words, but comments are semantically
606/// invisible. RFC 2047 Section 5 additionally allows encoded-words only in
607/// the unquoted phrase context, never inside quoted-strings.
608///
609/// This helper therefore:
610/// - strips CFWS comments from the phrase,
611/// - collapses inter-word WSP to single spaces,
612/// - unquotes quoted-string words while preserving their literal contents,
613/// - decodes RFC 2047 encoded-words only in unquoted phrase spans.
614///
615/// # References
616/// - RFC 5322 Section 3.2.5 (phrase / display-name)
617/// - RFC 5322 Section 3.2.2 (comments)
618/// - RFC 2047 Section 5 (encoded-words in phrase context)
619pub(crate) fn normalize_display_name_phrase(name_part: &str) -> Option<String> {
620    let stripped = strip_comments(name_part);
621    let mut segments: Vec<String> = Vec::new();
622    let mut raw = String::new();
623    let mut quoted = String::new();
624    let mut in_quotes = false;
625    let mut escaped = false;
626
627    for c in stripped.chars() {
628        if in_quotes {
629            if escaped {
630                quoted.push(c);
631                escaped = false;
632                continue;
633            }
634
635            match c {
636                '\\' => {
637                    escaped = true;
638                    quoted.push(c);
639                }
640                '"' => {
641                    let unescaped = unescape_quoted_string(&quoted);
642                    if !unescaped.is_empty() {
643                        segments.push(unescaped);
644                    }
645                    quoted.clear();
646                    in_quotes = false;
647                }
648                _ => quoted.push(c),
649            }
650        } else if c == '"' {
651            push_decoded_phrase_segment(&mut segments, &raw);
652            raw.clear();
653            in_quotes = true;
654        } else {
655            raw.push(c);
656        }
657    }
658
659    // Unterminated quoted-string: fall back to treating the remainder as a
660    // raw phrase fragment rather than dropping it outright (Postel's law,
661    // RFC 1122 Section 1.2.2).
662    if in_quotes {
663        raw.push('"');
664        raw.push_str(&quoted);
665    }
666    push_decoded_phrase_segment(&mut segments, &raw);
667
668    if segments.is_empty() {
669        None
670    } else {
671        Some(segments.join(" "))
672    }
673}
674
/// Collapses every run of ASCII whitespace in `input` to a single space
/// and removes leading and trailing whitespace.
///
/// Used on unquoted phrase spans, where such whitespace is only a word
/// separator. Non-ASCII whitespace is left untouched.
///
/// # References
/// - RFC 5322 Section 3.2.5 (phrase whitespace)
fn normalize_phrase_whitespace(input: &str) -> String {
    let mut collapsed = String::with_capacity(input.len());
    for word in input.split_ascii_whitespace() {
        // Words are never empty, so an empty buffer means "first word".
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
682
683/// Decodes one unquoted `phrase` span and appends it to `segments` when it
684/// carries any semantic content.
685///
686/// # References
687/// - RFC 5322 Section 3.2.5 (phrase)
688/// - RFC 2047 Section 5 (encoded-words in phrases)
689fn push_decoded_phrase_segment(segments: &mut Vec<String>, raw: &str) {
690    let normalized = normalize_phrase_whitespace(raw);
691    if normalized.is_empty() {
692        return;
693    }
694
695    let decoded = encoded_words::decode_encoded_words(&normalized);
696    let decoded = normalize_phrase_whitespace(&decoded);
697    if !decoded.is_empty() {
698        segments.push(decoded);
699    }
700}
701
/// Resolves the quoted-pairs in the body of a quoted-string.
///
/// Every backslash is dropped and the character after it is kept verbatim,
/// so `\\` becomes `\`, `\"` becomes `"`, and `\x` becomes `x`. A lone
/// backslash at the very end has nothing to escape and is kept as-is.
///
/// Per RFC 5322 Section 3.2.4, a `quoted-pair` is `"\" (VCHAR / WSP)`.
///
/// # References
/// - RFC 5322 Section 3.2.4 (quoted-pair)
pub(crate) fn unescape_quoted_string(input: &str) -> String {
    let mut plain = String::with_capacity(input.len());
    let mut after_backslash = false;
    for ch in input.chars() {
        if after_backslash || ch != '\\' {
            // Ordinary character, or the escaped half of a quoted-pair.
            plain.push(ch);
            after_backslash = false;
        } else {
            after_backslash = true;
        }
    }
    if after_backslash {
        // Dangling backslash with nothing after it: preserve it.
        plain.push('\\');
    }
    plain
}