daaki-message 0.2.0

RFC 5322 email message parser and builder
Documentation
//! Message-ID, In-Reply-To, and References extraction.
//!
//! Extracts and validates RFC 5322 Section 3.6.4 identification fields,
//! handling both bracketed `<msg-id>` and bare fallback forms.
//!
//! # References
//! - RFC 5322 Section 3.6.4 (identification fields)
//! - RFC 5322 Section 4.5.4 (obsolete msg-id syntax)
//! - RFC 1122 Section 1.2.2 (Postel's law)

use super::{address, get_header_value, params};

use crate::types::is_valid_bare_message_id_body;

/// Returns the message identifier carried by the `Message-ID` header, without
/// surrounding angle brackets.
///
/// The bracketed `<id@host>` form (RFC 5322 Section 3.6.4) is preferred. When
/// no valid bracketed identifier is found, a bare `id@host` value is accepted
/// as a tolerance for broken mailers, provided it contains no angle brackets
/// and still passes `msg-id` body validation.
///
/// Returns `None` when the header is absent or holds no acceptable identifier.
///
/// # References
/// - RFC 5322 Section 3.6.4 (identification fields)
pub(crate) fn extract_message_id(headers: &[(String, String)]) -> Option<String> {
    let raw = get_header_value(headers, "message-id")?;

    // Preferred path: a well-formed `<...>` token somewhere in the value.
    extract_first_msg_id(&raw).or_else(|| {
        // Bare fallback: only when the value has no brackets at all, and only
        // when the remaining text is itself a syntactically valid msg-id body,
        // so arbitrary garbage is never promoted to a stable identifier.
        let bare = raw.trim();
        let acceptable = !bare.is_empty()
            && !bare.contains('<')
            && !bare.contains('>')
            && is_valid_bare_message_id_body(bare);
        acceptable.then(|| bare.to_string())
    })
}

/// Collects every message-id found in the `In-Reply-To` header(s).
///
/// RFC 5322 Section 3.6.4 defines `in-reply-to = "In-Reply-To:" 1*msg-id CRLF`,
/// so several identifiers in one field are legal. Identifiers are returned in
/// header order, without angle brackets.
///
/// Header names are compared exactly against the lowercase string
/// `"in-reply-to"`.
/// NOTE(review): this presumes names were lowercased upstream — confirm
/// against the header parser.
///
/// # References
/// - RFC 5322 Section 3.6.4 (identification fields)
pub(crate) fn extract_in_reply_to(headers: &[(String, String)]) -> Vec<String> {
    // Postel's law: a broken mailer may emit the header more than once, so the
    // identifiers of every occurrence are concatenated rather than keeping
    // only the first (same treatment as From/To/Cc/Bcc/Reply-To in
    // extract_header_fields).
    let mut ids = Vec::new();
    for (name, value) in headers {
        if name == "in-reply-to" {
            ids.extend(extract_all_msg_ids(value));
        }
    }
    ids
}

/// Collects every message-id found in the `References` header(s).
///
/// RFC 5322 Section 3.6.4 defines `references = "References:" 1*msg-id CRLF`.
/// Identifiers are returned in header order, without angle brackets.
///
/// Header names are compared exactly against the lowercase string
/// `"references"`.
/// NOTE(review): this presumes names were lowercased upstream — confirm
/// against the header parser.
///
/// # References
/// - RFC 5322 Section 3.6.4 (identification fields)
pub(crate) fn extract_references(headers: &[(String, String)]) -> Vec<String> {
    // Postel's law: a broken mailer may emit the header more than once, so the
    // identifiers of every occurrence are concatenated rather than keeping
    // only the first (same treatment as From/To/Cc/Bcc/Reply-To in
    // extract_header_fields).
    let mut ids = Vec::new();
    for (name, value) in headers {
        if name == "references" {
            ids.extend(extract_all_msg_ids(value));
        }
    }
    ids
}

/// Returns the first valid `<...>` message-id in a header value, without its
/// angle brackets.
///
/// Comments are stripped first. Bracketed tokens are then examined left to
/// right; a token whose normalized body is not a valid msg-id is skipped and
/// scanning resumes after its closing `>`. Returns `None` when no `<` remains,
/// when a `<` has no matching `>`, or when no token validates.
///
/// # References
/// - RFC 5322 Section 3.6.4 (msg-id syntax)
fn extract_first_msg_id(value: &str) -> Option<String> {
    let text = address::strip_comments(value);
    let mut cursor = 0usize;

    loop {
        // Running out of delimiters ends the search with `None`.
        let open = find_next_unquoted_msg_id_delim(&text, cursor, b'<')?;
        let close = find_next_unquoted_msg_id_delim(&text, open + 1, b'>')?;

        // RFC 5322 Section 3.6.4: the text inside angle brackets must still
        // be a syntactically valid msg-id body.
        let candidate = normalize_obs_msg_id_body(text[open + 1..close].trim());
        if is_valid_bare_message_id_body(&candidate) {
            return Some(candidate);
        }

        cursor = close + 1;
    }
}

/// Extracts all message-ids from a header value, in order of appearance.
///
/// Comments are stripped first. Each unquoted `<...>` token contributes its
/// normalized body when that body is a valid msg-id. Text between, before and
/// after bracketed tokens is additionally scanned for whitespace-separated
/// bare msg-ids as a Postel's law accommodation.
///
/// If a `<` has no matching `>`, everything after it is treated as bare text
/// and scanning stops. Every part of the value is scanned exactly once, so an
/// identifier is never reported twice because of an unterminated bracket.
///
/// # References
/// - RFC 5322 Section 3.6.4 (msg-id syntax)
/// - RFC 1122 Section 1.2.2 (Postel's law — tolerate bare msg-ids)
fn extract_all_msg_ids(value: &str) -> Vec<String> {
    let uncommented = address::strip_comments(value);
    let mut ids = Vec::new();
    let mut offset = 0usize;

    while let Some(start) = find_next_unquoted_msg_id_delim(&uncommented, offset, b'<') {
        // Bare identifiers sitting between the previous token and this `<`.
        ids.extend(extract_bare_msg_ids_segment(&uncommented[offset..start]));

        let Some(end) = find_next_unquoted_msg_id_delim(&uncommented, start + 1, b'>') else {
            // Unterminated `<`: recover bare identifiers from the remainder.
            // The whole value has now been consumed, so return directly;
            // falling through to the trailing scan would re-read
            // `uncommented[offset..]` and duplicate identifiers already
            // collected above.
            ids.extend(extract_bare_msg_ids_segment(&uncommented[start + 1..]));
            return ids;
        };

        let normalized = normalize_obs_msg_id_body(uncommented[start + 1..end].trim());
        // RFC 5322 Section 3.6.4: each bracketed token must contain a
        // valid msg-id body, not arbitrary text.
        if is_valid_bare_message_id_body(&normalized) {
            ids.push(normalized);
        }
        offset = end + 1;
    }

    // Bare identifiers after the last bracketed token (or the entire value
    // when it contained no `<` at all).
    ids.extend(extract_bare_msg_ids_segment(&uncommented[offset..]));
    ids
}

/// Removes obsolete whitespace from a bracketed `msg-id` body.
///
/// RFC 5322 Section 4.5.4 defines `obs-id-left = local-part` and
/// `obs-id-right = domain`; CFWS permitted inside those productions is not
/// semantically part of the identifier. This function drops every space and
/// horizontal tab that appears outside a double-quoted string so the result
/// can be validated against the modern `msg-id` shape.
///
/// Inside a quoted string every character is kept verbatim, and a backslash
/// escapes the following character, so an escaped `"` does not end the
/// string. Other characters, including line breaks, are never removed.
///
/// # References
/// - RFC 5322 Section 3.6.4
/// - RFC 5322 Section 4.5.4
fn normalize_obs_msg_id_body(value: &str) -> String {
    /// Scanner position relative to double-quoted strings.
    #[derive(Clone, Copy)]
    enum Scan {
        /// Outside any quoted string.
        Plain,
        /// Inside a quoted string.
        Quoted,
        /// Inside a quoted string, right after an unescaped backslash.
        QuotedEscape,
    }

    let mut out = String::with_capacity(value.len());
    let mut state = Scan::Plain;

    for ch in value.chars() {
        state = match (state, ch) {
            // RFC 5322 Section 4.5.4: CFWS inside obs-id-left / obs-id-right
            // is not semantically part of the identifier body.
            (Scan::Plain, ' ' | '\t') => continue,
            (Scan::Plain, '"') => Scan::Quoted,
            (Scan::Plain, _) => Scan::Plain,
            (Scan::Quoted, '\\') => Scan::QuotedEscape,
            (Scan::Quoted, '"') => Scan::Plain,
            (Scan::Quoted, _) => Scan::Quoted,
            // Whatever follows a backslash is literal and stays quoted.
            (Scan::QuotedEscape, _) => Scan::Quoted,
        };
        out.push(ch);
    }

    out
}

/// Locates the next structural occurrence of `target` at or after byte index
/// `start`.
///
/// `msg-id` fields use angle brackets as structural delimiters (RFC 5322
/// Section 3.6.4). Broken senders sometimes include `<...>` inside quoted
/// explanatory text, which must not be treated as a real identifier, so a
/// matching byte is accepted only when `params::is_inside_quotes` and
/// `params::is_inside_comment` both report `false` for its position.
///
/// Returns the byte index of the delimiter, or `None` when there is none
/// (including when `start` lies past the end of `value`).
///
/// # References
/// - RFC 5322 Section 3.6.4
/// - RFC 5322 Section 3.2.2
/// - RFC 5322 Section 3.2.4
fn find_next_unquoted_msg_id_delim(value: &str, start: usize, target: u8) -> Option<usize> {
    for (idx, &byte) in value.as_bytes().iter().enumerate().skip(start) {
        if byte != target {
            continue;
        }
        // A delimiter buried in a quoted string or a comment is not structural.
        if params::is_inside_quotes(value, idx) || params::is_inside_comment(value, idx) {
            continue;
        }
        return Some(idx);
    }
    None
}

/// Picks out bare `msg-id` bodies from a header segment that contains no
/// bracketed tokens.
///
/// The segment is split on whitespace and each piece that passes msg-id body
/// validation is returned, in order. This is a Postel's law accommodation for
/// senders that omit the angle brackets.
///
/// # References
/// - RFC 1122 Section 1.2.2 (Postel's law)
/// - RFC 5322 Section 3.6.4 (canonical form still requires `msg-id`)
fn extract_bare_msg_ids_segment(segment: &str) -> Vec<String> {
    let mut recovered = Vec::new();
    for candidate in segment.split_whitespace() {
        if is_valid_bare_message_id_body(candidate) {
            recovered.push(candidate.to_string());
        }
    }
    recovered
}