daaki-message 0.2.0

//! Semantic interpreter for wire-parsed email messages.
//!
//! Converts a [`WireMessage`] (raw headers + body bytes) into a fully
//! interpreted [`ParsedEmail`] by performing RFC 2047 decoding, address
//! parsing, date parsing, MIME tree walking, charset conversion, and
//! content-transfer-encoding decoding.
//!
//! # References
//! - RFC 5322 (Internet Message Format)
//! - RFC 2045 (MIME Part One — body format, Content-Transfer-Encoding)
//! - RFC 2046 (MIME Part Two — media types, multipart boundaries)
//! - RFC 2047 (MIME Part Three — encoded words in headers)
//! - RFC 2183 (Content-Disposition)
//! - RFC 2231 (MIME parameter encoding)
//! - RFC 6532 (Internationalized email headers)

mod address;
mod date;
mod encoded_words;
mod message_id;
mod mime;
mod params;

use crate::error::Error;
use crate::types::{Address, DateTime, ParsedEmail};

use super::wire::WireMessage;

// Re-export pub(crate) items so parser/mod.rs can reach them.
pub(crate) use address::find_paren_outside_quotes;
pub(crate) use address::normalize_display_name_phrase;
pub use address::parse_address_list;
pub(crate) use address::strip_comments;
pub(crate) use date::parse_rfc5322_date;
pub(crate) use encoded_words::decode_encoded_words;

// Re-export items used by parser/tests.rs (only needed under cfg(test)
// since no non-test code in the parent module references them directly).
#[cfg(test)]
pub(super) use address::contains_at_outside_quotes;
#[cfg(test)]
pub(super) use address::extract_comment_text;
#[cfg(test)]
pub(super) use address::parse_single_address;
#[cfg(test)]
pub(super) use address::unescape_quoted_string;
#[cfg(test)]
pub(super) use date::parse_timezone;
#[cfg(test)]
pub(super) use date::parse_year;
#[cfg(test)]
pub(super) use encoded_words::decode_q_encoding;
#[cfg(test)]
pub(super) use mime::decode_body;
#[cfg(test)]
pub(super) use mime::decode_quoted_printable;
#[cfg(test)]
pub(super) use mime::decode_transfer_encoding;
#[cfg(test)]
pub(super) use params::decode_hex_pair;
#[cfg(test)]
pub(super) use params::extract_filename;
#[cfg(test)]
pub(super) use params::extract_mime_type;
#[cfg(test)]
pub(super) use params::extract_param;
#[cfg(test)]
pub(super) use params::extract_rfc2231_continuation;
#[cfg(test)]
pub(super) use params::extract_rfc2231_param;
#[cfg(test)]
pub(super) use params::find_closing_quote;
#[cfg(test)]
pub(super) use params::find_param_value;
#[cfg(test)]
pub(super) use params::hex_digit;
#[cfg(test)]
pub(super) use params::is_disposition_type;
#[cfg(test)]
pub(super) use params::is_inside_quotes;
#[cfg(test)]
pub(super) use params::percent_decode;
#[cfg(test)]
pub(super) use params::strip_outer_quotes;

/// Upper bound on MIME nesting depth, guarding against stack overflow on
/// pathological (deeply nested) input.
///
/// RFC 2046 does not specify a limit; 64 is generous for real-world messages.
/// The `pub(super)` visibility makes the constant nameable from the parent
/// module as well as from this module's submodules.
///
/// NOTE(review): the limit is not enforced anywhere in this file — presumably
/// `mime::walk_mime_tree` checks its depth argument against it; confirm in
/// the `mime` submodule.
pub(super) const MAX_MIME_DEPTH: u32 = 64;

/// Lenient base64 engine that accepts both padded and unpadded input
/// (RFC 2045 Section 6.8).
///
/// Built in a `const` context so it can be shared without lazy
/// initialisation. Two choices define its behaviour:
///
/// - the standard base64 alphabet (`base64::alphabet::STANDARD`), and
/// - `DecodePaddingMode::Indifferent`, so trailing `=` padding is neither
///   required nor rejected when decoding.
///
/// All other engine settings are the `GeneralPurposeConfig::new()` defaults.
pub(super) const LENIENT_BASE64: base64::engine::GeneralPurpose =
    base64::engine::GeneralPurpose::new(
        // Standard alphabet (not the URL-safe variant).
        &base64::alphabet::STANDARD,
        base64::engine::GeneralPurposeConfig::new()
            // Accept input with or without `=` padding.
            .with_decode_padding_mode(base64::engine::DecodePaddingMode::Indifferent),
    );

/// Structured header fields extracted from an RFC 5322 message.
///
/// Built by [`extract_header_fields`] and moved field-by-field into
/// [`ParsedEmail`] by [`interpret`], so the header extraction logic is shared
/// between the headers-only and the full-message code paths.
///
/// The `Default` value (every field `None`/empty) is what [`interpret`] uses
/// for messages the wire parser flagged as headerless.
///
/// # References
/// - RFC 5322 Section 3.6 (field definitions)
#[derive(Default)]
struct HeaderFields {
    /// Result of `message_id::extract_message_id`; `None` when absent.
    message_id: Option<String>,
    /// Result of `message_id::extract_in_reply_to`.
    in_reply_to: Vec<String>,
    /// Result of `message_id::extract_references`.
    references: Vec<String>,
    /// First `subject` header, with RFC 2047 encoded words decoded and the
    /// structural leading SP/HTAB removed when the body began on a folded
    /// continuation line.
    subject: Option<String>,
    /// Result of `address::extract_from`.
    from: Vec<Address>,
    /// RFC 5322 Section 3.6.2: `sender = "Sender:" mailbox`.
    sender: Option<Address>,
    /// Result of `address::extract_address_list` for the `to` header.
    to: Vec<Address>,
    /// Result of `address::extract_address_list` for the `cc` header.
    cc: Vec<Address>,
    /// Result of `address::extract_address_list` for the `bcc` header.
    bcc: Vec<Address>,
    /// Result of `address::extract_address_list` for the `reply-to` header.
    reply_to: Vec<Address>,
    /// Result of `date::extract_date`; `None` when absent.
    date: Option<DateTime>,
    /// Optional fields (RFC 5322 Section 3.6.8) — headers not in the
    /// well-known set, stored as `(lowercase-name, value)` pairs.
    extra_headers: Vec<(String, String)>,
}

/// Well-known header names that are extracted into dedicated fields.
///
/// Headers not in this set are collected into `extra_headers`
/// (RFC 5322 Section 3.6.8: optional fields). `Content-Disposition` is kept
/// in `extra_headers` as well as being interpreted for top-level body
/// classification, and `Content-ID` is kept as well as being consulted for
/// top-level inline classification, so header-only consumers can still inspect
/// RFC 2183 Section 2.10 and RFC 2045 Section 7 metadata.
///
/// Entries are lowercase and are compared with an exact string match against
/// the parsed header names, so this filter only works if those names are
/// already lowercase.
/// NOTE(review): presumably the wire parser lowercases field names — confirm.
///
/// `content-type`, `content-transfer-encoding` and `mime-version` have no
/// field of their own in `HeaderFields`; listing them here only keeps them
/// out of `extra_headers`. The first two are read directly from the wire
/// headers by `interpret` when decoding the body.
const WELL_KNOWN_HEADERS: &[&str] = &[
    "from",
    "to",
    "cc",
    "bcc",
    "reply-to",
    "sender",
    "subject",
    "date",
    "message-id",
    "in-reply-to",
    "references",
    "content-type",
    "content-transfer-encoding",
    "mime-version",
];

/// Structured header fields where RFC 2047 encoded-words MUST NOT appear
/// (RFC 2047 Section 5). These headers have their own syntax rules and
/// `=?charset?encoding?text?=` sequences must be treated as literal text.
///
/// Includes trace fields (RFC 5321), authentication results
/// (RFC 8601), and DKIM/ARC signature headers (RFC 6376, RFC 8617).
///
/// NOTE: Resent address fields (`Resent-From`, `Resent-Sender`, `Resent-To`,
/// `Resent-Cc`, `Resent-Bcc`, `Resent-Reply-To`) are intentionally excluded.
/// RFC 5322 Section 3.6.6 says each uses the same syntax as its non-Resent
/// counterpart (mailbox / address-list), whose `phrase` production permits
/// encoded-words per RFC 2047 Section 5 rule (3). Only `Resent-Date` and
/// `Resent-Message-ID` remain because they contain no `phrase` production.
///
/// The list is consulted only while building `extra_headers`: a header whose
/// name is found here keeps its value verbatim, every other extra header is
/// run through `decode_encoded_words`. Entries are lowercase and matched with
/// an exact string comparison against the parsed header name.
const STRUCTURED_HEADERS: &[&str] = &[
    "content-disposition",
    "content-id",
    "received",
    "return-path",
    "resent-date",
    "resent-message-id",
    "dkim-signature",
    "domainkey-signature",
    "arc-seal",
    "arc-message-signature",
    "arc-authentication-results",
    "authentication-results",
];

/// Turns a wire-parsed message into a fully structured [`ParsedEmail`].
///
/// Header fields are extracted first, or defaulted when the wire parser
/// flagged the message as headerless. Unless `headers_only` is set, the body
/// is then decoded: a multipart message with a usable boundary is handed to
/// the MIME tree walker, anything else is treated as a single-part body.
///
/// When `headers_only` is true, body/MIME processing is skipped entirely:
/// `body_text` and `body_html` are `None` and `attachments` is empty.
///
/// # Errors
/// Propagates any error returned by [`extract_header_fields`].
///
/// # References
/// - RFC 5322 (Internet Message Format)
/// - RFC 2045–2047 (MIME)
/// - RFC 2183 (Content-Disposition)
/// - RFC 2231 (MIME parameter encoding)
pub(crate) fn interpret(wire_msg: &WireMessage, headers_only: bool) -> Result<ParsedEmail, Error> {
    let hf = if wire_msg.headerless {
        HeaderFields::default()
    } else {
        extract_header_fields(&wire_msg.headers, &wire_msg.raw_headers)?
    };

    let (body_text, body_html, attachments) = if headers_only {
        // Header-only callers never pay for body decoding.
        (None, None, Vec::new())
    } else {
        let wire_headers = &wire_msg.headers;
        let body_bytes = &wire_msg.body;

        // Top-level entity headers, with the RFC-mandated defaults.
        let content_type = get_header_value(wire_headers, "content-type")
            .unwrap_or_else(|| "text/plain; charset=us-ascii".to_string());
        // RFC 2045 Section 6.1: default Content-Transfer-Encoding is "7bit".
        let transfer_encoding = get_header_value(wire_headers, "content-transfer-encoding")
            .unwrap_or_else(|| "7bit".to_string());
        let content_disposition =
            get_header_value(wire_headers, "content-disposition").unwrap_or_default();
        let content_id = get_header_value(wire_headers, "content-id");

        // A boundary is only looked up for multipart media types.
        let multipart = params::is_multipart(&content_type);
        let boundary = if multipart {
            params::extract_boundary_for_body(&content_type, body_bytes)
        } else {
            None
        };

        match boundary {
            Some(boundary) => {
                let mime_type = params::extract_mime_type(&content_type);
                let is_digest = mime_type == "multipart/digest";
                // RFC 2046 Section 5.1.4: multipart/alternative lists parts
                // in order of increasing faithfulness — prefer the last match.
                let is_alternative = mime_type == "multipart/alternative";
                mime::walk_mime_tree(body_bytes, &boundary, "", 0, is_digest, is_alternative)
            }
            None => {
                // Multipart with no usable boundary parameter: gracefully
                // degrade to text/plain since the MIME parts cannot be split
                // (RFC 2046 Section 5.1.1 — boundary is required for
                // multipart). Non-multipart bodies keep their declared type.
                let effective_type = if multipart {
                    "text/plain; charset=us-ascii"
                } else {
                    content_type.as_str()
                };
                mime::extract_simple_body(
                    body_bytes,
                    effective_type,
                    &transfer_encoding,
                    &content_disposition,
                    content_id.as_deref(),
                )
            }
        }
    };

    Ok(ParsedEmail {
        message_id: hf.message_id,
        in_reply_to: hf.in_reply_to,
        references: hf.references,
        subject: hf.subject,
        from: hf.from,
        sender: hf.sender,
        to: hf.to,
        cc: hf.cc,
        bcc: hf.bcc,
        reply_to: hf.reply_to,
        date: hf.date,
        body_text,
        body_html,
        attachments,
        raw_headers: wire_msg.raw_headers.clone(),
        extra_headers: hf.extra_headers,
        size: wire_msg.size,
    })
}

// ---------------------------------------------------------------------------
// Header field extraction
// ---------------------------------------------------------------------------

/// Builds a [`HeaderFields`] from the parsed `(name, value)` header pairs.
///
/// Well-known headers are mapped to dedicated fields; every remaining header
/// is collected, in parse order, into `extra_headers`
/// (RFC 5322 Section 3.6.8).
///
/// `raw_headers` is the unparsed header block. It is only used to work out
/// which field bodies began on their first folded continuation line, so that
/// the structural leading SP/HTAB can be dropped from `subject` and from the
/// extra headers.
///
/// # Errors
/// Returns `Error::MissingFrom` when `headers` is empty.
///
/// # References
/// - RFC 5322 (Internet Message Format — address, date-time, identification)
/// - RFC 2047 (MIME encoded words in headers)
fn extract_header_fields(
    headers: &[(String, String)],
    raw_headers: &str,
) -> Result<HeaderFields, Error> {
    // RFC 5322 Section 3.6.8 permits optional fields outside the well-known
    // set, so a header block made up entirely of optional/custom fields is
    // accepted. Only input with no syntactically valid header field at all
    // is rejected, which lets callers inspect partial or malformed messages.
    if headers.is_empty() {
        return Err(Error::MissingFrom);
    }

    let continuation_flags = header_body_starts_on_continuation_flags(raw_headers);

    // RFC 5322 Section 3.6.8: optional fields are all headers outside the
    // well-known set. A header with no matching flag is treated as not
    // starting on a continuation line.
    let mut extra_headers: Vec<(String, String)> = Vec::new();
    for (idx, (name, value)) in headers.iter().enumerate() {
        if WELL_KNOWN_HEADERS.contains(&name.as_str()) {
            continue;
        }

        let on_continuation = continuation_flags.get(idx).copied().unwrap_or(false);
        let body = if on_continuation {
            strip_leading_structural_wsp(value)
        } else {
            value.as_str()
        };

        // RFC 2047 Section 5: encoded-words MUST NOT appear in structured
        // header fields, so those are kept verbatim. Unstructured fields are
        // decoded so callers get human-readable text.
        let rendered = if STRUCTURED_HEADERS.contains(&name.as_str()) {
            body.to_string()
        } else {
            decode_encoded_words(body)
        };

        extra_headers.push((name.clone(), rendered));
    }

    Ok(HeaderFields {
        message_id: message_id::extract_message_id(headers),
        in_reply_to: message_id::extract_in_reply_to(headers),
        references: message_id::extract_references(headers),
        // RFC 5322 Section 2.2.3: when the field body begins on the first
        // continuation line, one leading SP/HTAB is merely the structural
        // separator in front of the body and must not leak into the
        // consumer-facing unstructured value.
        subject: get_header_value_with_continuation_flag(headers, &continuation_flags, "subject")
            .map(|(raw, on_continuation)| {
                let body = if on_continuation {
                    strip_leading_structural_wsp(&raw)
                } else {
                    raw.as_str()
                };
                decode_encoded_words(body)
            }),
        from: address::extract_from(headers),
        sender: address::extract_sender(headers),
        to: address::extract_address_list(headers, "to"),
        cc: address::extract_address_list(headers, "cc"),
        bcc: address::extract_address_list(headers, "bcc"),
        reply_to: address::extract_address_list(headers, "reply-to"),
        date: date::extract_date(headers),
        extra_headers,
    })
}

/// Returns the value of the first header matching `name` (case-insensitive).
///
/// Field names are compared ASCII-case-insensitively, as RFC 5322 field names
/// are restricted to printable US-ASCII. Headers are scanned in slice order
/// and the first match wins; later duplicates are ignored.
///
/// Returns an owned copy of the value, or `None` when no header matches.
///
/// # References
/// - RFC 5322 Section 2.2 (header fields)
pub(super) fn get_header_value(headers: &[(String, String)], name: &str) -> Option<String> {
    headers
        .iter()
        // `eq_ignore_ascii_case` keeps the documented contract true even if a
        // caller passes a mixed-case name or the stored names were not
        // lowercased upstream.
        .find(|(field, _)| field.eq_ignore_ascii_case(name))
        .map(|(_, value)| value.clone())
}

/// Looks up the first header named `name` and reports whether its field body
/// began on the first folded continuation line.
///
/// The name comparison is an exact string match. `continuation_flags` must be
/// aligned with `headers` in parse order; a header without a corresponding
/// flag is reported as `false`.
///
/// Returns `None` when no header matches; otherwise an owned copy of the
/// value paired with its flag.
fn get_header_value_with_continuation_flag(
    headers: &[(String, String)],
    continuation_flags: &[bool],
    name: &str,
) -> Option<(String, bool)> {
    let idx = headers.iter().position(|(field, _)| field == name)?;
    // The flag slice may be shorter than `headers`; treat a gap as "no".
    let on_continuation = continuation_flags.get(idx).copied().unwrap_or(false);
    Some((headers[idx].1.clone(), on_continuation))
}

/// Drops the single structural separator at the front of an unfolded field
/// body.
///
/// RFC 5322 Section 2.2 lets a field body start after optional WSP following
/// `field-name:`. When the body starts on the first folded continuation
/// line, RFC 5322 Section 2.2.3 unfolding keeps that continuation WSP byte
/// even though it plays the same structural role.
///
/// Exactly one leading SP or HTAB is removed; any further leading WSP is
/// preserved as data. A value that does not start with SP/HTAB (including the
/// empty string) is returned unchanged.
fn strip_leading_structural_wsp(value: &str) -> &str {
    // A char-set pattern strips at most one matching character.
    value.strip_prefix([' ', '\t']).unwrap_or(value)
}

/// Computes, in header parse order, one flag per header field telling whether
/// its body begins only on the first folded continuation line.
///
/// RFC 5322 Section 2.2.3 unfolding preserves the continuation line's leading
/// WSP. When the initial header line holds no non-WSP field-body content,
/// that first continuation SP/HTAB is only a structural separator in front of
/// the body rather than semantic content.
///
/// Lines are split on `\n` with one optional trailing `\r` removed. Empty
/// lines, continuation lines, lines without a colon, and lines whose name is
/// rejected by `HeaderName::new` produce no flag.
fn header_body_starts_on_continuation_flags(raw_headers: &str) -> Vec<bool> {
    /// A folded continuation line starts with SP or HTAB.
    fn is_folded(line: &str) -> bool {
        line.starts_with(' ') || line.starts_with('\t')
    }

    // Physical lines with the CR of a CRLF terminator removed, so the line
    // after a header can be inspected by index.
    let physical: Vec<&str> = raw_headers
        .split('\n')
        .map(|raw| raw.strip_suffix('\r').unwrap_or(raw))
        .collect();

    let mut flags = Vec::new();
    for (idx, line) in physical.iter().copied().enumerate() {
        // Only lines that open a new header field can contribute a flag.
        if line.is_empty() || is_folded(line) {
            continue;
        }

        let Some((name_part, body_part)) = line.split_once(':') else {
            continue;
        };
        if crate::types::HeaderName::new(name_part.trim()).is_err() {
            continue;
        }

        // The body "starts on the continuation" when nothing but WSP follows
        // the colon and the very next physical line is a folded one.
        let first_line_blank = body_part.bytes().all(|b| matches!(b, b' ' | b'\t'));
        let starts_on_continuation =
            first_line_blank && physical.get(idx + 1).copied().is_some_and(is_folded);
        flags.push(starts_on_continuation);
    }

    flags
}