daaki-message 0.2.0

//! RFC 2046 MIME tree walking and RFC 2045 body decoding.
//!
//! Walks multipart MIME trees extracting body text, body HTML, and
//! attachments with IMAP section numbers. Handles Content-Transfer-Encoding
//! (base64, quoted-printable) and charset conversion.
//!
//! # References
//! - RFC 2045 (MIME Part One — body format, Content-Transfer-Encoding)
//! - RFC 2046 (MIME Part Two — media types, multipart boundaries)
//! - RFC 2183 (Content-Disposition)
//! - RFC 3501 Section 6.4.5 (IMAP section numbering)

use super::{encoded_words, get_header_value, params, LENIENT_BASE64, MAX_MIME_DEPTH};

use crate::types::ParsedAttachment;

use super::super::wire;

use base64::Engine as _;

/// Folds the results produced by one child part into the accumulators kept
/// while walking a multipart container.
///
/// `body_text`, `body_html`, and `attachments` are the running totals for the
/// container; `text`, `html`, and `more_attachments` are what the child
/// yielded.
///
/// `is_alternative` indicates the container is `multipart/alternative`. Per
/// RFC 2046 Section 5.1.4, alternatives appear in order of increasing
/// faithfulness, so a body produced by a later child replaces the one
/// already stored. For every other container the first body of each kind is
/// kept. A child that produced no body of a given kind (`None`) never clears
/// an existing one, and attachments are always appended.
///
/// # References
/// - RFC 2046 Section 5.1 (multipart structure)
/// - RFC 2046 Section 5.1.4 (multipart/alternative)
fn merge_mime_results(
    body_text: &mut Option<String>,
    body_html: &mut Option<String>,
    attachments: &mut Vec<ParsedAttachment>,
    text: Option<String>,
    html: Option<String>,
    more_attachments: Vec<ParsedAttachment>,
    is_alternative: bool,
) {
    // A slot accepts a new body when it is still empty, or unconditionally
    // inside multipart/alternative where the later part is the more faithful
    // representation (RFC 2046 Section 5.1.4).
    if is_alternative || body_text.is_none() {
        if let Some(candidate) = text {
            *body_text = Some(candidate);
        }
    }
    if is_alternative || body_html.is_none() {
        if let Some(candidate) = html {
            *body_html = Some(candidate);
        }
    }

    // Attachments from every child are collected regardless of container type.
    attachments.extend(more_attachments);
}

/// Walks one level of a MIME multipart body (recursing into nested
/// multiparts) and returns `(body_text, body_html, attachments)`.
///
/// `body` is split on `boundary`; each resulting part is numbered from 1 in
/// order of appearance and its IMAP section is `section_prefix` joined to
/// that number with a dot (or just the number when the prefix is empty).
///
/// `depth` is the current nesting level; once it exceeds `MAX_MIME_DEPTH`
/// the walk stops and returns an empty result.
///
/// `is_digest` indicates this container is `multipart/digest`, in which case
/// the default Content-Type for parts without an explicit header is
/// `message/rfc822` instead of `text/plain; charset=us-ascii`
/// (RFC 2046 Section 5.1.5).
///
/// `is_alternative` indicates this container is `multipart/alternative`.
/// Per RFC 2046 Section 5.1.4, alternatives appear in order of increasing
/// faithfulness, so the *last* matching text/plain or text/html part is
/// preferred: later parts overwrite earlier ones instead of being skipped.
///
/// # References
/// - RFC 2046 Section 5.1 (multipart structure)
/// - RFC 2046 Section 5.1.4 (multipart/alternative)
/// - RFC 2046 Section 5.1.5 (multipart/digest)
/// - RFC 3501 Section 6.4.5 (IMAP section numbering)
#[allow(clippy::too_many_lines)]
pub(crate) fn walk_mime_tree(
    body: &[u8],
    boundary: &str,
    section_prefix: &str,
    depth: u32,
    is_digest: bool,
    is_alternative: bool,
) -> (Option<String>, Option<String>, Vec<ParsedAttachment>) {
    if depth > MAX_MIME_DEPTH {
        return (None, None, Vec::new());
    }

    // RFC 2045 Section 5.2: default Content-Type is "text/plain; charset=us-ascii".
    // RFC 2046 Section 5.1.5: inside multipart/digest, the default is
    // "message/rfc822" instead. The choice is the same for every part.
    let fallback_content_type = if is_digest {
        "message/rfc822"
    } else {
        "text/plain; charset=us-ascii"
    };

    let mut body_text: Option<String> = None;
    let mut body_html: Option<String> = None;
    let mut attachments: Vec<ParsedAttachment> = Vec::new();

    let parts = wire::split_mime_parts(body, boundary);
    for (index, part) in parts.iter().enumerate() {
        // RFC 3501 Section 6.4.5: parts are numbered from 1, nested with dots.
        let ordinal = index + 1;
        let section = match section_prefix {
            "" => ordinal.to_string(),
            prefix => format!("{prefix}.{ordinal}"),
        };

        let (raw_headers, part_body) = wire::split_header_body(part);
        let headers = wire::parse_headers(raw_headers);

        let ct = get_header_value(&headers, "content-type")
            .unwrap_or_else(|| fallback_content_type.to_string());
        // RFC 2045 Section 6.1: default Content-Transfer-Encoding is "7bit".
        let cte = get_header_value(&headers, "content-transfer-encoding")
            .unwrap_or_else(|| "7bit".to_string());
        let cd = get_header_value(&headers, "content-disposition").unwrap_or_default();
        let content_id = get_header_value(&headers, "content-id");
        let filename = params::extract_filename(&cd, &ct);

        if params::is_multipart(&ct) {
            let (text, html, found) = match params::extract_boundary_for_body(&ct, part_body) {
                // Recurse into nested multipart (RFC 2046 Section 5.1).
                Some(inner_boundary) => {
                    let inner_mime = params::extract_mime_type(&ct);
                    walk_mime_tree(
                        part_body,
                        &inner_boundary,
                        &section,
                        depth + 1,
                        inner_mime == "multipart/digest",
                        // RFC 2046 Section 5.1.4: multipart/alternative lists
                        // parts in order of increasing faithfulness — prefer
                        // the last match.
                        inner_mime == "multipart/alternative",
                    )
                }
                // RFC 2046 Section 5.1.1 requires a `boundary` parameter for
                // multipart bodies. When a nested part is malformed and omits
                // it, preserve any recoverable text instead of discarding the
                // part entirely (Postel's law, RFC 1122 Section 1.2.2).
                None => extract_simple_body_with_section(
                    part_body,
                    "text/plain; charset=us-ascii",
                    &cte,
                    &cd,
                    content_id.as_deref(),
                    &section,
                ),
            };
            merge_mime_results(
                &mut body_text,
                &mut body_html,
                &mut attachments,
                text,
                html,
                found,
                is_alternative,
            );
            continue;
        }

        let mime_type = params::extract_mime_type(&ct);
        let explicit_attachment = params::is_disposition_type(&cd, "attachment");
        let explicit_inline = params::is_disposition_type(&cd, "inline");

        // RFC 2183 Sections 2.1 and 2.3: an explicit `inline` disposition
        // still denotes body presentation even when the sender provides a
        // `filename`/`name` hint for saving; without it, a filename hint
        // makes the part an attachment.
        let shown_as_body = !explicit_attachment && (filename.is_none() || explicit_inline);
        // RFC 2046 Section 5.1.4: in multipart/alternative, later parts are
        // preferred, so an occupied slot is overwritten rather than skipped.
        let takes_text_slot =
            shown_as_body && mime_type == "text/plain" && (is_alternative || body_text.is_none());
        let takes_html_slot =
            shown_as_body && mime_type == "text/html" && (is_alternative || body_html.is_none());

        if takes_text_slot {
            // An empty decoded body is semantically absent — treat it as
            // None for round-trip consistency (RFC 2046 Section 5.1.1).
            let decoded = decode_body(part_body, &cte, &ct);
            if !decoded.is_empty() {
                body_text = Some(decoded);
            }
        } else if takes_html_slot {
            // Same empty-body treatment for HTML parts.
            let decoded = decode_body(part_body, &cte, &ct);
            if !decoded.is_empty() {
                body_html = Some(decoded);
            }
        } else if !mime_type.starts_with("multipart/") {
            // Attachment: explicit attachment, non-text part, or extra text part.
            // RFC 2183 Section 2: an explicit Content-Disposition takes
            // precedence. Content-ID (RFC 2392) only implies inline when
            // no explicit disposition is set.
            let is_inline = explicit_inline || (!explicit_attachment && content_id.is_some());

            // RFC 2392: Content-ID is `"<" addr-spec ">"`. Strip brackets
            // and trim whitespace that some mailers add inside the brackets.
            let bare_content_id = content_id
                .map(|raw| raw.trim_matches(|c: char| c == '<' || c == '>').trim().to_string());

            attachments.push(ParsedAttachment {
                filename,
                content_type: mime_type,
                content_id: bare_content_id,
                is_inline,
                size: Some(part_body.len() as u64),
                section: Some(section),
            });
        }
    }

    (body_text, body_html, attachments)
}

// ---------------------------------------------------------------------------
// Body decoding
// ---------------------------------------------------------------------------

/// Extracts body content from a non-multipart message.
///
/// Checks Content-Disposition and MIME type to determine whether the content
/// is body text, body HTML, or an attachment (RFC 2046; RFC 2183).
///
/// This is a thin wrapper around `extract_simple_body_with_section` that
/// fixes the section to `"1"`, so any attachment it reports carries section
/// number `"1"`.
///
/// - `body`: the raw (still transfer-encoded) message body.
/// - `content_type`: the Content-Type header value.
/// - `transfer_encoding`: the Content-Transfer-Encoding header value.
/// - `content_disposition`: the Content-Disposition header value.
/// - `content_id`: the Content-ID header value, if present.
///
/// Returns `(body_text, body_html, attachments)`.
///
/// # References
/// - RFC 2045 Section 5.2 (default Content-Type)
/// - RFC 2046 (media types)
/// - RFC 2183 (Content-Disposition)
/// - RFC 3501 Section 6.4.5 (IMAP section numbering)
pub(crate) fn extract_simple_body(
    body: &[u8],
    content_type: &str,
    transfer_encoding: &str,
    content_disposition: &str,
    content_id: Option<&str>,
) -> (Option<String>, Option<String>, Vec<ParsedAttachment>) {
    // All arguments are forwarded unchanged; only the section is supplied here.
    extract_simple_body_with_section(
        body,
        content_type,
        transfer_encoding,
        content_disposition,
        content_id,
        "1",
    )
}

/// Classifies a single (non-multipart) body as text, HTML, or an attachment,
/// tagging any attachment with the given IMAP section number.
///
/// Returns `(body_text, body_html, attachments)`; at most one of the three
/// is populated. A body that is empty, or that decodes to an empty string,
/// yields nothing unless it is classified as an attachment.
///
/// # References
/// - RFC 2045 Section 5.2 (default Content-Type)
/// - RFC 2046 (media types)
/// - RFC 2183 (Content-Disposition)
/// - RFC 3501 Section 6.4.5 (part section numbering)
fn extract_simple_body_with_section(
    body: &[u8],
    content_type: &str,
    transfer_encoding: &str,
    content_disposition: &str,
    content_id: Option<&str>,
    section: &str,
) -> (Option<String>, Option<String>, Vec<ParsedAttachment>) {
    let mime_type = params::extract_mime_type(content_type);
    let filename = params::extract_filename(content_disposition, content_type);
    let explicit_attachment = params::is_disposition_type(content_disposition, "attachment");
    let explicit_inline = params::is_disposition_type(content_disposition, "inline");

    let is_html = mime_type == "text/html";
    let is_body_type = is_html || mime_type == "text/plain";
    // Consumer-facing heuristic: a text/* part that carries a filename hint
    // (`filename=` or `name=`) is better preserved as an attachment, because
    // treating it as body text would discard that metadata. RFC 2183
    // Sections 2.1 and 2.3 make clear that an explicit `inline` disposition
    // still denotes body presentation even when a filename hint is present.
    let named_without_inline = filename.is_some() && !explicit_inline;

    // Content-Disposition: attachment overrides MIME type (RFC 2183
    // Section 2), and anything that is not text/plain or text/html is always
    // surfaced as an attachment regardless of disposition.
    if explicit_attachment || named_without_inline || !is_body_type {
        // RFC 2183 Section 2: an explicit Content-Disposition takes
        // precedence. Content-ID (RFC 2392) only implies inline when no
        // explicit disposition is set.
        let is_inline = explicit_inline || (!explicit_attachment && content_id.is_some());

        // RFC 2392: Content-ID is `"<" addr-spec ">"`. Strip brackets and
        // trim whitespace that some mailers add inside the brackets.
        let bare_content_id = content_id
            .map(|raw| raw.trim_matches(|c: char| c == '<' || c == '>').trim().to_string());

        // RFC 2046 Section 5.1.1 defines a body part as headers plus
        // optional `*OCTET` body data, so the attachment record is produced
        // even when the body is zero bytes long.
        let recovered = ParsedAttachment {
            filename,
            content_type: mime_type,
            content_id: bare_content_id,
            is_inline,
            size: Some(body.len() as u64),
            // For top-level single-part messages this is "1"; for recovered
            // malformed nested multiparts, it is the enclosing part number
            // (RFC 3501 Section 6.4.5).
            section: Some(section.to_string()),
        };
        return (None, None, vec![recovered]);
    }

    // An empty body needs no decoding; an empty *decoded* body is treated
    // the same way. Reporting None rather than Some("") keeps round trips
    // consistent: the builder appends a trailing CRLF after each text part
    // (RFC 2046 Section 5.1.1) which decode_body strips again, leaving an
    // empty string for originally-empty bodies.
    let text = if body.is_empty() {
        String::new()
    } else {
        decode_body(body, transfer_encoding, content_type)
    };
    if text.is_empty() {
        return (None, None, Vec::new());
    }

    if is_html {
        (None, Some(text), Vec::new())
    } else {
        // text/plain (RFC 2045 Section 5.2)
        (Some(text), None, Vec::new())
    }
}

/// Turns the raw bytes of a body part into text: first undoes the
/// Content-Transfer-Encoding, then converts from the part's charset, and
/// finally drops one trailing line ending.
///
/// The charset is read from `content_type`; when no `charset` parameter is
/// present it defaults to `us-ascii` per RFC 2045 Section 5.2.
///
/// # References
/// - RFC 2045 Section 5.2 (default charset)
/// - RFC 2045 Section 6 (Content-Transfer-Encoding)
/// - RFC 2231 Sections 3–4 (extended and continued parameter values)
pub(crate) fn decode_body(data: &[u8], transfer_encoding: &str, content_type: &str) -> String {
    let raw = decode_transfer_encoding(data, transfer_encoding);

    // Try the RFC 2231 forms first (charset*=charset'lang'value and
    // charset*0=...; charset*1=...) before the plain form, mirroring the
    // extraction order used by extract_filename(). Each lookup only runs
    // when the previous one found nothing.
    let charset = if let Some(found) = params::extract_rfc2231_param(content_type, "charset") {
        found
    } else if let Some(found) = params::extract_rfc2231_continuation(content_type, "charset") {
        found
    } else if let Some(found) = params::extract_param(content_type, "charset") {
        found
    } else {
        // RFC 2045 Section 5.2: default charset is US-ASCII.
        "us-ascii".to_string()
    };

    let mut text = encoded_words::decode_charset(&charset, &raw);

    // Strip a single trailing CRLF, LF, or bare CR (checked in that order).
    // In single-part messages the body typically ends with CRLF as a
    // message-format artifact (RFC 5322 Section 3.5), not semantic content.
    // In multipart parts, split_mime_parts strips the CRLF that prefixes the
    // boundary delimiter (RFC 2046 Section 5.1.1), but the builder's
    // write_text_part appends an additional CRLF after the body content,
    // which is removed here. Externally-produced parts whose body itself
    // ends with a line ending lose exactly one as well — consistent with
    // the single-part behavior. Bare CR is tolerated as a non-conformant
    // line ending per Postel's law (RFC 1122 Section 1.2.2).
    let kept_len = text
        .strip_suffix("\r\n")
        .or_else(|| text.strip_suffix('\n'))
        .or_else(|| text.strip_suffix('\r'))
        .map(str::len);
    if let Some(len) = kept_len {
        text.truncate(len);
    }
    text
}

/// Undoes the Content-Transfer-Encoding named by `encoding`
/// (RFC 2045 Section 6) and returns the decoded bytes.
///
/// RFC 2045 Section 6.1 defines CTE as a single unquoted token, but
/// non-conformant mailers may wrap it in double-quotes (e.g., `"base64"`),
/// append trailing parameters (e.g., `base64; name=foo`), or append
/// comments (e.g., `quoted-printable (standard)`). Only the first token is
/// considered, with one pair of surrounding quotes removed, and it is
/// compared case-insensitively (Postel's law).
///
/// `base64` and `quoted-printable` are decoded; every other value (7bit,
/// 8bit, binary, or anything unrecognised) returns `data` unchanged. Base64
/// input that still fails to decode after filtering is also returned
/// unchanged.
///
/// # References
/// - RFC 2045 Section 6 (Content-Transfer-Encoding)
/// - RFC 2045 Section 6.1 (CTE syntax)
pub(crate) fn decode_transfer_encoding(data: &[u8], encoding: &str) -> Vec<u8> {
    let trimmed = encoding.trim();
    // RFC 2045 Section 6.1: CTE is a single token. Cut at the first
    // semicolon, whitespace, or opening parenthesis to tolerate trailing
    // garbage from non-conformant mailers.
    let token_len = trimmed
        .find(|c: char| matches!(c, ';' | '(') || c.is_ascii_whitespace())
        .unwrap_or(trimmed.len());
    let bare = &trimmed[..token_len];
    // Quotes are removed only when both the opening and closing quote exist.
    let token = bare
        .strip_prefix('"')
        .and_then(|inner| inner.strip_suffix('"'))
        .unwrap_or(bare);

    if token.eq_ignore_ascii_case("base64") {
        // RFC 2045 Section 6.8: "Any characters outside of the base64
        // alphabet are to be ignored in base64-encoded data."
        // Keep only A-Z, a-z, 0-9, '+', '/', and the '=' padding character.
        let alphabet_only: Vec<u8> = data
            .iter()
            .copied()
            .filter(|byte| {
                matches!(*byte, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/' | b'=')
            })
            .collect();
        match LENIENT_BASE64.decode(&alphabet_only) {
            Ok(bytes) => bytes,
            // Undecodable input is handed back untouched rather than dropped.
            Err(_) => data.to_vec(),
        }
    } else if token.eq_ignore_ascii_case("quoted-printable") {
        decode_quoted_printable(data)
    } else {
        // 7bit, 8bit, binary — pass through (RFC 2045 Section 6.2)
        data.to_vec()
    }
}

/// Decodes quoted-printable encoding (RFC 2045 Section 6.7).
///
/// RFC 2045 Section 6.7 Rule #3: literal SP (0x20) and HTAB (0x09) MUST NOT
/// appear at the end of an encoded line. The decoder strips any trailing
/// literal whitespace before a hard line break (CRLF/LF) or at end-of-data
/// because it was likely added by transport gateways. Hex-encoded whitespace
/// (`=20`, `=09`) is preserved because the sender explicitly encoded it.
///
/// To distinguish literal from hex-encoded whitespace, literal SP/HTAB bytes
/// are buffered and only flushed to the result when followed by a non-WSP,
/// non-line-break byte or a hex-encoded sequence. A hard line break or
/// end-of-data discards the pending literal whitespace.
///
/// Soft line breaks follow the RFC 2045 Section 6.7 grammar
/// (`qp-segment ... "=" transport-padding CRLF`): an `=` followed by zero or
/// more SP/HTAB bytes and then a line ending is removed together with that
/// padding, because receivers must tolerate padding added in transit. LF and
/// bare CR are accepted as line endings per Postel's law. An `=` (with or
/// without padding) at end-of-data is likewise dropped.
///
/// An `=` that starts neither a soft line break nor a valid hex pair is
/// copied through literally.
///
/// # References
/// - RFC 2045 Section 6.7 (quoted-printable)
pub(crate) fn decode_quoted_printable(data: &[u8]) -> Vec<u8> {
    let mut result = Vec::with_capacity(data.len());
    // Buffer for literal SP/HTAB bytes that may be trailing whitespace.
    // Flushed to `result` when followed by non-WSP content; discarded
    // before a hard line break or at end-of-data per RFC 2045
    // Section 6.7 Rule #3.
    let mut pending_wsp: Vec<u8> = Vec::new();
    let mut i = 0;
    while i < data.len() {
        if data[i] == b'=' {
            let rest = &data[i + 1..];

            // Transport padding: SP/HTAB between the `=` and the line ending.
            // Composers must not emit it, but transports may add it.
            let padding = rest
                .iter()
                .take_while(|&&b| b == b' ' || b == b'\t')
                .count();
            let line_end_len = match &rest[padding..] {
                // `=` (plus optional padding) at end-of-data is a soft line
                // break with nothing after it — skip it. Pending WSP before
                // end-of-data is discarded per Rule #3.
                [] => break,
                [b'\r', b'\n', ..] => Some(2),
                // LF-only and bare-CR line endings (Postel's law).
                [b'\r', ..] | [b'\n', ..] => Some(1),
                _ => None,
            };
            if let Some(line_end_len) = line_end_len {
                // Soft line break — flush pending WSP because it is mid-line
                // content that the encoder preserved before the
                // line-wrapping `=` (RFC 2045 Section 6.7 Rule #5).
                result.append(&mut pending_wsp);
                i += 1 + padding + line_end_len;
                continue;
            }

            // Hex-encoded byte — flush any pending literal WSP first because
            // the hex sequence confirms the whitespace was followed by
            // visible content on this line.
            if let [hi, lo, ..] = rest {
                if let Some(val) = params::decode_hex_pair(*hi, *lo) {
                    result.append(&mut pending_wsp);
                    result.push(val);
                    i += 3;
                    continue;
                }
            }

            // Malformed '=' — flush pending WSP (non-break context) and keep
            // the literal '=' byte.
            result.append(&mut pending_wsp);
            result.push(b'=');
            i += 1;
            continue;
        }

        match data[i] {
            // RFC 2045 Section 6.7 Rule #3: hard line break — discard any
            // pending literal SP/HTAB that the encoder MUST NOT have
            // produced. These bytes were likely injected by transport
            // gateways.
            b'\r' if data.get(i + 1) == Some(&b'\n') => {
                pending_wsp.clear();
                result.extend_from_slice(b"\r\n");
                i += 2;
            }
            b'\n' => {
                pending_wsp.clear();
                result.push(b'\n');
                i += 1;
            }
            // Buffer literal SP/HTAB — will be flushed or discarded later.
            wsp @ (b' ' | b'\t') => {
                pending_wsp.push(wsp);
                i += 1;
            }
            // Any other byte (including a bare CR): flush pending WSP and
            // copy the byte through.
            other => {
                result.append(&mut pending_wsp);
                result.push(other);
                i += 1;
            }
        }
    }

    // RFC 2045 Section 6.7 Rule #3: trailing literal whitespace at end of
    // data (no final line break) is discarded — `pending_wsp` is simply not
    // flushed.
    result
}