daaki-message 0.2.0

//! RFC 2231 MIME parameter extraction.
//!
//! Extracts and decodes MIME parameters from Content-Type, Content-Disposition,
//! and other structured header fields. Handles RFC 2231 parameter value
//! encoding, continuations, and charset conversion.
//!
//! # References
//! - RFC 2045 Section 5.1 (Content-Type parameters)
//! - RFC 2046 Section 5.1.1 (boundary parameter)
//! - RFC 2183 Section 2 (Content-Disposition parameters)
//! - RFC 2231 (MIME parameter encoding and continuations)
//! - RFC 5322 Section 3.2.2 (comments / CFWS)
//! - RFC 5322 Section 3.2.4 (quoted-string)

use super::{address, encoded_words};

use super::super::wire;

/// Reports whether a Content-Type value names a `multipart/*` media type.
///
/// The value is normalized through `extract_mime_type` first, so the check is
/// made against the lowercased `type/subtype` pair rather than the raw header
/// text.
///
/// # References
/// - RFC 2046 Section 5.1 (multipart type)
pub(crate) fn is_multipart(content_type: &str) -> bool {
    let mime_type = extract_mime_type(content_type);
    mime_type.starts_with("multipart/")
}

/// Extracts the disposition-type token from a Content-Disposition header value
/// and compares it to `expected`, ignoring ASCII case on both sides.
///
/// RFC 2183 Section 2:
///   Content-Disposition = disposition-type *(";" disposition-parm)
///   disposition-type    = "inline" / "attachment" / extension-token
///
/// The disposition-type is a complete token terminated by `;`, whitespace,
/// or end-of-string. Using `starts_with()` would incorrectly match
/// extension-tokens like `attachmentfoo` as `attachment`.
///
/// `expected` may be given in any letter case (`"attachment"`,
/// `"Attachment"`, ...); the comparison is ASCII case-insensitive.
///
/// # References
/// - RFC 2183 Section 2 (Content-Disposition)
/// - RFC 5322 Section 3.2.2 (comments / CFWS)
pub(crate) fn is_disposition_type(header_value: &str, expected: &str) -> bool {
    // RFC 5322 Section 3.2.2: comments may appear as CFWS between lexical
    // tokens in a structured field body. Strip them before extracting the
    // disposition token so `(note) attachment` still classifies correctly.
    let stripped = address::strip_comments(header_value);
    let trimmed = stripped.trim();
    // The token runs up to the first `;` or whitespace (or end-of-string).
    let end = trimmed
        .find(|c: char| c == ';' || c.is_ascii_whitespace())
        .unwrap_or(trimmed.len());
    let token = &trimmed[..end];
    // RFC 2183 Section 2: disposition-type is a bare token, but some
    // non-conformant mailers wrap it in double-quotes. Strip surrounding
    // quotes per Postel's law (be liberal in what you accept). Both quotes
    // must be present; a lone quote is left in place.
    let token = token
        .strip_prefix('"')
        .and_then(|t| t.strip_suffix('"'))
        .unwrap_or(token);
    // Compare without regard to ASCII case on either side so callers are not
    // required to pre-lowercase `expected`.
    token.eq_ignore_ascii_case(expected)
}

/// Returns the lowercased `type/subtype` pair of a Content-Type value,
/// without any parameters.
///
/// Everything from the first `;` onward is discarded. Parenthesized comments
/// (RFC 5322 Section 3.2.2) in the remaining text are removed, and whitespace
/// around the `/` separator is trimmed. A value without a `/` yields
/// `text/plain`.
///
/// # References
/// - RFC 2045 Section 5.1 (Content-Type)
/// - RFC 2045 Section 5.2 (Content-Type defaults)
/// - RFC 5322 Section 3.2.2 (comments)
pub(crate) fn extract_mime_type(content_type: &str) -> String {
    let trimmed = content_type.trim();
    // Keep only the text before the first parameter separator.
    let type_section = match trimmed.split_once(';') {
        Some((head, _params)) => head,
        None => trimmed,
    }
    .trim();
    // Comments are legal CFWS inside the type/subtype production; drop them
    // before normalizing case.
    let normalized = address::strip_comments(type_section)
        .trim()
        .to_lowercase();

    match normalized.split_once('/') {
        // Removing a comment can leave stray whitespace next to the slash
        // (e.g. "text  /plain" from "text (comment) /plain"), so trim each
        // half before rejoining.
        Some((major, minor)) => format!("{}/{}", major.trim(), minor.trim()),
        // RFC 2045 Section 5.2: a value without `/` is syntactically invalid;
        // fall back to "text/plain" so the body is not misclassified as an
        // attachment.
        None => "text/plain".to_string(),
    }
}

/// Extracts the `boundary` parameter from a Content-Type header (RFC 2046 Section 5.1.1).
///
/// This is the permissive lookup: it always requests
/// `AlphaContinuationPolicy::Always`, so alphabetic-only fragments that follow
/// whitespace in an unquoted boundary token are joined back onto the token.
/// Returns `None` when `extract_param_with_policy` finds no usable value.
///
/// # References
/// - RFC 2046 Section 5.1.1 (boundary parameter)
fn extract_boundary(content_type: &str) -> Option<String> {
    extract_param_with_policy(content_type, "boundary", AlphaContinuationPolicy::Always)
}

/// Picks the boundary candidate that is actually consistent with `body`.
///
/// Two readings of the `boundary` parameter are considered, in order:
///
/// 1. the permissive reading from `extract_boundary`, which rejoins
///    alphabetic fragments after a fold in an unquoted token;
/// 2. the strict reading (`AlphaContinuationPolicy::Never`), which does not.
///
/// The permissive reading can over-join when a malformed sender appends stray
/// bare text after the parameter, so each candidate is accepted only if
/// `boundary_matches_body` confirms it against the body. If neither candidate
/// matches, `None` is returned so the caller can treat the body as a simple
/// (non-multipart) body instead of discarding recoverable text.
///
/// # References
/// - RFC 2046 Section 5.1.1 (boundary delimiter matching)
/// - RFC 5322 Section 2.2.3 (header folding)
/// - RFC 1122 Section 1.2.2 (Postel's law)
pub(crate) fn extract_boundary_for_body(content_type: &str, body: &[u8]) -> Option<String> {
    let permissive = extract_boundary(content_type)?;
    if boundary_matches_body(body, &permissive) {
        return Some(permissive);
    }

    // The strict reading is only worth checking when it differs from the
    // permissive one, which has already been rejected above.
    extract_param_with_policy(content_type, "boundary", AlphaContinuationPolicy::Never)
        .filter(|strict| *strict != permissive && boundary_matches_body(body, strict))
}

/// Decides whether `boundary` is a plausible delimiter for `body`.
///
/// The boundary is accepted when `wire::split_mime_parts` yields at least one
/// part, or, failing that, when the body contains a delimiter line for the
/// boundary (see `body_contains_boundary_delimiter`).
///
/// # References
/// - RFC 2046 Section 5.1.1
fn boundary_matches_body(body: &[u8], boundary: &str) -> bool {
    if !wire::split_mime_parts(body, boundary).is_empty() {
        return true;
    }
    // No parts were split out; a bare delimiter line still counts as a match.
    body_contains_boundary_delimiter(body, boundary)
}

/// Scans `body` for a line that is a well-formed delimiter for `boundary`.
///
/// A candidate `--boundary` occurrence is accepted when it
///
/// * starts at offset 0 or directly after a `\n` or `\r` byte, and
/// * is followed by end-of-body, a line-ending byte, a space or a tab, or
/// * is followed by `--` (closing delimiter) which in turn is followed by
///   end-of-body, a line-ending byte, a space or a tab.
///
/// This succeeds even when the multipart body encloses zero parts.
///
/// # References
/// - RFC 2046 Section 5.1.1
fn body_contains_boundary_delimiter(body: &[u8], boundary: &str) -> bool {
    /// Bytes that may legally follow a delimiter: line ending or LWSP.
    fn ends_delimiter(byte: u8) -> bool {
        matches!(byte, b'\r' | b'\n' | b' ' | b'\t')
    }

    let delimiter = format!("--{boundary}");
    let needle = delimiter.as_bytes();
    let mut cursor = 0;

    while let Some(offset) = wire::find_subsequence(&body[cursor..], needle) {
        let start = cursor + offset;
        let after = start + needle.len();

        // RFC 2046 Section 5.1.1: a delimiter must sit at the start of the
        // body or immediately after a line ending.
        let at_line_start = start == 0 || matches!(body[start - 1], b'\n' | b'\r');
        if !at_line_start {
            cursor = after;
            continue;
        }

        // Inspect what follows the candidate; every rejected shape resumes the
        // search just past the bytes that ruled it out.
        cursor = match body.get(after).copied() {
            // Delimiter runs to the very end of the body.
            None => return true,
            // Possible closing delimiter `--boundary--`.
            Some(b'-') => match (
                body.get(after + 1).copied(),
                body.get(after + 2).copied(),
            ) {
                (Some(b'-'), None) => return true,
                (Some(b'-'), Some(tail)) if ends_delimiter(tail) => return true,
                // `--boundary--` glued to further text: not a delimiter.
                (Some(b'-'), Some(_)) => after + 2,
                // A single `-` means this is a longer, different boundary.
                _ => after + 1,
            },
            Some(byte) if ends_delimiter(byte) => return true,
            // Boundary text continues: prefix of some other token.
            Some(_) => after,
        };
    }

    false
}

/// Reports whether a parameter name may legitimately begin at byte `pos`.
///
/// That is the case when `pos` is 0, or when the text before `pos` ends with
/// a `;` once comments have been stripped and trailing spaces and tabs
/// trimmed. This rejects substring hits such as the `filename=` inside
/// `xfilename=`.
///
/// # References
/// - RFC 2045 Section 5.1 (Content-Type parameters)
/// - RFC 5322 Section 3.2.2 (comments / CFWS)
/// - RFC 5322 Section 3.2.4 (quoted-string)
fn is_param_boundary(lower: &str, pos: usize) -> bool {
    pos == 0 || {
        // RFC 2045 Section 5.1 inherits RFC 5322 CFWS, so a comment may sit
        // between the `;` and the parameter name with no literal space, as in
        // `text/plain;(note)charset=utf-8`. Remove comments before looking
        // for the separator.
        let preceding = address::strip_comments(&lower[..pos]);
        preceding
            .trim_end_matches(|c: char| c == ' ' || c == '\t')
            .ends_with(';')
    }
}

/// Extracts a quoted or unquoted parameter value from `rest` (the text
/// immediately after `param_name=`).
///
/// * Quoted values (`rest` starts with `"`): the text up to the closing
///   unescaped quote is taken and quoted-pairs are unescaped (RFC 5322
///   Section 3.2.4). If the closing quote is missing, the value ends at the
///   next `;` instead.
/// * Unquoted values: the token ends at the first `;` or whitespace. Further
///   whitespace-separated fragments are appended while they look like pieces
///   of a token that was folded mid-value (RFC 5322 Section 2.2.3).
///
/// The `alpha_policy` controls how aggressively the parser rejoins
/// alphabetic-only fragments in the unquoted case.
///
/// Returns `None` when the value is empty.
///
/// # References
/// - RFC 5322 Section 3.2.4 (quoted-string)
/// - RFC 2045 Section 5.1 (MIME token)
/// - RFC 5322 Section 2.2.3 (header folding)
fn extract_param_value_internal(
    rest: &str,
    alpha_policy: AlphaContinuationPolicy,
) -> Option<String> {
    // Quoted-string path: handled completely here so the unquoted logic
    // below never has to re-check how the value started.
    if let Some(stripped) = rest.strip_prefix('"') {
        // Find closing quote, skipping escaped quotes (RFC 5322 Section 3.2.4)
        let end = find_closing_quote(stripped);
        let value = if end < stripped.len() {
            // Found a proper closing quote — use everything up to it.
            &stripped[..end]
        } else {
            // No closing quote found (malformed).  Per Postel's law
            // (RFC 1122 Section 1.2.2), fall back to `;` as the value
            // terminator — the same delimiter used for unquoted values —
            // so that subsequent parameters separated by `;` are not
            // swallowed into this value.
            let fallback_end = stripped.find(';').unwrap_or(stripped.len());
            stripped[..fallback_end].trim_end()
        };
        if value.is_empty() {
            return None;
        }
        // Unescape quoted-pair sequences (RFC 5322 Section 3.2.4)
        return Some(address::unescape_quoted_string(value));
    }

    // RFC 2045 Section 5.1: a token is `1*<any CHAR except SPACE, CTLs,
    // or tspecials>` — spaces are not allowed within tokens.  However,
    // header unfolding (RFC 5322 Section 2.2.3) replaces a CRLF+WSP fold
    // with a single space, so a non-conformant encoder that folded
    // mid-token produces a space inside the value.  Per Postel's law
    // (RFC 1122 Section 1.2.2) we reassemble the token by stripping the
    // fold-introduced whitespace.
    let end = rest
        .find(|c: char| c == ';' || c.is_whitespace())
        .unwrap_or(rest.len());
    let initial = &rest[..end];
    if initial.is_empty() {
        return None;
    }
    let mut assembled = initial.to_string();
    let mut tail = &rest[end..];

    // Every iteration either returns or consumes a non-empty fragment of
    // `tail`, so the loop terminates.
    loop {
        // Look past whitespace for a possible folded continuation.
        let after_ws = tail.trim_start();
        if after_ws.is_empty() || after_ws.starts_with(';') {
            return Some(assembled);
        }

        // Check whether the text after whitespace is a new parameter
        // (i.e., contains `=` before the next `;`).  If so, the whitespace
        // is a genuine separator and we must not concatenate.
        let segment = match after_ws.find(';') {
            Some(pos) => &after_ws[..pos],
            None => after_ws,
        };
        if segment.contains('=') {
            return Some(assembled);
        }

        // The segment after whitespace has no `=`, so it might be a folded
        // continuation of the current token (RFC 5322 Section 2.2.3 fold
        // artifact) or an unrelated bare word / comment. Only concatenate
        // valid MIME-token fragments (RFC 2045 Section 5.1). A purely
        // alphabetic word like `unexpected` is almost certainly stray text
        // unless the caller opted into an alphabetic-tail policy for this
        // specific parameter.
        let continuation_end = after_ws
            .find(|c: char| c == ';' || c.is_whitespace())
            .unwrap_or(after_ws.len());
        let continuation = &after_ws[..continuation_end];
        let continuation_is_token = continuation.chars().all(is_unquoted_mime_token_char);
        let is_fold_fragment = continuation_is_token
            && (continuation.chars().any(|c| !c.is_ascii_alphabetic())
                || alpha_policy.allows_alpha_continuation(&assembled, continuation));
        if !is_fold_fragment {
            return Some(assembled);
        }

        assembled.push_str(continuation);
        tail = &after_ws[continuation_end..];
    }
}

/// Recovery policy for alphabetic-only folded MIME-token continuations.
///
/// RFC 2045 Section 5.1 forbids spaces inside unquoted parameter tokens, so a
/// space introduced by unfolding (RFC 5322 Section 2.2.3) may need to be
/// stripped back out. Some parameters can safely accept alphabetic-only tail
/// fragments more broadly than others.
///
/// The policy is consulted only for fragments made up entirely of ASCII
/// letters; a fragment that contains any other token character is joined
/// regardless of the policy (see `extract_param_value_internal`).
///
/// # References
/// - RFC 2045 Section 5.1 (MIME token)
/// - RFC 5322 Section 2.2.3 (header folding)
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum AlphaContinuationPolicy {
    /// Accept only continuations that already contain a non-alphabetic token
    /// character such as `-`, `.`, or a digit. Alphabetic-only fragments are
    /// always rejected.
    Never,
    /// Accept alphabetic-only continuations as well, unconditionally.
    Always,
    /// Accept alphabetic-only continuations only when the reassembled token is
    /// a charset label that `encoding_rs::Encoding::for_label` recognizes.
    CharsetLabel,
}

impl AlphaContinuationPolicy {
    /// Decides whether the alphabetic-only fragment `continuation` should be
    /// appended to the already-assembled token `assembled`.
    ///
    /// `Always` and `Never` answer unconditionally. `CharsetLabel` accepts the
    /// fragment only if `assembled` followed by `continuation` is a label
    /// known to `encoding_rs`, which keeps stray words such as `unexpected`
    /// from being glued onto an otherwise valid charset name.
    fn allows_alpha_continuation(self, assembled: &str, continuation: &str) -> bool {
        match self {
            Self::Always => true,
            Self::Never => false,
            Self::CharsetLabel => {
                // Test the label exactly as it would read after joining.
                let joined = format!("{assembled}{continuation}");
                encoding_rs::Encoding::for_label(joined.as_bytes()).is_some()
            }
        }
    }
}

/// Extracts a parameter value from the text after the `=` sign.
///
/// Convenience wrapper around `extract_param_value_internal` that uses
/// `AlphaContinuationPolicy::Never`, i.e. alphabetic-only fragments following
/// whitespace are never joined onto an unquoted token.
///
/// # References
/// - RFC 2045 Section 5.1 (MIME parameter syntax)
fn extract_param_value(rest: &str) -> Option<String> {
    extract_param_value_internal(rest, AlphaContinuationPolicy::Never)
}

/// Advances past a run of spaces and horizontal tabs beginning at `pos`.
///
/// Returns the index of the first byte at or after `pos` that is neither
/// `b' '` nor `b'\t'`; this is `bytes.len()` when only such bytes remain.
/// If `pos` is already past the end of `bytes`, `pos` is returned unchanged.
/// Used to tolerate spaces around `=` in MIME parameters per Postel's law
/// (RFC 1122 Section 1.2.2).
///
/// # References
/// - RFC 5322 Section 3.2.3 (FWS)
/// - RFC 1122 Section 1.2.2 (Postel's law)
fn skip_lwsp(bytes: &[u8], pos: usize) -> usize {
    let blank_run = bytes
        .iter()
        .skip(pos)
        .take_while(|&&byte| byte == b' ' || byte == b'\t')
        .count();
    pos + blank_run
}

/// Advances past any mix of spaces, tabs, and parenthesized comments that
/// begins at `pos`, returning the byte offset of the first thing that is
/// neither.
///
/// MIME structured header fields inherit RFC 822 / RFC 5322 comment syntax, so
/// comments may legally appear around parameter separators and before values.
/// The scan works on the original string, so the returned offset can be used
/// to index `input` directly.
///
/// Special cases:
/// * If the offset reached is not a valid slice start for `input`,
///   `input.len()` is returned.
/// * If a comment is opened but never closed, the offset of its `(` is
///   returned so the caller can recover from the raw text.
///
/// # References
/// - RFC 2045 Section 5.1 (comments in structured header fields)
/// - RFC 5322 Section 3.2.2 (comments)
/// - RFC 5322 Section 3.2.3 (FWS)
fn skip_cfws(input: &str, pos: usize) -> usize {
    let bytes = input.as_bytes();
    let mut cursor = pos;

    loop {
        cursor = skip_lwsp(bytes, cursor);

        match input.get(cursor..) {
            None => return input.len(),
            Some(tail) if !tail.starts_with('(') => return cursor,
            Some(_) => {}
        }

        match find_comment_end(input, cursor) {
            Some(after_comment) => cursor = after_comment,
            // Malformed comment: stop here rather than discard data.
            None => return cursor,
        }
    }
}

/// Locates the end of the parenthesized comment that starts at byte `pos`.
///
/// Returns the offset just past the matching `)`, honouring nested comments
/// and backslash escapes (a backslash makes the following character
/// literal). Other outcomes:
///
/// * `Some(pos)` when the text at `pos` does not start with `(`;
/// * `None` when `pos` is not a valid slice start for `input`, or when the
///   comment is never closed.
///
/// # References
/// - RFC 5322 Section 3.2.2 (comments)
fn find_comment_end(input: &str, pos: usize) -> Option<usize> {
    let tail = input.get(pos..)?;
    if !tail.starts_with('(') {
        return Some(pos);
    }

    let mut nesting = 0u32;
    let mut chars = tail.char_indices();

    while let Some((offset, ch)) = chars.next() {
        match ch {
            '\\' => {
                // Quoted-pair: consume the escaped character unexamined.
                chars.next();
            }
            '(' => nesting = nesting.saturating_add(1),
            ')' => {
                nesting = nesting.saturating_sub(1);
                if nesting == 0 {
                    // `)` is one byte wide, so the comment ends right after it.
                    return Some(pos + offset + 1);
                }
            }
            _ => {}
        }
    }

    None
}

/// Tests whether `c` may appear in an unquoted MIME parameter token.
///
/// RFC 2045 Section 5.1 defines `token` as ASCII excluding SPACE, CTLs, and
/// the tspecials. That is exactly the printable, non-space ASCII range
/// (`!` through `~`) minus the tspecials listed below.
///
/// # References
/// - RFC 2045 Section 5.1 (token definition)
fn is_unquoted_mime_token_char(c: char) -> bool {
    // The tspecials set from RFC 2045 Section 5.1.
    const TSPECIALS: &str = "()<>@,;:\\\"/[]?=";

    c.is_ascii_graphic() && !TSPECIALS.contains(c)
}

/// Looks up `param_name` in `header_value`, choosing the fold-recovery policy
/// that suits the parameter.
///
/// The name is compared ASCII case-insensitively:
/// * `boundary` uses `AlphaContinuationPolicy::Always`;
/// * `charset` uses `AlphaContinuationPolicy::CharsetLabel`;
/// * every other parameter uses `AlphaContinuationPolicy::Never`.
///
/// # References
/// - RFC 2045 Section 5.1 (Content-Type parameters)
pub(crate) fn extract_param(header_value: &str, param_name: &str) -> Option<String> {
    let alpha_policy = match param_name.to_ascii_lowercase().as_str() {
        "boundary" => AlphaContinuationPolicy::Always,
        "charset" => AlphaContinuationPolicy::CharsetLabel,
        _ => AlphaContinuationPolicy::Never,
    };
    extract_param_with_policy(header_value, param_name, alpha_policy)
}

/// Extracts a named parameter from a header value, with optional tolerance for
/// alphabetic-only folded continuations.
///
/// The header is searched for `param_name` (ASCII case-insensitive). A hit is
/// accepted only when it sits at a parameter boundary, is not inside a
/// quoted-string or a comment, and is followed (after optional CFWS) by `=`.
/// The value after the `=` is then parsed by `extract_param_value_internal`
/// using `alpha_policy`. Rejected hits are skipped and the search continues.
///
/// RFC 2046 Section 5.1.1 boundary values are MIME tokens unless quoted, so an
/// unfolded space inside an unquoted boundary token can only come from header
/// folding (RFC 5322 Section 2.2.3), not from the boundary syntax itself.
///
/// Returns `None` when no acceptable occurrence exists, when the first
/// acceptable occurrence has an empty value, or when `param_name` is empty.
///
/// # References
/// - RFC 2045 Section 5.1 (Content-Type parameters)
/// - RFC 2046 Section 5.1.1 (boundary)
/// - RFC 5322 Section 2.2.3 (header folding)
fn extract_param_with_policy(
    header_value: &str,
    param_name: &str,
    alpha_policy: AlphaContinuationPolicy,
) -> Option<String> {
    // An empty needle matches at the current cursor without consuming any
    // input, so the search loop below would never advance. There is no
    // parameter with an empty name, so report "not found" instead.
    if param_name.is_empty() {
        return None;
    }

    // ASCII-only lowercasing preserves byte length for non-ASCII characters,
    // ensuring byte offsets from the lowered string match the original.
    // Full Unicode to_lowercase() can change byte length (e.g., İ: 2→3 bytes),
    // which would misalign indexing into the original string.
    let lower = header_value.to_ascii_lowercase();
    // RFC 2045 Section 5.1: parameter names are case-insensitive.
    let param_lower = param_name.to_ascii_lowercase();
    let mut search_from = 0;

    loop {
        let rel_idx = lower[search_from..].find(&param_lower)?;
        let abs_idx = search_from + rel_idx;

        // Ensure we're matching a parameter boundary, not a substring
        if is_param_boundary(&lower, abs_idx) {
            // Skip matches that fall inside a quoted-string value of another
            // parameter (RFC 5322 Section 3.2.4).
            if is_inside_quotes(&lower, abs_idx) {
                search_from = abs_idx + param_lower.len();
                continue;
            }
            // Skip matches that fall inside a parenthesized comment
            // (RFC 5322 Section 3.2.2, RFC 2045 Section 5.1).
            if is_inside_comment(&lower, abs_idx) {
                search_from = abs_idx + param_lower.len();
                continue;
            }

            // Skip optional whitespace between param name and '='
            // (Postel's law: tolerate `charset = utf-8` from non-conformant mailers,
            // RFC 1122 Section 1.2.2)
            let after_name = abs_idx + param_lower.len();
            let eq_pos = skip_cfws(header_value, after_name);

            // Anything other than `=` here means the hit was only a prefix of
            // a longer name (e.g. `filename*` when looking for `filename`).
            if eq_pos < lower.len() && lower.as_bytes()[eq_pos] == b'=' {
                // RFC 2045 Section 5.1: comments are allowed in structured
                // MIME header fields, so skip full CFWS before the value.
                let val_start = skip_cfws(header_value, eq_pos + 1);
                let rest = &header_value[val_start..];
                return extract_param_value_internal(rest, alpha_policy);
            }
        }

        // Resume just past this occurrence; `param_lower` is non-empty, so
        // the cursor always moves forward.
        search_from = abs_idx + param_lower.len();
    }
}

/// Returns the byte offset of the first unescaped `"` in `s`, where `s` is the
/// body of a quoted-string (the text after the opening quote).
///
/// A backslash escapes the byte that follows it (`\"`, `\\`), so neither is
/// considered as a terminator. When no unescaped quote exists, `s.len()` is
/// returned.
///
/// # References
/// - RFC 5322 Section 3.2.4 (quoted-string)
pub(crate) fn find_closing_quote(s: &str) -> usize {
    let mut bytes = s.bytes().enumerate();
    while let Some((idx, byte)) = bytes.next() {
        match byte {
            b'\\' => {
                // Quoted-pair: the escaped byte can never close the string.
                bytes.next();
            }
            b'"' => return idx,
            _ => {}
        }
    }
    s.len()
}

/// Returns `true` if the byte position `pos` falls inside a parenthesized
/// comment (RFC 5322 Section 3.2.2).
///
/// Scans the string from the beginning up to `pos`, tracking parenthesis
/// depth:
///
/// * a backslash escapes the following byte (quoted-pair), so escaped
///   parentheses do not affect the depth;
/// * outside any comment, a `"` opens a quoted-string whose content is
///   skipped entirely, because parentheses inside a quoted-string are
///   literal text;
/// * inside a comment, `"` is ordinary comment text (RFC 5322 `ctext`
///   includes `"`), so it does not open a quoted-string. Treating it as one
///   would swallow the comment's closing `)` and misreport everything after
///   `(5" disk)` as being inside a comment.
///
/// Unbalanced `)` characters are ignored (the depth never goes below zero).
///
/// # References
/// - RFC 5322 Section 3.2.2 (comment = "(" *([FWS] ccontent) [FWS] ")")
/// - RFC 5322 Section 3.2.4 (quoted-string and quoted-pair)
pub(crate) fn is_inside_comment(s: &str, pos: usize) -> bool {
    let bytes = s.as_bytes();
    let limit = pos.min(bytes.len());
    let mut depth: u32 = 0;
    let mut i = 0;
    while i < limit {
        match bytes[i] {
            b'\\' => {
                // Skip escaped character (quoted-pair per RFC 5322 Section 3.2.4)
                i += 2;
                continue;
            }
            // Only a quote outside every comment starts a quoted-string.
            b'"' if depth == 0 => {
                // Skip over quoted-string content — parentheses inside
                // quoted-strings are literal, not comment delimiters. The
                // skip is deliberately not bounded by `pos`: a position
                // inside the quoted-string is not inside a comment.
                i += 1;
                while i < bytes.len() {
                    match bytes[i] {
                        b'\\' => i += 2,
                        b'"' => {
                            i += 1;
                            break;
                        }
                        _ => i += 1,
                    }
                }
                continue;
            }
            b'(' => depth = depth.saturating_add(1),
            b')' => depth = depth.saturating_sub(1),
            _ => {}
        }
        i += 1;
    }
    depth > 0
}

/// Reports whether byte position `pos` lies within a properly closed
/// quoted-string.
///
/// The string is walked from the start. Each `"` found before `pos` opens a
/// quoted-string whose end is located while skipping backslash-escaped bytes.
/// If the quoted-string extends to or beyond `pos` (the closing quote is at
/// or after `pos`), the answer is `true`; otherwise the walk resumes after
/// the closing quote.
///
/// A quoted-string that is never closed yields `false`, so that parameters
/// following a malformed value stay discoverable (Postel's law, RFC 1122
/// Section 1.2.2). Backslashes outside a quoted-string are treated as
/// ordinary bytes.
///
/// # References
/// - RFC 5322 Section 3.2.4 (quoted-string and quoted-pair)
pub(crate) fn is_inside_quotes(s: &str, pos: usize) -> bool {
    let bytes = s.as_bytes();
    let mut cursor = 0;

    while cursor < pos && cursor < bytes.len() {
        if bytes[cursor] != b'"' {
            // Quoted-pair is only meaningful inside quoted-strings and
            // comments, so every other byte (backslash included) is literal.
            cursor += 1;
            continue;
        }

        // Opening quote found: locate the offset just past its closing quote.
        let mut scan = cursor + 1;
        let after_close = loop {
            match bytes.get(scan).copied() {
                // Ran off the end without a closing quote: unterminated.
                None => return false,
                // Quoted-pair: skip the backslash and the escaped byte.
                Some(b'\\') => scan += 2,
                Some(b'"') => break scan + 1,
                Some(_) => scan += 1,
            }
        };

        // The quoted-string covers `pos` if it ended beyond it.
        if after_close > pos {
            return true;
        }
        cursor = after_close;
    }

    false
}

/// Determines an attachment's filename from its Content-Disposition and
/// Content-Type header values.
///
/// Sources are consulted lazily in this order; the first that yields a value
/// wins:
/// 1. `filename*` in the disposition (RFC 2231 extended value)
/// 2. `filename*0`, `filename*1`, ... in the disposition (RFC 2231
///    continuations)
/// 3. plain `filename` in the disposition (RFC 2183)
/// 4. `name*` in the content type
/// 5. `name*0`, `name*1`, ... in the content type
/// 6. plain `name` in the content type
///
/// Plain (non-RFC 2231) values are additionally passed through
/// `encoded_words::decode_encoded_words`.
///
/// # References
/// - RFC 2183 Section 2 (Content-Disposition parameters)
/// - RFC 2231 Section 3 (parameter continuation)
/// - RFC 2231 Section 4 (parameter value encoding)
pub(crate) fn extract_filename(disposition: &str, content_type: &str) -> Option<String> {
    extract_rfc2231_param(disposition, "filename")
        .or_else(|| extract_rfc2231_continuation(disposition, "filename"))
        .or_else(|| {
            extract_param(disposition, "filename")
                .map(|raw| encoded_words::decode_encoded_words(&raw))
        })
        .or_else(|| extract_rfc2231_param(content_type, "name"))
        .or_else(|| extract_rfc2231_continuation(content_type, "name"))
        .or_else(|| {
            extract_param(content_type, "name")
                .map(|raw| encoded_words::decode_encoded_words(&raw))
        })
}

/// Finds `param*=charset'lang'value` in `header_value` and returns the
/// decoded value.
///
/// The search is ASCII case-insensitive. An occurrence of `param*` counts only
/// if it starts at a parameter boundary, lies outside quoted-strings and
/// comments, and is followed (after optional CFWS) by `=`, which excludes
/// continuation forms such as `param*0=`. The first such occurrence is used.
///
/// The value is split at its first two apostrophes into charset, language
/// (ignored), and percent-encoded data; the data is percent-decoded with
/// `strict_percent_decode` and converted with `encoded_words::decode_charset`.
/// Returns `None` when no occurrence qualifies, when the value lacks the two
/// apostrophes, or when percent-decoding fails.
///
/// # References
/// - RFC 2231 Section 4 (parameter value character set and language)
pub(crate) fn extract_rfc2231_param(header_value: &str, param_name: &str) -> Option<String> {
    // ASCII lowercasing keeps byte offsets aligned with `header_value`.
    let lower = header_value.to_ascii_lowercase();
    // RFC 2045 Section 5.1: parameter names are case-insensitive.
    // RFC 2231 Section 4: param*=charset'lang'value
    let name_star = format!("{}*", param_name.to_ascii_lowercase());
    let mut search_from = 0;

    // Locate the `=` that belongs to the first acceptable `param*`.
    let eq_pos = loop {
        let abs_idx = search_from + lower[search_from..].find(&name_star)?;
        let after_star = abs_idx + name_star.len();
        // Whatever happens with this occurrence, the next search resumes
        // right after it.
        search_from = after_star;

        // Must start a parameter and sit outside quoted-strings
        // (RFC 5322 Section 3.2.4) and comments (RFC 5322 Section 3.2.2).
        if !is_param_boundary(&lower, abs_idx)
            || is_inside_quotes(&lower, abs_idx)
            || is_inside_comment(&lower, abs_idx)
        {
            continue;
        }

        // Tolerate CFWS before `=` per Postel's law (RFC 1122 Section 1.2.2).
        // A digit here instead of `=` would be a continuation (`param*0=`).
        let candidate = skip_cfws(header_value, after_star);
        if lower.as_bytes().get(candidate) == Some(&b'=') {
            break candidate;
        }
    };

    // Skip the `=` and any CFWS in front of the value.
    let val_start = skip_cfws(header_value, eq_pos + 1);
    let rest = &header_value[val_start..];

    // Postel's law: the ext-value syntax is not a quoted-string, but some
    // mailers quote it anyway. For a quoted value the closing quote delimits
    // it, so a raw `;` inside the quotes stays part of the value
    // (RFC 5322 Section 3.2.4). Without a closing quote, fall back to the
    // next `;` so a malformed value does not swallow later parameters.
    let value = match rest.strip_prefix('"') {
        Some(inner) => {
            let close = find_closing_quote(inner);
            if close < inner.len() {
                inner[..close].trim()
            } else {
                let fallback_end = inner.find(';').unwrap_or(inner.len());
                inner[..fallback_end].trim_end()
            }
        }
        None => {
            let end = rest.find(';').unwrap_or(rest.len());
            rest[..end].trim()
        }
    };

    // Format: charset'language'percent-encoded-value (language is ignored).
    let (charset, after_charset) = value.split_once('\'')?;
    let (_language, encoded) = after_charset.split_once('\'')?;

    let decoded_bytes = strict_percent_decode(encoded)?;
    Some(encoded_words::decode_charset(charset, &decoded_bytes))
}

/// Joins the RFC 2231 continuation sections of `param_name` (`param*0=`,
/// `param*1*=`, ...) found in `header_value` into one decoded string.
///
/// Sections whose name ends in `*` are percent-encoded; the first such
/// section is expected to carry the `charset'language'` prefix that applies
/// to the whole value. Sections without the trailing `*` are taken verbatim.
///
/// Returns `None` when no sections are found or when section 0 is missing.
/// Sections after the first gap in the numbering are dropped.
///
/// # References
/// - RFC 2231 Section 3 (parameter value continuations)
/// - RFC 2231 Section 4 (charset and language information)
pub(crate) fn extract_rfc2231_continuation(header_value: &str, param_name: &str) -> Option<String> {
    let mut sections = collect_rfc2231_continuation_sections(header_value, param_name);

    // The collector is expected to hand sections back in index order already;
    // sorting here keeps the contiguity check below independent of that.
    sections.sort_by_key(|section| section.0);

    // RFC 2231 Section 3: numbering starts at 0 and must not skip. Count how
    // many leading sections sit at their expected index; zero means either
    // nothing was found or section 0 is absent, and in both cases there is no
    // value to report. Anything past the first gap is discarded so malformed
    // input cannot contribute bytes from unrelated fragments.
    let contiguous_len = sections
        .iter()
        .zip(0..=u32::MAX)
        .take_while(|(section, expected)| section.0 == *expected)
        .count();
    if contiguous_len == 0 {
        return None;
    }
    sections.truncate(contiguous_len);

    // The charset/language prefix is looked for on the first encoded section,
    // which need not be section 0 when the leading sections are plain. After
    // truncation every section's index equals its position in the vector.
    let charset_slot = sections
        .iter()
        .position(|(_, _, encoded)| encoded.is_some());

    let mut declared_charset = String::new();
    let mut raw_bytes: Vec<u8> = Vec::new();
    for (slot, (_, plain, encoded)) in sections.iter().enumerate() {
        let carries_charset = charset_slot == Some(slot);
        match (encoded.as_deref(), plain.as_deref()) {
            (Some(extended), fallback) if carries_charset => {
                let mut fields = extended.splitn(3, '\'');
                if let (Some(name), Some(_language), Some(payload)) =
                    (fields.next(), fields.next(), fields.next())
                {
                    declared_charset = name.to_string();
                    raw_bytes.extend(percent_decode(payload));
                } else {
                    // Malformed prefix: prefer a plain duplicate of the same
                    // section if one exists, otherwise keep the encoded text
                    // untouched rather than guessing at its structure.
                    raw_bytes.extend_from_slice(fallback.unwrap_or(extended).as_bytes());
                }
            }
            // Later encoded sections are percent-encoded only (no prefix).
            (Some(extended), _) => raw_bytes.extend(percent_decode(extended)),
            (None, Some(text)) => raw_bytes.extend_from_slice(text.as_bytes()),
            (None, None) => {}
        }
    }

    // RFC 2231 Section 4: with no declared charset the default would be
    // US-ASCII (RFC 2045 Section 5.2). UTF-8 is used instead as a Postel's law
    // accommodation: ASCII-only values decode identically, while non-ASCII
    // bytes from non-conformant senders are preserved rather than being
    // mis-interpreted through encoding_rs's us-ascii → Windows-1252 mapping.
    let charset = if declared_charset.is_empty() {
        "utf-8"
    } else {
        declared_charset.as_str()
    };

    Some(encoded_words::decode_charset(charset, &raw_bytes))
}

/// Scans `header_value` for every `param*N=` / `param*N*=` continuation
/// section of `param_name` and returns them ordered by `N`.
///
/// Each returned tuple is `(index, plain_value, encoded_value)`. When the same
/// index occurs more than once, the first plain occurrence and the first
/// encoded occurrence are kept independently so the caller can choose between
/// them. No upper limit is placed on `N` beyond what fits in a `u32`.
///
/// Occurrences are ignored when they do not start at a parameter boundary,
/// lie inside a quoted-string or comment, have no digits (that is the
/// standalone `param*=` form), have a zero-padded index, or are not followed
/// by `=`.
///
/// # References
/// - RFC 2231 Section 3 (parameter continuations)
/// - RFC 2045 Section 5.1 (parameter syntax)
/// - RFC 5322 Sections 3.2.2, 3.2.4 (comments and quoted-strings)
fn collect_rfc2231_continuation_sections(
    header_value: &str,
    param_name: &str,
) -> Vec<(u32, Option<String>, Option<String>)> {
    use std::collections::BTreeMap;

    // ASCII lowercasing keeps byte offsets identical to `header_value`, so
    // positions found in `lower` can be used to slice the original.
    let lower = header_value.to_ascii_lowercase();
    let lower_bytes = lower.as_bytes();
    let needle = format!("{}*", param_name.to_ascii_lowercase());

    let mut by_index: BTreeMap<u32, (Option<String>, Option<String>)> = BTreeMap::new();
    let mut cursor = 0;

    while let Some(offset) = lower[cursor..].find(&needle) {
        let name_at = cursor + offset;
        let digits_at = name_at + needle.len();
        // Whatever happens with this occurrence, the next search resumes
        // immediately after the matched `name*` text.
        cursor = digits_at;

        let is_real_param = is_param_boundary(&lower, name_at)
            && !is_inside_quotes(&lower, name_at)
            && !is_inside_comment(&lower, name_at);
        if !is_real_param {
            continue;
        }

        let digit_count = lower_bytes[digits_at..]
            .iter()
            .take_while(|byte| byte.is_ascii_digit())
            .count();
        if digit_count == 0 {
            // `param*=` is the standalone RFC 2231 form, not a continuation.
            continue;
        }
        let after_digits = digits_at + digit_count;
        let digits = &lower[digits_at..after_digits];
        if digit_count > 1 && digits.starts_with('0') {
            // Zero-padded section numbers such as `*01` are rejected.
            continue;
        }

        // A `*` directly after the digits marks the section as encoded.
        let is_encoded = lower_bytes.get(after_digits) == Some(&b'*');
        let name_end = after_digits + usize::from(is_encoded);

        let eq_pos = skip_cfws(header_value, name_end);
        if lower_bytes.get(eq_pos) != Some(&b'=') {
            continue;
        }

        // Indices too large for `u32` are skipped rather than wrapped.
        let Ok(index) = digits.parse::<u32>() else {
            continue;
        };

        let val_start = skip_cfws(header_value, eq_pos + 1);
        if let Some(value) = extract_param_value(&header_value[val_start..]) {
            let (plain_slot, encoded_slot) = by_index.entry(index).or_default();
            let slot = if is_encoded { encoded_slot } else { plain_slot };
            // First occurrence wins; later duplicates of the same kind are dropped.
            slot.get_or_insert(value);
        }
    }

    by_index
        .into_iter()
        .map(|(index, (plain, encoded))| (index, plain, encoded))
        .collect()
}

/// Finds a parameter value in a header, given a lowercase pattern like `"filename*0="`.
///
/// `lower` is searched for the pattern (minus its trailing `=`); the value is
/// then sliced out of `original` at the same byte offsets, so the two strings
/// are assumed to differ only in ASCII case — TODO confirm at call sites.
///
/// A match is accepted only when it starts at a parameter boundary, is not
/// inside a quoted-string or comment, and is followed (after optional CFWS)
/// by `=`. Whitespace around `=` is tolerated per Postel's law
/// (RFC 1122 Section 1.2.2). The first accepted match decides the result:
/// whatever `extract_param_value` returns for the text after `=` is returned.
///
/// Returns `None` when no occurrence qualifies, or when the pattern names no
/// parameter at all (`""` or `"="`). An empty name would otherwise match at
/// every offset without advancing the search, so the scan could never finish.
///
/// # References
/// - RFC 2045 Section 5.1 (parameter syntax)
/// - RFC 5322 Section 3.2.4 (quoted-string)
#[cfg(test)]
pub(crate) fn find_param_value(lower: &str, original: &str, pattern: &str) -> Option<String> {
    // Strip the trailing '=' from the pattern so we can handle optional
    // whitespace around it, matching the tolerance in `extract_param`.
    let name_pattern = pattern.strip_suffix('=').unwrap_or(pattern);
    if name_pattern.is_empty() {
        // `str::find("")` always succeeds at offset 0, which would leave
        // `search_from` unchanged on every rejected match.
        return None;
    }

    let mut search_from = 0;
    while let Some(rel_idx) = lower[search_from..].find(name_pattern) {
        let abs_idx = search_from + rel_idx;
        let after_name = abs_idx + name_pattern.len();
        // Any rejected match resumes the search right after the matched name.
        search_from = after_name;

        // Must start a parameter and must not sit inside a quoted-string
        // (RFC 5322 Section 3.2.4) or a comment (RFC 5322 Section 3.2.2).
        if !is_param_boundary(lower, abs_idx)
            || is_inside_quotes(lower, abs_idx)
            || is_inside_comment(lower, abs_idx)
        {
            continue;
        }

        // Skip optional whitespace before '=' and verify '=' is present.
        let eq_pos = skip_cfws(original, after_name);
        if lower.as_bytes().get(eq_pos) != Some(&b'=') {
            continue;
        }

        // Skip optional CFWS after '=' in structured MIME fields.
        let val_start = skip_cfws(original, eq_pos + 1);
        return extract_param_value(&original[val_start..]);
    }

    None
}

/// Leniently decodes `%HH` escapes in `input` into raw bytes.
///
/// A `%` followed by two hex digits becomes the byte they denote. A `%` that
/// is not followed by two hex digits (including one too close to the end of
/// the input) is copied through unchanged, as is every other byte.
///
/// # References
/// - RFC 2231 Section 4 (percent-encoding in parameters)
/// - RFC 3986 Section 2.1 (percent-encoding)
pub(crate) fn percent_decode(input: &str) -> Vec<u8> {
    let mut decoded = Vec::with_capacity(input.len());
    let mut remaining = input.as_bytes();

    while let Some((&first, tail)) = remaining.split_first() {
        if first == b'%' {
            // Needs two more bytes, and both must be hex digits.
            if let [high, low, after @ ..] = tail {
                if let Some(byte) = decode_hex_pair(*high, *low) {
                    decoded.push(byte);
                    remaining = after;
                    continue;
                }
            }
        }
        // Ordinary byte, or a `%` that does not start a valid escape.
        decoded.push(first);
        remaining = tail;
    }

    decoded
}

/// Decodes `%HH` escapes in `input`, refusing any malformed escape.
///
/// Unlike [`percent_decode`], a `%` that is not followed by two hex digits
/// makes the whole call return `None`, so an unusable standalone extended
/// parameter (`param*=`) can fall back to a valid plain parameter instead of
/// surfacing corrupted output. Bytes other than `%` are copied as-is.
///
/// # References
/// - RFC 2231 Section 4 (percent-encoding in parameters)
/// - RFC 3986 Section 2.1 (percent-encoding)
fn strict_percent_decode(input: &str) -> Option<Vec<u8>> {
    let mut decoded = Vec::with_capacity(input.len());
    let mut remaining = input.as_bytes();

    while let Some((&first, tail)) = remaining.split_first() {
        if first != b'%' {
            decoded.push(first);
            remaining = tail;
            continue;
        }

        // A `%` must be followed by exactly two hex digits; anything else
        // (too few bytes, or a non-hex byte) invalidates the whole value.
        let [high, low, after @ ..] = tail else {
            return None;
        };
        decoded.push(decode_hex_pair(*high, *low)?);
        remaining = after;
    }

    Some(decoded)
}

// ---------------------------------------------------------------------------
// Shared low-level utility functions
// ---------------------------------------------------------------------------

/// Combines two ASCII hex digits into the byte they spell.
///
/// `high` supplies the upper four bits and `low` the lower four. Returns
/// `None` if either argument is not a hex digit.
///
/// # References
/// - RFC 2045 Section 6.7 (quoted-printable hex encoding)
/// - RFC 2047 Section 4.2 (Q-encoding hex)
pub(crate) fn decode_hex_pair(high: u8, low: u8) -> Option<u8> {
    // `low` is only examined once `high` has been accepted.
    hex_digit(high).and_then(|upper| hex_digit(low).map(|lower| upper * 16 + lower))
}

/// Maps one ASCII hex digit (`0-9`, `A-F`, `a-f`) to its value `0..=15`.
///
/// Any other byte yields `None`.
///
/// # References
/// - RFC 2045 Section 6.7 (quoted-printable)
pub(crate) fn hex_digit(b: u8) -> Option<u8> {
    // `char::from(u8)` maps the byte to U+0000..=U+00FF; within that range
    // only the ASCII hex digits are accepted by `to_digit(16)`.
    char::from(b)
        .to_digit(16)
        .and_then(|value| u8::try_from(value).ok())
}

/// Removes exactly one enclosing pair of double quotes, if present.
///
/// When `input` both begins and ends with `"` (and those are two distinct
/// characters), the slice between them is returned; otherwise `input` comes
/// back untouched. Unlike `trim_matches('"')`, this never strips more than
/// one quote per side, which matters when the content itself ends with an
/// escaped quote, as in `"She said \"hello\""`.
///
/// # References
/// - RFC 5322 Section 3.2.4 (quoted-string structure)
#[cfg(test)]
pub(crate) fn strip_outer_quotes(input: &str) -> &str {
    // Stripping the prefix first guarantees a lone `"` cannot serve as both
    // the opening and the closing quote.
    input
        .strip_prefix('"')
        .and_then(|rest| rest.strip_suffix('"'))
        .unwrap_or(input)
}