daaki-message 0.2.0

RFC 5322 email message parser and builder
Documentation
//! RFC 2047 encoded-word decoding.
//!
//! Handles `=?charset?encoding?text?=` tokens in header values, including
//! whitespace collapsing between adjacent encoded words.
//!
//! # References
//! - RFC 2047 Section 2 (encoded-word syntax)
//! - RFC 2047 Section 4.2 (Q-encoding)
//! - RFC 2047 Section 5 (valid encoded-word placement)
//! - RFC 2047 Section 6.2 (whitespace between encoded words)
//! - RFC 2231 Section 5 (charset language tag)

use super::{params, LENIENT_BASE64};

use base64::Engine as _;

/// Decodes every RFC 2047 encoded word found in a header value.
///
/// Scans `input` for `=?charset?encoding?text?=` tokens. A token is decoded
/// only when it sits at a valid boundary on both sides (start/end of the
/// value, or adjacent SP/HTAB); anything else is copied to the output
/// verbatim. Linear whitespace that separates two *valid* encoded words is
/// dropped, as required by RFC 2047 Section 6.2.
///
/// The leading-boundary check is made against the real text of `input`
/// preceding the candidate, not against the not-yet-consumed remainder.
/// This matters after a rejected `=?`: in `x=?=?UTF-8?Q?a?=` the second
/// `=?` directly follows literal text and therefore must not be decoded.
///
/// # References
/// - RFC 2047 Section 2 (encoded-word syntax)
/// - RFC 2047 Section 5 (valid encoded-word placement)
/// - RFC 2047 Section 6.2 (whitespace between encoded words)
pub(crate) fn decode_encoded_words(input: &str) -> String {
    let mut result = String::with_capacity(input.len());
    // Byte offset into `input` of the first character not yet processed.
    let mut pos = 0;

    while let Some(offset) = input[pos..].find("=?") {
        let start = pos + offset;

        // Literal text in front of the candidate.
        result.push_str(&input[pos..start]);
        let candidate = &input[start..];

        // Boundary check uses the true prefix of the header value so that a
        // candidate glued to a previously rejected "=?" is not mistaken for
        // one at the start of the field body.
        let decoded = if has_linear_whitespace_boundary_before(&input[..start]) {
            try_decode_encoded_word(candidate)
                .filter(|(_, consumed)| has_linear_whitespace_boundary_after(candidate, *consumed))
        } else {
            None
        };

        let Some((text, consumed)) = decoded else {
            // Not a valid encoded word — emit literal "=?" and advance.
            result.push_str("=?");
            pos = start + 2;
            continue;
        };

        result.push_str(&text);
        pos = start + consumed;

        // RFC 2047 Section 6.2: collapse whitespace between adjacent
        // encoded words, but ONLY when the next token is actually a valid,
        // complete encoded word. If it starts with `=?` but fails to
        // decode, the whitespace must be preserved.
        let trimmed = input[pos..].trim_start_matches([' ', '\t']);
        let next_is_valid = trimmed.starts_with("=?")
            && try_decode_encoded_word(trimmed).is_some_and(|(_, consumed)| {
                has_linear_whitespace_boundary_after(trimmed, consumed)
            });
        if next_is_valid {
            pos = input.len() - trimmed.len();
        }
    }

    // Whatever follows the last "=?" (or the whole value if there was none).
    result.push_str(&input[pos..]);
    result
}

/// Reports whether an encoded-word candidate starts at a legal position.
///
/// `prefix` is the text that precedes the candidate. The position is legal
/// when there is no preceding text at all, or when the last preceding
/// character is SP or HTAB (RFC 2047 Section 5, `*text` / `phrase`
/// contexts).
fn has_linear_whitespace_boundary_before(prefix: &str) -> bool {
    matches!(prefix.chars().next_back(), None | Some(' ' | '\t'))
}

/// Reports whether an encoded-word candidate stops at a legal position.
///
/// `consumed` is the byte length of the encoded word at the front of
/// `candidate`. The position is legal when nothing follows the encoded
/// word, or when the very next character is SP or HTAB (RFC 2047
/// Section 5, `*text` / `phrase` contexts).
///
/// Slicing panics if `consumed` is past the end of `candidate` or not on a
/// character boundary.
fn has_linear_whitespace_boundary_after(candidate: &str, consumed: usize) -> bool {
    let tail = &candidate[consumed..];
    tail.is_empty() || tail.starts_with([' ', '\t'])
}

/// Attempts to decode a single RFC 2047 encoded word at the start of `input`.
///
/// Returns `Some((decoded_text, bytes_consumed))` on success, where
/// `bytes_consumed` is the length in bytes of the raw token in `input`
/// (delimiters and any fold-introduced spaces included). Returns `None`
/// when `input` does not start with a well-formed encoded word, so the
/// caller can pass the raw text through untouched.
///
/// A token is rejected when:
/// - any of the `=?`, `?`, `?`, `?=` delimiters is missing,
/// - the charset (after removing an RFC 2231 `*language` suffix) is empty,
/// - the encoded-text is empty once fold-introduced SP has been removed,
/// - the encoded-text contains anything other than printable ASCII
///   excluding `?`,
/// - the encoding is neither `B` nor `Q` (case-insensitive), or
/// - the payload fails Base64 decoding / contains an incomplete `=XX`
///   escape.
///
/// # References
/// - RFC 2047 Section 2 (encoded-word format)
/// - RFC 2231 Section 5 (charset language tag)
fn try_decode_encoded_word(input: &str) -> Option<(String, usize)> {
    // Format: =?charset?encoding?encoded_text?=
    let after_open = input.strip_prefix("=?")?;
    let (charset_raw, after_charset) = after_open.split_once('?')?;
    let (encoding, after_encoding) = after_charset.split_once('?')?;
    let (raw_text, _) = after_encoding.split_once("?=")?;

    // RFC 2231 Section 5: charset may include a language tag separated by
    // '*' (e.g., "UTF-8*en"). Strip it before charset lookup.
    let charset = charset_raw
        .split_once('*')
        .map_or(charset_raw, |(name, _)| name);

    // RFC 2047 Section 2: charset = token = 1*<...>, so it must be
    // non-empty. A clearly broken encoded word is left undecoded
    // (Postel's law: pass through what you cannot parse).
    if charset.is_empty() {
        return None;
    }

    // Header unfolding can inject a literal SP into encoded-text when a
    // non-conformant encoder folds mid-token. Strip SP before validation so
    // the encoded word can still decode (Postel's law, RFC 1122 Section
    // 1.2.2). Fold-generated HTAB is normalized earlier by the wire-layer
    // unfolding logic, so any remaining HTAB here is a raw control character
    // that must still cause rejection.
    let encoded_text: String = raw_text.chars().filter(|&c| c != ' ').collect();

    // RFC 2047 Section 2: encoded-text = 1*<...>. The emptiness check is
    // made AFTER stripping SP so that a payload consisting solely of spaces
    // is rejected rather than silently decoded to an empty string.
    if encoded_text.is_empty() {
        return None;
    }

    // RFC 2047 Section 2: encoded-text = 1*<Any printable ASCII character
    // other than "?" or SPACE>.  Printable ASCII is 33–126 (0x21–0x7E).
    // Reject remaining control characters, DEL, high bytes, and '?' so the
    // caller falls back to literal text for malformed encoded words.
    if encoded_text
        .bytes()
        .any(|b| !b.is_ascii_graphic() || b == b'?')
    {
        return None;
    }

    // Length of the raw token in `input`: "=?" charset "?" encoding "?"
    // encoded-text "?=". Uses the unstripped slices so the caller advances
    // past exactly what was present in the header.
    //
    // RFC 2047 Section 2: an encoded-word MUST NOT be more than 75
    // characters long, including delimiters. However, that is a generation
    // constraint, not a parsing constraint (RFC 2047 Section 6.1). Per
    // Postel's law (RFC 1122 Section 1.2.2), we accept overlong encoded
    // words since real-world mailers (Thunderbird, Asian-locale clients)
    // frequently produce them. The encoder still enforces the 75-char
    // limit when building outgoing messages.
    let consumed = 2 + charset_raw.len() + 1 + encoding.len() + 1 + raw_text.len() + 2;

    let bytes = if encoding.eq_ignore_ascii_case("B") {
        // RFC 2045 Section 6.8: "Any characters outside of the base64
        // alphabet are to be ignored in base64-encoded data."
        // Strip non-base64-alphabet characters that may remain after
        // fold-whitespace removal, matching the body decoder's behavior
        // in `decode_transfer_encoding`.
        let cleaned: Vec<u8> = encoded_text
            .bytes()
            .filter(|b| b.is_ascii_alphanumeric() || *b == b'+' || *b == b'/' || *b == b'=')
            .collect();
        LENIENT_BASE64.decode(&cleaned).ok()?
    } else if encoding.eq_ignore_ascii_case("Q") {
        // Inside a full encoded word every '=' must start a complete
        // `=XX` escape; otherwise the token is invalid.
        if !is_valid_q_encoded_word_text(&encoded_text) {
            return None;
        }
        decode_q_encoding(&encoded_text)
    } else {
        return None;
    };

    Some((decode_charset(charset, &bytes), consumed))
}

/// Checks that every `=` in Q-encoded text begins a complete `=XX` escape.
///
/// Walks `input` from `=` to `=`. Each one must be followed by two bytes
/// that `params::decode_hex_pair` accepts; the scan then resumes after
/// those two bytes. A trailing `=`, an `=` with only one byte after it, or
/// an `=` followed by a pair the helper rejects makes the text invalid.
/// Bytes other than `=` are not inspected.
///
/// # References
/// - RFC 2047 Section 2 (encoded-word syntax)
/// - RFC 2047 Section 4.2 (Q encoding)
fn is_valid_q_encoded_word_text(input: &str) -> bool {
    let mut rest = input.as_bytes();

    while let Some(eq) = rest.iter().position(|&b| b == b'=') {
        match &rest[eq + 1..] {
            [hi, lo, tail @ ..] if params::decode_hex_pair(*hi, *lo).is_some() => {
                // Complete escape: continue scanning after the hex pair.
                rest = tail;
            }
            // Fewer than two bytes left, or not a decodable hex pair.
            _ => return false,
        }
    }

    true
}

/// Decodes RFC 2047 Q-encoding (a variant of quoted-printable) into bytes.
///
/// - `=XX`, when `params::decode_hex_pair` accepts the two bytes after the
///   `=`, yields the byte the helper returns.
/// - `_` yields a space (0x20).
/// - Every other byte is copied unchanged. This includes an `=` that is not
///   followed by two decodable bytes: the decoder is lenient and passes it
///   through literally rather than failing.
///
/// # References
/// - RFC 2047 Section 4.2 (Q-encoding)
pub(crate) fn decode_q_encoding(input: &str) -> Vec<u8> {
    let src = input.as_bytes();
    let mut out = Vec::with_capacity(src.len());
    let mut rest = src;

    while let [first, tail @ ..] = rest {
        if *first == b'=' {
            if let [hi, lo, after @ ..] = tail {
                if let Some(byte) = params::decode_hex_pair(*hi, *lo) {
                    out.push(byte);
                    rest = after;
                    continue;
                }
            }
        }

        // Underscore represents space in Q-encoding (RFC 2047 Section 4.2);
        // anything else, including a malformed '=', is emitted as-is.
        out.push(if *first == b'_' { b' ' } else { *first });
        rest = tail;
    }

    out
}

/// Converts `bytes` in the named `charset` to a UTF-8 `String`, lossily.
///
/// The labels `utf-8` and `utf8` (any ASCII case) are handled directly with
/// `String::from_utf8_lossy`. Any other label is resolved through
/// `encoding_rs::Encoding::for_label`; a label that lookup does not
/// recognise falls back to UTF-8.
///
/// NOTE(review): per the encoding_rs documentation, `Encoding::decode`
/// performs BOM sniffing, so the non-fast path may treat a leading BOM
/// differently from the UTF-8 fast path — confirm this is intended for
/// explicitly labelled charsets.
///
/// # References
/// - RFC 2047 Section 2 (charset names)
/// - RFC 6532 (UTF-8 headers)
pub(crate) fn decode_charset(charset: &str, bytes: &[u8]) -> String {
    let is_utf8 = ["utf-8", "utf8"]
        .iter()
        .any(|label| charset.eq_ignore_ascii_case(label));
    if is_utf8 {
        return String::from_utf8_lossy(bytes).into_owned();
    }

    let encoding = match encoding_rs::Encoding::for_label(charset.as_bytes()) {
        Some(known) => known,
        None => encoding_rs::UTF_8,
    };
    let (text, _, _) = encoding.decode(bytes);
    text.into_owned()
}