daaki-message 0.2.0

//! Wire-level (syntactic) email message parser.
//!
//! Splits raw email bytes into structural components — header block, body,
//! individual header fields — without performing any semantic interpretation
//! such as RFC 2047 decoding, address parsing, or date parsing.
//!
//! # References
//! - RFC 5322 Section 2 (lexical analysis and message structure)
//! - RFC 2046 Section 5.1 (MIME multipart boundary splitting)

use crate::error::Error;

/// A parsed message at the wire level — structure only, no semantic
/// interpretation.
///
/// Contains the raw header pairs (name lowercased, value unfolded but NOT
/// RFC 2047 decoded), the body bytes, the raw header block as a string,
/// and the total message size.
///
/// Produced by `parse_wire`. When the input is classified as headerless,
/// `headers` and `raw_headers` are both empty and `headerless` is `true`.
///
/// # References
/// - RFC 5322 Section 2.1 (general description of message format)
pub(crate) struct WireMessage {
    /// Header pairs: `(lowercase_name, unfolded_value)`.
    ///
    /// Values have continuation lines unfolded per RFC 5322 Section 2.2.3
    /// but are NOT RFC 2047 decoded. Pairs are stored in the order the
    /// fields appear in the header block; repeated field names are kept as
    /// separate entries.
    pub headers: Vec<(String, String)>,
    /// The raw body bytes after the header/body separator.
    ///
    /// For a headerless message this is the data that was classified as
    /// body text rather than anything following a header block.
    pub body: Vec<u8>,
    /// The entire header block as a string (for `ParsedEmail::raw_headers`).
    ///
    /// Built with a lossy UTF-8 conversion, so invalid byte sequences are
    /// replaced rather than rejected. Empty for headerless messages.
    pub raw_headers: String,
    /// Total size of the original input in bytes.
    pub size: u64,
    /// Whether the top-level message was classified as headerless body text.
    pub headerless: bool,
}

/// Turns raw email bytes into a [`WireMessage`].
///
/// The input is classified (regular message vs. headerless body text), split
/// into a header block and a body, and the header block is broken into
/// unfolded `(lowercase_name, value)` pairs. Nothing is interpreted beyond
/// that: RFC 2047 encoded words, addresses, dates, and MIME parameters are
/// passed through as raw strings for the interpreter layer.
///
/// # Errors
/// Returns `Error::EmptyInput` when `raw` contains no bytes at all.
///
/// # References
/// - RFC 5322 Section 2 (message format)
/// - RFC 5322 Section 2.2.3 (long header fields / folding)
pub(crate) fn parse_wire(raw: &[u8]) -> Result<WireMessage, Error> {
    if raw.is_empty() {
        return Err(Error::EmptyInput);
    }

    // Locate the header/body boundary (RFC 5322 Section 2.1). Input with no
    // syntactically valid header field that still reads as text is surfaced
    // as a headerless message instead of failing later with MissingFrom.
    let (header_bytes, body_bytes, headerless) = classify_top_level_message(raw);

    // A headerless message exposes neither parsed fields nor a raw header
    // block. Otherwise the fields are unfolded per RFC 5322 Section 2.2.3
    // (values stay undecoded) and the block itself is kept as lossy UTF-8.
    let (headers, raw_headers) = if headerless {
        (Vec::new(), String::new())
    } else {
        (
            parse_headers(header_bytes),
            String::from_utf8_lossy(header_bytes).into_owned(),
        )
    };

    Ok(WireMessage {
        headers,
        body: body_bytes.to_vec(),
        raw_headers,
        size: raw.len() as u64,
        headerless,
    })
}

// ---------------------------------------------------------------------------
// Header parsing
// ---------------------------------------------------------------------------

/// Splits raw message bytes into (headers, body) at the first blank line.
///
/// Tries `\r\n\r\n` first, falls back to `\n\n`, and tolerates `\r\r`
/// from non-conformant sources per Postel's law (RFC 1122 Section 1.2.2).
/// Also handles MIME parts with no headers: if the input starts with
/// `\r\n`, `\n`, or bare `\r`, the header section is empty and the rest is body.
/// If no separator is found, the entire input is treated as headers.
///
/// # References
/// - RFC 5322 Section 2.1 (header/body separation)
/// - RFC 2046 (MIME body-part may have empty header section)
/// - RFC 1122 Section 1.2.2 (Postel's law)
pub(super) fn split_header_body(raw: &[u8]) -> (&[u8], &[u8]) {
    // Handle MIME parts with no headers: input starts with a blank line
    // (RFC 2046 — a body-part may have an empty header section).
    if raw.starts_with(b"\r\n") {
        return (&[], &raw[2..]);
    }
    if raw.starts_with(b"\n") {
        return (&[], &raw[1..]);
    }
    // A leading bare CR (\r not followed by \n) is only an empty-header
    // separator when it looks like a blank line — i.e., followed by another
    // \r, \n, or end-of-input. If the byte after the CR is a printable
    // ASCII character (potential header field name start), the CR is a stray
    // control character rather than an empty-header separator. Skip it and
    // continue parsing the rest as normal headers+body.
    // This prevents a stray leading \r from misclassifying valid headers
    // as body text (Postel's law, RFC 1122 Section 1.2.2).
    if raw.starts_with(b"\r") {
        let next = raw.get(1).copied();
        if next.is_none() || next == Some(b'\r') || next == Some(b'\n') {
            return (&[], &raw[1..]);
        }
        // Skip the stray CR and re-split so the header block does not
        // start with a bare CR that would confuse parse_headers.
        return split_header_body(&raw[1..]);
    }
    if let Some(pos) = find_subsequence(raw, b"\r\n\r\n") {
        return (&raw[..pos], &raw[pos + 4..]);
    }
    if let Some(pos) = find_subsequence(raw, b"\n\n") {
        return (&raw[..pos], &raw[pos + 2..]);
    }
    if let Some(pos) = find_subsequence(raw, b"\r\r") {
        return (&raw[..pos], &raw[pos + 2..]);
    }
    // No body separator — treat entire input as headers
    (raw, &[])
}

/// Decides whether a top-level message is a regular header+body message or
/// recoverable body-only text.
///
/// RFC 5322 requires a header block before the body, but real-world maildrops
/// and truncated spool files sometimes contain body-only text. Such input is
/// accepted (Postel's law) as long as it looks textual; binary garbage is
/// left on the normal path.
///
/// Returns `(header_bytes, body_bytes, headerless)`. When `headerless` is
/// `true` the header slice is empty and the body slice holds the text that
/// was classified as body.
///
/// # References
/// - RFC 5322 Section 2 (message format)
/// - RFC 1122 Section 1.2.2 (Postel's law)
fn classify_top_level_message(raw: &[u8]) -> (&[u8], &[u8], bool) {
    let (header_bytes, body_bytes) = split_header_body(raw);
    let no_headers: &[u8] = &[];

    match (header_bytes.is_empty(), body_bytes.is_empty()) {
        // Explicitly empty header block (`\r\n<body>` / `\n<body>`): the
        // payload is headerless body text exactly when it is still usable.
        (true, false) => (
            no_headers,
            body_bytes,
            looks_like_headerless_body(body_bytes),
        ),
        // No body was split off and not a single syntactically valid header
        // field can be extracted: treat the whole input as body text, but
        // only when it reads as text rather than binary garbage.
        (_, true)
            if parse_headers(header_bytes).is_empty() && looks_like_headerless_body(raw) =>
        {
            (no_headers, raw, true)
        }
        // Everything else is an ordinary message.
        _ => (header_bytes, body_bytes, false),
    }
}

/// Reports whether `raw` is plausibly a textual body that can be salvaged,
/// as opposed to binary garbage.
///
/// RFC 5322 / RFC 6532 body text may contain printable ASCII, CRLF, HTAB, and
/// 8-bit bytes. Any other C0 control (NUL included) or DEL is taken as a
/// strong sign that the data is not a usable headerless message. Empty input
/// is never considered a body.
///
/// # References
/// - RFC 5322 Section 2.3 (body)
/// - RFC 6532 (8-bit UTF-8 in headers and body)
fn looks_like_headerless_body(raw: &[u8]) -> bool {
    if raw.is_empty() {
        return false;
    }

    let is_forbidden = |byte: u8| {
        let line_structure = matches!(byte, b'\t' | b'\n' | b'\r');
        let control = byte < 0x20 || byte == 0x7F;
        control && !line_structure
    };

    !raw.iter().any(|&byte| is_forbidden(byte))
}

/// Breaks a raw header block into `(lowercase_name, value)` pairs.
///
/// Physical lines that start with SP or HTAB are continuation lines and are
/// unfolded into the field they follow (RFC 5322 Section 2.2.3). Values are
/// NOT RFC 2047 decoded — that is the interpreter layer's responsibility.
/// Parsing stops at the first empty line. Lines whose field name is not
/// valid, and lines with no colon at all, never produce a pair.
///
/// # References
/// - RFC 5322 Section 2.2 (header fields)
/// - RFC 5322 Section 2.2.3 (long header fields / folding)
pub(super) fn parse_headers(raw: &[u8]) -> Vec<(String, String)> {
    /// SP or HTAB in the first column marks a folded continuation line.
    fn begins_with_wsp(line: &str) -> bool {
        matches!(line.as_bytes().first().copied(), Some(b' ' | b'\t'))
    }

    let lossy = String::from_utf8_lossy(raw);
    // RFC 5322 Section 2.1 mandates CRLF, yet bare LF is already tolerated at
    // the top level. Bare CR gets the same treatment so that legacy CR-only
    // messages keep one logical field per physical line
    // (RFC 1122 Section 1.2.2).
    let unix_text = lossy.replace("\r\n", "\n").replace('\r', "\n");

    let mut fields: Vec<(String, String)> = Vec::new();
    // The field currently being accumulated: (name as written, value so far).
    let mut open_field: Option<(String, String)> = None;
    let mut fold_state = EncodedWordFoldState::default();
    let mut physical_lines = unix_text.split('\n').peekable();

    while let Some(line) = physical_lines.next() {
        if line.is_empty() {
            break;
        }

        if begins_with_wsp(line) {
            // Continuation line. Unfolding removes only the line break and
            // keeps the whitespace (RFC 5322 Section 2.2.3), except when the
            // fold sits inside RFC 2047 encoded-text, where the one
            // structural WSP byte is dropped. A continuation with no open
            // field is discarded.
            if let Some((_, value)) = open_field.as_mut() {
                let fragment = if fold_state.inside_encoded_text() {
                    strip_one_leading_wsp(line)
                } else {
                    line
                };
                value.push_str(fragment);
                fold_state.feed_str(fragment);
            }
            continue;
        }

        // A line without a colon that is not a continuation is skipped
        // without closing the field that is currently open.
        let Some(colon_at) = line.find(':') else {
            continue;
        };

        // Any line carrying a colon ends the previous field, whether or not
        // the new line turns out to be a valid field itself.
        if let Some((name, value)) = open_field.take() {
            fields.push((name.to_ascii_lowercase(), value));
            fold_state = EncodedWordFoldState::default();
        }

        let name = line[..colon_at].trim();
        if !is_valid_header_name(name) {
            // RFC 5322 Section 2.2: a malformed field-name is not a header.
            continue;
        }

        let after_colon = &line[colon_at + 1..];
        let folds_next = physical_lines
            .peek()
            .is_some_and(|next| begins_with_wsp(next));
        let wsp_only = !after_colon.is_empty()
            && after_colon.bytes().all(|b| matches!(b, b' ' | b'\t'));

        let value = if folds_next && wsp_only {
            // RFC 5322 Section 2.2.3: when the first physical line holds
            // nothing but WSP after `field-name:` and a fold follows, that
            // WSP survives unfolding because only the CRLF is removed.
            after_colon
        } else {
            // RFC 5322 Section 2.2: the field body may start right after the
            // colon or after one structural WSP separator. Only that single
            // separator byte is removed; further leading WSP stays in the
            // value.
            strip_one_leading_wsp(after_colon)
        };

        fold_state = EncodedWordFoldState::default();
        fold_state.feed_str(value);
        open_field = Some((name.to_string(), value.to_string()));
    }

    // The last field has no following line to close it.
    if let Some((name, value)) = open_field {
        fields.push((name.to_ascii_lowercase(), value));
    }

    fields
}

/// Removes at most one leading SP or HTAB from `line`.
///
/// RFC 5322 Section 2.2.3 unfolding normally keeps a continuation line's
/// first SP/HTAB byte. When the fold falls inside RFC 2047 encoded-text that
/// byte is structural rather than payload and must not reach the decoder, so
/// callers use this to drop it. Input that does not start with SP/HTAB is
/// returned unchanged.
fn strip_one_leading_wsp(line: &str) -> &str {
    match line.as_bytes().first().copied() {
        // SP and HTAB are single-byte ASCII, so index 1 is a char boundary.
        Some(b' ' | b'\t') => &line[1..],
        _ => line,
    }
}

/// Streaming state for whether header unfolding is currently inside RFC 2047
/// `encoded-text`.
///
/// RFC 2047 Section 2 defines `=?charset?encoding?encoded-text?=`. This state
/// machine tracks that syntax incrementally so folded continuation lines can
/// strip one structural SP/HTAB only when the fold lands inside the
/// `encoded-text` portion, without rescanning the entire accumulated header
/// value on every physical line.
///
/// The tracker is deliberately lenient about what appears inside
/// `encoded-text`, but it gives up on a candidate word as soon as SP/HTAB
/// shows up in the `charset` or `encoding` token. Those tokens cannot contain
/// whitespace, so ordinary prose such as `1+1=? Or 3? Maybe?` is not mistaken
/// for an encoded word.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
struct EncodedWordFoldState {
    /// Position within the `=?charset?encoding?encoded-text?=` syntax.
    phase: EncodedWordFoldPhase,
}

impl EncodedWordFoldState {
    /// Feed additional unfolded header text into the RFC 2047 fold tracker.
    fn feed_str(&mut self, text: &str) {
        for byte in text.bytes() {
            self.feed(byte);
        }
    }

    /// Returns `true` when the next unfolded continuation begins inside
    /// `encoded-text`.
    fn inside_encoded_text(self) -> bool {
        self.phase == EncodedWordFoldPhase::EncodedText
    }

    /// Consume one byte of header text and advance the phase.
    fn feed(&mut self, byte: u8) {
        self.phase = match (self.phase, byte) {
            // Looking for the `=` of an opening `=?`.
            (EncodedWordFoldPhase::Outside, b'=') => EncodedWordFoldPhase::SawEquals,
            (EncodedWordFoldPhase::Outside, _) => EncodedWordFoldPhase::Outside,

            // `==?` — the second `=` may itself open the word.
            (EncodedWordFoldPhase::SawEquals, b'?') => EncodedWordFoldPhase::Charset,
            (EncodedWordFoldPhase::SawEquals, b'=') => EncodedWordFoldPhase::SawEquals,
            (EncodedWordFoldPhase::SawEquals, _) => EncodedWordFoldPhase::Outside,

            // `charset` and `encoding` are RFC 2047 tokens and cannot hold
            // SP/HTAB; whitespace means this was never an encoded word.
            (EncodedWordFoldPhase::Charset, b'?') => EncodedWordFoldPhase::Encoding,
            (EncodedWordFoldPhase::Charset, b' ' | b'\t') => EncodedWordFoldPhase::Outside,
            (EncodedWordFoldPhase::Charset, _) => EncodedWordFoldPhase::Charset,

            (EncodedWordFoldPhase::Encoding, b'?') => EncodedWordFoldPhase::EncodedText,
            (EncodedWordFoldPhase::Encoding, b' ' | b'\t') => EncodedWordFoldPhase::Outside,
            (EncodedWordFoldPhase::Encoding, _) => EncodedWordFoldPhase::Encoding,

            (EncodedWordFoldPhase::EncodedText, b'?') => EncodedWordFoldPhase::MaybeClose,
            (EncodedWordFoldPhase::EncodedText, _) => EncodedWordFoldPhase::EncodedText,

            // `?=` closes the word. In `??=` the second `?` is itself a
            // potential start of the closing `?=`, so stay in MaybeClose
            // instead of falling back to EncodedText and missing the close.
            (EncodedWordFoldPhase::MaybeClose, b'=') => EncodedWordFoldPhase::Outside,
            (EncodedWordFoldPhase::MaybeClose, b'?') => EncodedWordFoldPhase::MaybeClose,
            (EncodedWordFoldPhase::MaybeClose, _) => EncodedWordFoldPhase::EncodedText,
        };
    }
}

/// Incremental phase for `EncodedWordFoldState`.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
enum EncodedWordFoldPhase {
    /// Not inside any encoded-word candidate.
    #[default]
    Outside,
    /// Just consumed `=`; a following `?` opens an encoded word.
    SawEquals,
    /// Inside the `charset` token.
    Charset,
    /// Inside the `encoding` token.
    Encoding,
    /// Inside `encoded-text`; folds here lose their structural WSP.
    EncodedText,
    /// Consumed `?` inside `encoded-text`; a following `=` closes the word.
    MaybeClose,
}

/// Checks `name` against RFC 5322 Section 2.2: `field-name = 1*ftext`.
///
/// `ftext` is any printable US-ASCII character (codes 33 through 126) other
/// than `:`. RFC 6532 widens header field bodies only, not field names, so a
/// name containing non-ASCII bytes is still invalid and must not be parsed as
/// a header. The empty string is rejected as well.
///
/// # References
/// - RFC 5322 Section 2.2 (header field syntax)
fn is_valid_header_name(name: &str) -> bool {
    if name.is_empty() {
        return false;
    }
    // `is_ascii_graphic` covers exactly U+0021..=U+007E.
    name.bytes()
        .all(|byte| byte.is_ascii_graphic() && byte != b':')
}

/// Cuts a multipart body into the slices that lie between boundary
/// delimiter lines.
///
/// Works with `\r\n`, `\n`, and bare `\r` line endings. If the closing
/// delimiter is missing (truncated input), whatever follows the last opening
/// delimiter is returned as the final part. Content before the first
/// delimiter and after the closing delimiter is not returned.
///
/// # References
/// - RFC 2046 Section 5.1.1 (multipart boundary delimiters)
pub(super) fn split_mime_parts<'a>(body: &'a [u8], boundary: &str) -> Vec<&'a [u8]> {
    let open_marker = format!("--{boundary}");
    let close_marker = format!("--{boundary}--");
    let open = open_marker.as_bytes();
    let close = close_marker.as_bytes();

    // Bytes allowed directly after the boundary text (or after the closing
    // `--`): the line ending, or optional trailing whitespace before it.
    let ends_delimiter = |byte: u8| matches!(byte, b'\r' | b'\n' | b' ' | b'\t');

    let mut segments: Vec<&'a [u8]> = Vec::new();
    let mut cursor: usize = 0;
    let mut segment_start: Option<usize> = None;

    while let Some(offset) = find_subsequence(&body[cursor..], open) {
        let hit = cursor + offset;
        let past = hit + open.len();

        // RFC 2046 Section 5.1.1: a delimiter must sit at the start of a
        // line — offset 0 or right after a line ending. CRLF is covered by
        // the LF test; bare LF and bare CR are accepted per Postel's law
        // (RFC 1122 Section 1.2.2).
        if hit > 0 && !matches!(body[hit - 1], b'\n' | b'\r') {
            cursor = past;
            continue;
        }

        // RFC 2046 Section 5.1.1: the delimiter line is
        //   `"--" boundary [LWSP] CRLF`
        // so the byte after the boundary text must be a terminator, the `-`
        // of a closing `--`, or end-of-input. Without this, boundary `abc`
        // would wrongly match `--abcdef`.
        match body.get(past).copied() {
            None => {}
            Some(b'-') => {
                // A closing delimiter is exactly `"--" boundary "--"`
                // followed by a terminator or end-of-input. A lone `-`, or
                // other text after the closing `--`, is body content.
                if body.get(past + 1) != Some(&b'-') {
                    cursor = past + 1;
                    continue;
                }
                let after_close = past + 2;
                if let Some(&trailer) = body.get(after_close) {
                    if !ends_delimiter(trailer) {
                        cursor = after_close;
                        continue;
                    }
                }
            }
            Some(other) if !ends_delimiter(other) => {
                cursor = past;
                continue;
            }
            Some(_) => {}
        }

        // Emit the part that this delimiter terminates, minus the line
        // ending that belongs to the delimiter (CRLF, bare LF, or bare CR).
        if let Some(start) = segment_start {
            let preceding = &body[..hit];
            let end = if preceding.ends_with(b"\r\n") {
                hit - 2
            } else if matches!(preceding.last().copied(), Some(b'\n' | b'\r')) {
                hit - 1
            } else {
                hit
            };
            if start <= end {
                segments.push(&body[start..end]);
            }
        }

        // The closing delimiter ends the multipart body; nothing after it
        // is a part.
        if body[hit..].starts_with(close) {
            return segments;
        }

        // Move to the first byte after the delimiter line: optional trailing
        // whitespace, then an optional CR and an optional LF.
        let mut resume = past;
        while matches!(body.get(resume).copied(), Some(b' ' | b'\t')) {
            resume += 1;
        }
        if body.get(resume) == Some(&b'\r') {
            resume += 1;
        }
        if body.get(resume) == Some(&b'\n') {
            resume += 1;
        }

        segment_start = Some(resume);
        cursor = resume;
    }

    // No further delimiter: keep the trailing content of a part that was
    // already opened (tolerance for truncated input).
    if let Some(start) = segment_start {
        if start < body.len() {
            segments.push(&body[start..]);
        }
    }

    segments
}

/// Finds the first occurrence of `needle` in `haystack`.
///
/// Returns the byte offset of the first match, or `None` when `needle` does
/// not occur (including when it is longer than `haystack`). An empty
/// `needle` matches at offset 0, mirroring `str::find("")`; it is handled
/// explicitly because `slice::windows` panics on a window size of zero.
///
/// # References
/// - Used for RFC 5322 Section 2.1 header/body splitting and
///   RFC 2046 Section 5.1.1 boundary detection.
pub(super) fn find_subsequence(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    if needle.is_empty() {
        return Some(0);
    }
    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
#[path = "wire_tests.rs"]
mod tests;