daaki-message 0.2.0

//! RFC 5322 Section 3.3 date-time parsing.
//!
//! Parses RFC 5322 date-time strings including obsolete formats, two-digit
//! years, named timezone abbreviations, and CFWS-laden input.
//!
//! # References
//! - RFC 5322 Section 3.3 (date and time specification)
//! - RFC 5322 Section 4.3 (obsolete date syntax)

use super::{address, get_header_value};

use crate::types::DateTime;

/// Reads the `Date` header and parses its value as an RFC 5322 date-time.
///
/// Returns `None` when `get_header_value` yields nothing for `"date"` or
/// when [`parse_rfc5322_date`] rejects the value.
///
/// # References
/// - RFC 5322 Section 3.6.1 (origination date field)
pub(crate) fn extract_date(headers: &[(String, String)]) -> Option<DateTime> {
    let raw = get_header_value(headers, "date")?;
    parse_rfc5322_date(&raw)
}

/// Parses an RFC 5322 date-time string.
///
/// Accepts: `[day-of-week ","] day month year hour ":" minute [":" second] zone`
///
/// Strips CFWS (comments and folding white space) before parsing, as allowed
/// by the obsolete date syntax (RFC 5322 Section 4.3).
///
/// Returns `None` when fewer than four whitespace-separated tokens remain
/// after the optional day-of-week, when the day, month, year, hour, or
/// minute cannot be parsed, when a non-empty seconds value does not fit the
/// valid range, or when any field is out of range (day 1-31, hour 0-23,
/// minute 0-59, second 0-60). A missing or unrecognized zone yields an
/// offset of 0 minutes.
///
/// # References
/// - RFC 5322 Section 3.3 (date and time specification)
/// - RFC 5322 Section 4.3 (obsolete syntax — CFWS between tokens)
pub(crate) fn parse_rfc5322_date(input: &str) -> Option<DateTime> {
    let input = address::strip_comments(input);
    // Normalize folding white space: CRLF followed by SP or HTAB is replaced
    // with a single space (RFC 5322 Section 2.2.3). This ensures tokens are
    // not split by line folding before we tokenize on whitespace.
    let input = input.replace("\r\n ", " ").replace("\r\n\t", " ");
    let input = input.trim();

    // Skip optional day-of-week (RFC 5322 Section 3.3):
    //   day-of-week = ([FWS] day-name) / obs-day-of-week
    //   day-name    = "Mon" / "Tue" / "Wed" / "Thu" / "Fri" / "Sat" / "Sun"
    // Only skip past the first comma if the text before it is a recognized
    // day-name.  A comma elsewhere (e.g., after the month in non-conformant
    // dates like "13 Feb, 2025 …") must not be consumed here.
    //
    // RFC 5322 Section 3.3 says a present day-of-week MUST match the date,
    // but that is a generation requirement. Per Postel's law (RFC 1122
    // Section 1.2.2) the weekday is treated as purely advisory: it is
    // recognized so it can be skipped, and its value is intentionally not
    // checked against the numeric date fields.
    let input = match input.find(',') {
        Some(comma_pos) if parse_day_name(input[..comma_pos].trim()).is_some() => {
            input[comma_pos + 1..].trim()
        }
        _ => input,
    };

    let parts: Vec<&str> = input.split_whitespace().collect();
    if parts.len() < 4 {
        return None;
    }

    let day: u8 = parts[0].parse().ok()?;
    // RFC 5322 Section 3.3 / Postel's law: non-conformant mailers sometimes
    // append punctuation to the month abbreviation (e.g., "Feb," or "Feb.").
    // Strip trailing non-alphabetic characters so the three-letter name matches.
    let month_token = parts[1].trim_end_matches(|c: char| !c.is_ascii_alphabetic());
    let month = parse_month_name(month_token)?;
    let year: u16 = parse_year(parts[2])?;

    // RFC 5322 Section 4.3 (obsolete date/time syntax) allows CFWS around
    // the colons in the time-of-day:
    //   obs-hour   = [CFWS] 2DIGIT [CFWS]
    //   obs-minute = [CFWS] 2DIGIT [CFWS]
    //   obs-second = [CFWS] 2DIGIT [CFWS]
    //
    // When whitespace surrounds the colons, `split_whitespace()` scatters
    // the time across multiple tokens (e.g., `"09 : 55 : 06"` becomes
    // `["09", ":", "55", ":", "06"]`). Reconstruct the time string by
    // joining consecutive tokens composed solely of ASCII digits, colons,
    // or periods (for non-standard fractional seconds), then locate the
    // timezone token immediately after the time span.
    let mut time_string = String::new();
    let mut tz_index = parts.len(); // fallback: no timezone found
    for (i, &part) in parts.iter().enumerate().skip(3) {
        if part
            .bytes()
            .all(|b| b.is_ascii_digit() || b == b':' || b == b'.')
        {
            time_string.push_str(part);
        } else {
            // First non-time token — this is the timezone (or end of input).
            tz_index = i;
            break;
        }
    }

    let time_parts: Vec<&str> = time_string.split(':').collect();
    if time_parts.len() < 2 {
        return None;
    }

    let hour: u8 = time_parts[0].parse().ok()?;
    let minute: u8 = time_parts[1].parse().ok()?;
    let second: u8 = match time_parts.get(2) {
        // Seconds are optional (RFC 5322 Section 3.3).
        None => 0,
        Some(&field) => {
            // RFC 5322 Section 3.3 does not define fractional seconds, but
            // non-conformant mailers may include them (e.g., "45.123").
            // Per Postel's law, strip the fractional part and parse the
            // integer portion.
            let int_part = field.split('.').next().unwrap_or(field);
            if int_part.is_empty() {
                // Trailing colon or bare fraction: no integer seconds given.
                0
            } else {
                // `int_part` holds only ASCII digits here, so a parse failure
                // means the value overflows `u8`. Reject it instead of
                // silently treating it as 0, which would let values such as
                // ":300" bypass the range check below.
                int_part.parse().ok()?
            }
        }
    };

    // Validate field ranges per RFC 5322 Section 3.3:
    //   day   = 1*2DIGIT (accept 1-31 regardless of month; Postel's law —
    //           calendar-invalid but syntactically valid dates like "31 Feb"
    //           appear in real-world email and must not be rejected)
    //   hour  = 0-23, minute = 0-59, second = 0-60 (60 = leap second)
    if day == 0 || day > 31 || hour > 23 || minute > 59 || second > 60 {
        return None;
    }

    let tz_offset_minutes = parts.get(tz_index).map_or(0, |tz| parse_timezone(tz));

    Some(DateTime {
        year,
        month,
        day,
        hour,
        minute,
        second,
        tz_offset_minutes,
    })
}

/// Maps a three-letter day-of-week abbreviation to its index
/// (RFC 5322 Section 3.3).
///
/// Matching ignores ASCII case. The index is 0 for Sunday through 6 for
/// Saturday so the result matches [`DateTime::weekday`]. Any other input
/// yields `None`.
///
/// # References
/// - RFC 5322 Section 3.3 (day-name)
fn parse_day_name(s: &str) -> Option<u8> {
    // Ordered so that each name's position is its weekday index.
    const DAY_NAMES: [&str; 7] = ["sun", "mon", "tue", "wed", "thu", "fri", "sat"];

    DAY_NAMES
        .iter()
        .zip(0u8..)
        .find(|(name, _)| name.eq_ignore_ascii_case(s))
        .map(|(_, index)| index)
}

/// Maps a three-letter month abbreviation to its month number
/// (RFC 5322 Section 3.3).
///
/// Matching ignores ASCII case. Returns 1 for January through 12 for
/// December, and `None` for anything else.
///
/// # References
/// - RFC 5322 Section 3.3 (month names)
fn parse_month_name(s: &str) -> Option<u8> {
    // Calendar order: position + 1 is the month number.
    const MONTH_NAMES: [&str; 12] = [
        "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec",
    ];

    MONTH_NAMES
        .iter()
        .zip(1u8..)
        .find(|(name, _)| name.eq_ignore_ascii_case(s))
        .map(|(_, number)| number)
}

/// Parses a year, handling 2-digit obsolete years (RFC 5322 Section 4.3).
///
/// The token must consist solely of ASCII digits. Its length selects the
/// interpretation:
/// - 1 or 2 digits: 00-49 map to 2000-2049, 50-99 map to 1950-1999
/// - 3 digits: 1900 is added
/// - 4 or more digits: taken at face value
///
/// Returns `None` for empty input, for input containing any non-digit
/// character (including a leading sign), and for values that do not fit
/// in a `u16`.
///
/// # References
/// - RFC 5322 Section 4.3 (obsolete year syntax)
pub(crate) fn parse_year(s: &str) -> Option<u16> {
    // Integer `FromStr` accepts a leading '+', which would also inflate the
    // digit count used below (e.g. "+99" would be read as a 3-digit year).
    // Require pure digits so the length really is the number of digits.
    if !s.bytes().all(|b| b.is_ascii_digit()) {
        return None;
    }
    let y: u16 = s.parse().ok()?;
    // RFC 5322 Section 4.3: obs-year rules apply to 2-digit and 3-digit
    // year strings from legacy messages. A 4-digit string (e.g., "0050")
    // is taken at face value to allow round-tripping through
    // to_rfc5322_string() which always emits 4-digit years.
    let digit_count = s.len();
    if digit_count <= 2 {
        // 2-digit year: 00-49 → +2000, 50-99 → +1900
        Some(if y >= 50 { 1900 + y } else { 2000 + y })
    } else if digit_count == 3 {
        // 3-digit year: add 1900
        Some(1900 + y)
    } else {
        Some(y)
    }
}

/// Parses a timezone offset: `+HHMM`, `-HHMM`, or named zones (RFC 5322 Section 4.3).
///
/// Surrounding whitespace is trimmed first. The result is the offset from
/// UTC in minutes (negative for zones west of Greenwich). Named zones are
/// matched ignoring ASCII case. Anything that is neither a valid numeric
/// offset nor a recognized name — including malformed numeric tokens and
/// the single-letter military zones — yields 0.
///
/// # References
/// - RFC 5322 Section 3.3 (zone)
/// - RFC 5322 Section 4.3 (obsolete zone names)
pub(crate) fn parse_timezone(s: &str) -> i16 {
    let s = s.trim();
    // Numeric offset: +HHMM or -HHMM
    // RFC 5322 Section 3.3: numeric zones are exactly sign + 4DIGIT. Match on
    // the raw bytes so that:
    //   * overlong tokens such as `+12345` are not partially consumed,
    //   * multi-byte UTF-8 input (e.g., replacement chars from lossy
    //     conversion of non-ASCII header bytes) can never cause a slice on a
    //     non-char boundary, and
    //   * every position after the sign is verified to be an ASCII digit.
    //     Integer `FromStr` accepts embedded signs, so parsing the "HH" and
    //     "MM" slices directly would turn `+-1-1` into -61 minutes.
    if let &[sign, h1, h2, m1, m2] = s.as_bytes() {
        let all_digits = [h1, h2, m1, m2].iter().all(u8::is_ascii_digit);
        if (sign == b'+' || sign == b'-') && all_digits {
            let hours = i16::from(h1 - b'0') * 10 + i16::from(h2 - b'0');
            let minutes = i16::from(m1 - b'0') * 10 + i16::from(m2 - b'0');
            // RFC 5322 Section 3.3: zone = ±HHMM. Reject semantically
            // invalid offsets (hours > 23 or minutes > 59) to prevent
            // to_rfc5322_string from producing a >4-digit zone field.
            // Unknown/invalid zones map to +0000 per RFC 5322 Section 4.3.
            if hours <= 23 && minutes <= 59 {
                let magnitude = hours * 60 + minutes;
                return if sign == b'-' { -magnitude } else { magnitude };
            }
        }
    }
    // Named zones (RFC 5322 Section 4.3 / obsolete).
    // Arms for UT/GMT/UTC, Z, and the catch-all all return 0 but have
    // distinct RFC semantics (+0000 vs -0000), so identical bodies are
    // intentional.
    #[allow(clippy::match_same_arms)]
    match s.to_ascii_uppercase().as_str() {
        // RFC 5322 Section 4.3: "'UT' and 'GMT' are indications of
        // 'Universal Time' and 'Greenwich Mean Time', respectively,
        // and are both semantically identical to '+0000'."
        // UTC is not listed in RFC 5322 but is universally understood
        // as equivalent to +0000.
        "UT" | "GMT" | "UTC" => 0,
        // RFC 5322 Section 4.3: Z is defined as +0000 (UTC) in the
        // military zone table. Unlike the other single-letter zones,
        // Z is universally understood and unambiguous.
        "Z" => 0,
        // Named US civil zones — these have well-understood meanings.
        "EDT" => -240,
        "EST" | "CDT" => -300,
        "CST" | "MDT" => -360,
        "MST" | "PDT" => -420,
        "PST" => -480,
        // RFC 5322 Section 4.3: single-letter military time zones (A–I,
        // K–Y) were defined in an earlier version of this specification
        // but "have been used incorrectly" — their meanings are
        // unreliable. The RFC says they "SHOULD all be considered
        // equivalent to '-0000' unless there is out-of-band information
        // confirming their meaning."
        //
        // Our i16 return type cannot distinguish +0000 from -0000, so
        // we map them to 0 just like the well-defined zones above,
        // even though semantically -0000 means "unknown local time"
        // whereas +0000 means "known to be UTC."
        _ => 0,
    }
}