// pter 0.1.0 - Docs.rs

use scraper::node::Node;
use scraper::ElementRef;

/// Decide whether `el` opens a quoted-reply region.
///
/// Every piece of client-specific reply knowledge is funnelled through this
/// one predicate so the converter needs no per-client branches of its own.
/// An element that satisfies it wraps content quoted from an earlier message
/// in the thread, and the converter renders it like a `<blockquote>`
/// (children prefixed with `>`).
///
/// Plain `<blockquote>` elements are picked up by the element classifier;
/// this predicate exists for the other kinds of wrapper. The checks run in
/// this order and stop at the first hit:
///
/// 1. a `type="cite"` attribute (Apple Mail, some webmail);
/// 2. a known per-client class name (`is_reply_class`);
/// 3. a known per-client id (`is_reply_id`);
/// 4. a `<div>` whose leading text is an attribution line followed by a
///    `<blockquote>` (`has_attribution_then_quote`).
pub fn is_reply_boundary(el: ElementRef) -> bool {
    let node = el.value();

    node.attr("type") == Some("cite")
        || node.attr("class").is_some_and(is_reply_class)
        || node.attr("id").is_some_and(is_reply_id)
        // Structural heuristic is the most expensive check, so it goes last
        // and is only attempted on <div> wrappers.
        || (node.name() == "div" && has_attribution_then_quote(el))
}

/// Locate the attribution line that belongs to a reply boundary.
///
/// Two places are searched, in order:
///
/// 1. the text nodes of `el` that come before its first child element;
/// 2. the node immediately preceding `el` (via `previous_sibling_text`).
///
/// Returns the trimmed attribution (e.g. "On Mon, Jan 5, Alice wrote:") so
/// the converter can print it above the quoted block, or `None` when neither
/// location holds text accepted by `is_attribution_text`.
pub fn find_attribution(el: ElementRef) -> Option<String> {
    // Leading text only: scanning ends at the first child element.
    let leading = el
        .children()
        .take_while(|node| !matches!(node.value(), Node::Element(_)))
        .find_map(|node| match node.value() {
            Node::Text(chunk) => {
                let candidate = chunk.text.trim();
                is_attribution_text(candidate).then(|| candidate.to_string())
            }
            _ => None,
        });

    // Fall back to whatever sits directly in front of the element.
    leading.or_else(|| {
        previous_sibling_text(el)
            .map(|raw| raw.trim().to_string())
            .filter(|candidate| is_attribution_text(candidate))
    })
}

/// Check whether `text` looks like an email attribution or forward header.
///
/// Surrounding whitespace is ignored. The recognised shapes are:
///
/// * **English** — starts with `"On "` and ends with `"wrote:"`
///   (Gmail, Apple Mail, Thunderbird, most clients).
/// * **French** — starts with `"Le "` and ends with `"crit:"`
///   ("… a écrit :").
/// * **Spanish** — starts with `"El "` and ends with `"escribió:"`.
///   `"El "` combined with a `"crit:"` ending is also still accepted, as it
///   always has been.
/// * **German** — starts with `"Am "`, ends with `:`, and has `schrieb`
///   either directly before the colon or as a word followed by the sender
///   ("Am … schrieb Alice <a@example.com>:").
/// * **Forward separators** — contains `"Forwarded message"`,
///   `"Begin forwarded message"` or `"Original Message"` anywhere.
///
/// For the localized forms any run of whitespace may sit between the last
/// word and the final colon; this covers the (often no-break) space that
/// French typography places before `:`.
///
/// Matching is purely prefix/suffix based, so an ordinary sentence that
/// happens to have the same shape is also accepted.
fn is_attribution_text(text: &str) -> bool {
    let t = text.trim();

    // "On ... wrote:"
    if t.starts_with("On ") && t.ends_with("wrote:") {
        return true;
    }

    // Every localized variant ends in a colon. Strip it together with any
    // whitespace in front of it so "écrit:", "écrit :" and "écrit\u{a0}:"
    // are all handled by the same suffix test.
    if let Some(head) = t.strip_suffix(':').map(str::trim_end) {
        let le = t.starts_with("Le ");
        let el = t.starts_with("El ");

        // French "Le ... a écrit :" (the "El " + "crit" pairing is retained
        // for backward compatibility).
        if (le || el) && head.ends_with("crit") {
            return true;
        }
        // Spanish "El ... escribió:"
        if el && head.ends_with("escribió") {
            return true;
        }
        // German: "Am ... schrieb:" or "Am ... schrieb <sender>:"
        if t.starts_with("Am ") && (head.ends_with("schrieb") || head.contains(" schrieb ")) {
            return true;
        }
    }

    // Forwarded message separators
    t.contains("Forwarded message")
        || t.contains("Begin forwarded message")
        || t.contains("Original Message")
}

/// Report whether a `class` attribute value names a known reply wrapper.
///
/// The attribute is split on whitespace and each class name is compared
/// exactly (case-sensitively) against the table below. Supporting a new
/// client is a one-line addition to that table.
fn is_reply_class(class: &str) -> bool {
    const REPLY_CLASSES: &[&str] = &[
        "gmail_quote",
        "gmail_extra",
        "yahoo_quoted",
        "protonmail_quote",
        "tutanota_quote",
        "moz-cite-prefix", // Thunderbird
        "zmail_extra",     // Zoho
        "WordSection1",    // Outlook (sometimes wraps replies)
    ];

    class
        .split_whitespace()
        .any(|token| REPLY_CLASSES.contains(&token))
}

/// Report whether an `id` attribute value is a known reply-wrapper id.
///
/// The whole attribute value must equal one of the table entries exactly
/// (case-sensitive, no trimming).
fn is_reply_id(id: &str) -> bool {
    const REPLY_IDS: &[&str] = &[
        "divRplyFwdMsg",        // Outlook
        "reply-message",        // Generic
        "OLK_SRC_BODY_SECTION", // Outlook Mac
    ];

    REPLY_IDS.contains(&id)
}

/// Detect the unmarked reply shape "attribution text, then a blockquote".
///
/// Targets markup with no identifying class or id, such as
/// `<div>On ... wrote:<br><blockquote>...</blockquote></div>`.
///
/// The direct children of `el` are walked in document order:
///
/// * a text child accepted by `is_attribution_text` arms the detector;
/// * once armed, the first `<blockquote>` child yields `true`, and any
///   other element is stepped over;
/// * before it is armed, `<br>` is stepped over while any other element
///   ends the scan with `false`.
///
/// Running out of children yields `false`.
fn has_attribution_then_quote(el: ElementRef) -> bool {
    let mut armed = false;

    for node in el.children() {
        match node.value() {
            Node::Text(chunk) => armed |= is_attribution_text(chunk.text.trim()),
            Node::Element(tag) => match (armed, tag.name()) {
                (true, "blockquote") => return true,
                // Real content ahead of any attribution: not this pattern.
                (false, other) if other != "br" => return false,
                _ => {}
            },
            _ => {}
        }
    }

    false
}

/// Get the text of the nearest meaningful node in front of `el`.
///
/// Whitespace-only text siblings are skipped first: pretty-printed HTML puts
/// a newline/indent text node between adjacent elements, and stopping on it
/// would hide an attribution element that sits right before the quote.
///
/// For the first remaining sibling:
///
/// * a text node yields its raw (untrimmed) text;
/// * a `span`, `font`, `b`, `i`, `div` or `p` element yields the
///   concatenation of its descendant text, provided that is not
///   whitespace-only;
/// * anything else (other elements, comments, …) yields `None`.
///
/// `None` is also returned when `el` has no such sibling.
fn previous_sibling_text(el: ElementRef) -> Option<String> {
    // Nearest preceding sibling that is not a blank text node.
    let prev = el.prev_siblings().find(|node| match node.value() {
        Node::Text(text) => !text.text.trim().is_empty(),
        _ => true,
    })?;

    match prev.value() {
        Node::Text(text) => Some(text.text.to_string()),
        Node::Element(e) => {
            // Elements that commonly wrap an attribution line.
            if !matches!(e.name(), "span" | "font" | "b" | "i" | "div" | "p") {
                return None;
            }
            let text: String = ElementRef::wrap(prev)?.text().collect();
            (!text.trim().is_empty()).then_some(text)
        }
        _ => None,
    }
}

/// Recognise an Outlook-style reply header block.
///
/// Outlook introduces quoted or forwarded content with a header of the form
/// "From: ... Sent: ... To: ... Subject: ...". Only `<div>` and `<p>`
/// elements are considered; every other element yields `false`.
///
/// The element's full descendant text must contain `"From:"` together with
/// at least one of `"Sent:"`, `"Date:"` or `"Subject:"`. The labels are
/// matched as plain case-sensitive substrings anywhere in that text.
pub fn is_outlook_separator(el: ElementRef) -> bool {
    if !matches!(el.value().name(), "div" | "p") {
        return false;
    }

    let body = el.text().collect::<String>();
    let mentions = |label: &str| body.contains(label);

    mentions("From:") && (mentions("Sent:") || mentions("Date:") || mentions("Subject:"))
}

// Unit tests for the reply-detection predicates in this file. Private
// helpers that take an `ElementRef` are exercised through the public
// functions that call them. Many cases exist to kill specific mutants
// (operator swaps, deleted match arms); the comments name the construct
// each case guards.
#[cfg(test)]
mod tests {
    use super::*;
    use scraper::{Html, Selector};

    /// Parse `html` as a full document and compile `selector`.
    ///
    /// Panics (via `unwrap`) if `selector` is not a valid CSS selector.
    fn parse_and_select(html: &str, selector: &str) -> (Html, Selector) {
        let doc = Html::parse_document(html);
        let sel = Selector::parse(selector).unwrap();
        (doc, sel)
    }

    // -- Attribution detection --

    #[test]
    fn attribution_on_wrote() {
        assert!(is_attribution_text("On Mon, Jan 5, 2026 at 3:00 PM Alice <alice@example.com> wrote:"));
    }

    #[test]
    fn attribution_forwarded() {
        assert!(is_attribution_text("---------- Forwarded message ----------"));
    }

    #[test]
    fn attribution_original_message() {
        assert!(is_attribution_text("-----Original Message-----"));
    }

    #[test]
    fn attribution_begin_forwarded() {
        assert!(is_attribution_text("Begin forwarded message:"));
    }

    #[test]
    fn not_attribution() {
        assert!(!is_attribution_text("Hello, how are you?"));
        assert!(!is_attribution_text("On the other hand, this is fine."));
    }

    // -- Reply class detection --

    #[test]
    fn gmail_quote_class() {
        assert!(is_reply_class("gmail_quote"));
    }

    #[test]
    fn multiple_classes_with_reply() {
        assert!(is_reply_class("some-class gmail_quote another"));
    }

    #[test]
    fn non_reply_class() {
        assert!(!is_reply_class("regular-div content-wrapper"));
    }

    // -- Reply boundary detection --

    #[test]
    fn type_cite_is_boundary() {
        let html = r#"<div type="cite"><p>quoted</p></div>"#;
        let (doc, sel) = parse_and_select(html, r#"div[type="cite"]"#);
        let el = doc.select(&sel).next().unwrap();
        assert!(is_reply_boundary(el));
    }

    #[test]
    fn gmail_quote_is_boundary() {
        let html = r#"<div class="gmail_quote"><p>quoted</p></div>"#;
        let (doc, sel) = parse_and_select(html, "div.gmail_quote");
        let el = doc.select(&sel).next().unwrap();
        assert!(is_reply_boundary(el));
    }

    #[test]
    fn outlook_id_is_boundary() {
        let html = r#"<div id="divRplyFwdMsg"><p>quoted</p></div>"#;
        let (doc, sel) = parse_and_select(html, "#divRplyFwdMsg");
        let el = doc.select(&sel).next().unwrap();
        assert!(is_reply_boundary(el));
    }

    #[test]
    fn plain_div_not_boundary() {
        let html = r#"<div class="content"><p>not quoted</p></div>"#;
        let (doc, sel) = parse_and_select(html, "div.content");
        let el = doc.select(&sel).next().unwrap();
        assert!(!is_reply_boundary(el));
    }

    // -- Outlook separator --

    #[test]
    fn outlook_from_sent_subject() {
        let html = "<div>From: Alice\nSent: Monday\nTo: Bob\nSubject: Hello</div>";
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        assert!(is_outlook_separator(el));
    }

    #[test]
    fn regular_div_not_separator() {
        let html = "<div>Just a normal paragraph.</div>";
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        assert!(!is_outlook_separator(el));
    }

    // -- Boundary tests for `is_attribution_text`: each arm needs both sides --

    #[test]
    fn attribution_on_without_wrote_is_false() {
        // "On ..." without "wrote:" — catches the `&&` in the English arm mutating to ||
        assert!(!is_attribution_text("On the bright side, this is fine."));
    }

    #[test]
    fn attribution_wrote_without_on_is_false() {
        // "... wrote:" without leading "On " — catches the `&&` in the English arm mutating to ||
        assert!(!is_attribution_text("Alice wrote:"));
    }

    #[test]
    fn attribution_french_le_with_colon_space() {
        assert!(is_attribution_text("Le lundi 5 janvier 2026, Alice a écrit :"));
    }

    #[test]
    fn attribution_french_le_no_space_before_colon() {
        // "écrit:" without space — covers the `||` between the two accepted
        // endings ("crit :" and "crit:").
        assert!(is_attribution_text("Le lundi, Alice a écrit:"));
    }

    // NOTE: the two "El ..." fixtures below are synthetic rather than real
    // Spanish; they exercise the "El " prefix together with the "crit :" /
    // "crit:" suffix check.
    #[test]
    fn attribution_spanish_el_with_colon_space() {
        assert!(is_attribution_text("El lunes 5 de enero, Alice a escrit :"));
    }

    #[test]
    fn attribution_spanish_el_no_space_before_colon() {
        assert!(is_attribution_text("El lunes, Alice a escrit:"));
    }

    #[test]
    fn attribution_french_le_without_wrote_ending_is_false() {
        // "Le X" without "écrit" — catches the `&&` that joins the prefix and
        // suffix checks mutating to ||.
        assert!(!is_attribution_text("Le lundi, Alice est ici."));
    }

    #[test]
    fn attribution_starts_with_le_but_not_french_pattern() {
        // Text starts with "Le " but isn't the French attribution form.
        assert!(!is_attribution_text("Le sigh."));
    }

    #[test]
    fn attribution_german_am_with_colon() {
        assert!(is_attribution_text("Am Montag, 5. Januar 2026, schrieb:"));
    }

    #[test]
    fn attribution_german_am_with_space_colon() {
        assert!(is_attribution_text("Am Montag schrieb :"));
    }

    #[test]
    fn attribution_german_am_without_schrieb_is_false() {
        // "Am X" without "schrieb" — catches the `&&` in the German arm mutating to ||
        assert!(!is_attribution_text("Am very fine, thanks."));
    }

    #[test]
    fn attribution_german_schrieb_without_am_is_false() {
        // "schrieb:" without leading "Am " — catches the `&&` in the German arm mutating to ||
        assert!(!is_attribution_text("Bob schrieb:"));
    }

    #[test]
    fn attribution_begin_forwarded_only() {
        // Only "Begin forwarded message" present — catches the || chain mutating to &&
        assert!(is_attribution_text("Begin forwarded message"));
    }

    #[test]
    fn attribution_original_message_only() {
        // Only "Original Message" present — catches the || chain mutating to &&
        assert!(is_attribution_text("-----Original Message-----"));
    }

    // -- Boundary tests for `is_reply_id` --

    #[test]
    fn reply_id_reply_message() {
        assert!(is_reply_id("reply-message"));
    }

    #[test]
    fn reply_id_olk_src_body_section() {
        assert!(is_reply_id("OLK_SRC_BODY_SECTION"));
    }

    #[test]
    fn reply_id_unknown_is_false() {
        // Catches `replace is_reply_id -> bool with true` mutant
        assert!(!is_reply_id("main-content"));
        assert!(!is_reply_id(""));
        assert!(!is_reply_id("reply"));
    }

    // -- Boundary tests for `find_attribution` --

    #[test]
    fn find_attribution_in_leading_text() {
        let html = r#"<div>On Mon, Alice wrote:<blockquote>quoted</blockquote></div>"#;
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        let attr = find_attribution(el);
        assert!(attr.is_some());
        assert!(attr.unwrap().contains("wrote:"));
    }

    #[test]
    fn find_attribution_none_when_no_match() {
        let html = r#"<div>Just regular text here, nothing fancy.</div>"#;
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        assert!(find_attribution(el).is_none());
    }

    #[test]
    fn find_attribution_stops_at_first_element_child() {
        // Element first, attribution text second: the leading-text scan must
        // `break` at the <span> and never look at the text that follows it.
        // Catches "delete match arm Node::Element(_)".
        let html = r#"<div><span>hi</span>On Mon, Alice wrote:</div>"#;
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        // No attribution in the leading text, and the selected <div> has no
        // preceding sibling to fall back on, so the result is None.
        assert!(find_attribution(el).is_none());
    }

    #[test]
    fn find_attribution_in_preceding_sibling() {
        let html = r#"<div><p>On Mon, Alice wrote:</p><div class="quote">body</div></div>"#;
        let (doc, sel) = parse_and_select(html, "div.quote");
        let el = doc.select(&sel).next().unwrap();
        let attr = find_attribution(el);
        assert!(attr.is_some(), "expected attribution from preceding <p>");
    }

    // -- Boundary tests for `has_attribution_then_quote` --
    // These exercise the function via `is_reply_boundary` since it's private.

    #[test]
    fn boundary_div_with_attribution_then_blockquote() {
        let html = r#"<div>On Mon, Alice wrote:<blockquote>quoted</blockquote></div>"#;
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        assert!(is_reply_boundary(el));
    }

    #[test]
    fn boundary_div_blockquote_without_attribution_is_false() {
        // A bare blockquote-wrapping div without attribution text is not a boundary.
        // Catches "replace has_attribution_then_quote -> bool with true"; the
        // always-false mutant is caught by the positive case above instead.
        let html = r#"<div><blockquote>quoted</blockquote></div>"#;
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        assert!(!is_reply_boundary(el));
    }

    #[test]
    fn boundary_div_attribution_no_blockquote_is_false() {
        // Attribution text but no blockquote → not a boundary.
        // Catches `e.name() == "blockquote"` mutating to `!=` (which would
        // treat any other element as the quote).
        let html = r#"<div>On Mon, Alice wrote:<p>not a quote</p></div>"#;
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        assert!(!is_reply_boundary(el));
    }

    #[test]
    fn boundary_div_attribution_br_blockquote() {
        // Attribution → <br> → blockquote. The <br> must be skipped.
        // Exercises the `e.name() != "br"` branch with a <br> between the
        // attribution and the quote.
        let html = r#"<div>On Mon, Alice wrote:<br><blockquote>quoted</blockquote></div>"#;
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        assert!(is_reply_boundary(el));
    }

    #[test]
    fn boundary_div_non_br_element_before_attribution_is_false() {
        // Non-br element BEFORE finding attribution → early return false.
        // Catches deletion of the `!` in `!found_attribution`.
        let html = r#"<div><p>preface</p>On Mon, Alice wrote:<blockquote>q</blockquote></div>"#;
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        assert!(!is_reply_boundary(el));
    }

    // -- Boundary tests for `previous_sibling_text` --
    // Exercised via find_attribution since the function is private.

    #[test]
    fn prev_sibling_text_node() {
        // Raw Text node as preceding sibling. Inside a parent <div>, a leading
        // text run followed by a child <div class="q"> means the inner div's
        // `prev_sibling()` is a `Node::Text`. Catches `delete match arm Node::Text(text)`.
        let html = r#"<div>On Mon, Alice wrote:<div class="q">body</div></div>"#;
        let (doc, sel) = parse_and_select(html, "div.q");
        let el = doc.select(&sel).next().unwrap();
        assert!(find_attribution(el).is_some());
    }

    #[test]
    fn prev_sibling_inline_span_with_attribution() {
        let html = r#"<div><span>On Mon, Alice wrote:</span><div class="q">body</div></div>"#;
        let (doc, sel) = parse_and_select(html, "div.q");
        let el = doc.select(&sel).next().unwrap();
        assert!(find_attribution(el).is_some());
    }

    #[test]
    fn prev_sibling_inline_font_with_attribution() {
        // <font> is another accepted wrapper tag; covers a different
        // alternative in the `matches!` tag list.
        let html = r#"<div><font>On Mon, Alice wrote:</font><div class="q">body</div></div>"#;
        let (doc, sel) = parse_and_select(html, "div.q");
        let el = doc.select(&sel).next().unwrap();
        assert!(find_attribution(el).is_some());
    }

    #[test]
    fn prev_sibling_non_inline_element_returns_none() {
        // <table> is not in the accepted tag list → preceding-sibling lookup fails.
        let html = r#"<div><table><tr><td>On Mon, Alice wrote:</td></tr></table><div class="q">body</div></div>"#;
        let (doc, sel) = parse_and_select(html, "div.q");
        let el = doc.select(&sel).next().unwrap();
        assert!(find_attribution(el).is_none());
    }

    #[test]
    fn prev_sibling_empty_inline_returns_none() {
        let html = r#"<div><span>   </span><div class="q">body</div></div>"#;
        let (doc, sel) = parse_and_select(html, "div.q");
        let el = doc.select(&sel).next().unwrap();
        // Whitespace-only preceding span → no attribution match.
        assert!(find_attribution(el).is_none());
    }

    // -- Boundary tests for `is_outlook_separator` --

    #[test]
    fn outlook_from_date_subject_is_separator() {
        // "Date:" instead of "Sent:" → covers the `||` between the two
        // accepted date labels.
        let html = "<div>From: Alice\nDate: Monday\nSubject: Hello</div>";
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        assert!(is_outlook_separator(el));
    }

    #[test]
    fn outlook_from_sent_no_subject_is_separator() {
        // From + Sent, no Subject → catches `has_sent || has_subject` mutating to &&
        let html = "<div>From: Alice\nSent: Monday</div>";
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        assert!(is_outlook_separator(el));
    }

    #[test]
    fn outlook_from_subject_no_sent_is_separator() {
        // From + Subject, no Sent/Date → catches `has_sent || has_subject` mutating to &&
        let html = "<div>From: Alice\nSubject: Hello</div>";
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        assert!(is_outlook_separator(el));
    }

    #[test]
    fn outlook_from_only_is_not_separator() {
        // From alone (no Sent/Date/Subject) → must be false.
        // Catches `has_from && (...)` mutating to ||.
        let html = "<div>From: Alice</div>";
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        assert!(!is_outlook_separator(el));
    }

    #[test]
    fn outlook_sent_subject_no_from_is_not_separator() {
        // No From → must be false regardless of Sent/Subject presence.
        let html = "<div>Sent: Monday\nSubject: Hello</div>";
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        assert!(!is_outlook_separator(el));
    }
}