use scraper::node::Node;
use scraper::ElementRef;
/// Reports whether `el` looks like the start of quoted or forwarded content.
///
/// An element counts as a boundary when any of the following holds:
/// - its `type` attribute is exactly `cite`;
/// - one of its whitespace-separated `class` tokens is accepted by
///   `is_reply_class`;
/// - its `id` is accepted by `is_reply_id`;
/// - it is a `div` whose children are an attribution line followed by a
///   `blockquote` (see `has_attribution_then_quote`).
pub fn is_reply_boundary(el: ElementRef) -> bool {
    let node = el.value();

    node.attr("type") == Some("cite")
        || node.attr("class").is_some_and(is_reply_class)
        || node.attr("id").is_some_and(is_reply_id)
        // Structural check last: it walks the children, the others are attribute lookups.
        || (node.name() == "div" && has_attribution_then_quote(el))
}
pub fn find_attribution(el: ElementRef) -> Option<String> {
for child in el.children() {
match child.value() {
Node::Text(text) => {
let trimmed = text.text.trim();
if is_attribution_text(trimmed) {
return Some(trimmed.to_string());
}
}
Node::Element(_) => {
break;
}
_ => {}
}
}
if let Some(prev) = previous_sibling_text(el) {
let trimmed = prev.trim().to_string();
if is_attribution_text(&trimmed) {
return Some(trimmed);
}
}
None
}
/// Reports whether `text` reads like a reply/forward attribution line.
///
/// Surrounding whitespace is ignored. The text is accepted when it:
/// - contains one of the forward/original-message banners
///   (`Forwarded message`, `Begin forwarded message`, `Original Message`);
/// - starts with `On ` and ends with `wrote:` (English);
/// - starts with `Le ` or `El ` and ends with `crit` or `escribió` followed by
///   a colon (French "a écrit :", Spanish "escribió:");
/// - starts with `Am ` and ends with `schrieb` followed by a colon (German).
///
/// For the French/Spanish/German forms any amount of Unicode whitespace may
/// sit between the verb and the final colon, so the no-break spaces that
/// French typography puts before `:` are handled as well as a plain space.
fn is_attribution_text(text: &str) -> bool {
    const BANNERS: [&str; 3] = [
        "Forwarded message",
        "Begin forwarded message",
        "Original Message",
    ];

    let t = text.trim();

    if BANNERS.iter().any(|banner| t.contains(*banner)) {
        return true;
    }
    if t.starts_with("On ") && t.ends_with("wrote:") {
        return true;
    }

    // Every remaining form ends in a colon, optionally preceded by whitespace
    // (ASCII space, U+00A0, U+202F, ...). `trim_end` strips all of those.
    let before_colon = match t.strip_suffix(':') {
        Some(rest) => rest.trim_end(),
        None => return false,
    };
    let ends_with_any =
        |verbs: &[&str]| verbs.iter().any(|verb| before_colon.ends_with(*verb));

    if (t.starts_with("Le ") || t.starts_with("El ")) && ends_with_any(&["crit", "escribió"]) {
        return true;
    }
    t.starts_with("Am ") && ends_with_any(&["schrieb"])
}
/// Reports whether a `class` attribute value contains a known quote-wrapper
/// class.
///
/// The value is split on whitespace and each token is compared exactly
/// (case-sensitively) against the list below.
fn is_reply_class(class: &str) -> bool {
    const REPLY_CLASSES: [&str; 8] = [
        "gmail_quote",
        "gmail_extra",
        "yahoo_quoted",
        "protonmail_quote",
        "tutanota_quote",
        "moz-cite-prefix",
        "zmail_extra",
        "WordSection1",
    ];

    class
        .split_whitespace()
        .any(|token| REPLY_CLASSES.contains(&token))
}
/// Reports whether an `id` attribute value is one of the known reply-section
/// identifiers. The comparison is exact and case-sensitive.
fn is_reply_id(id: &str) -> bool {
    const REPLY_IDS: [&str; 3] = ["divRplyFwdMsg", "reply-message", "OLK_SRC_BODY_SECTION"];

    REPLY_IDS.contains(&id)
}
/// Reports whether the direct children of `el` consist of an attribution text
/// node followed later by a `blockquote` element.
///
/// Children are scanned in document order:
/// - a text node accepted by `is_attribution_text` arms the search;
/// - once armed, the first `blockquote` element yields `true`, and any other
///   element is skipped;
/// - before being armed, `br` elements are skipped and any other element
///   yields `false`.
///
/// Running out of children yields `false`.
fn has_attribution_then_quote(el: ElementRef) -> bool {
    let mut seen_attribution = false;

    for node in el.children().map(|child| child.value()) {
        match node {
            // The flag is sticky: later non-matching text does not clear it.
            Node::Text(text) => seen_attribution |= is_attribution_text(text.text.trim()),
            Node::Element(tag) => match (seen_attribution, tag.name()) {
                (true, "blockquote") => return true,
                (false, name) if name != "br" => return false,
                _ => {}
            },
            _ => {}
        }
    }

    false
}
/// Returns the text carried by the node that precedes `el` among its siblings.
///
/// Whitespace-only text nodes are skipped while walking backwards. Such nodes
/// are what source formatting (newlines/indentation between tags) leaves
/// between an attribution element and the quote that follows it, and they
/// would otherwise hide the real preceding node.
///
/// For the first remaining sibling:
/// - a text node yields its raw (untrimmed) text;
/// - a `span`, `font`, `b`, `i`, `div` or `p` element yields the
///   concatenation of its descendant text, unless that is blank;
/// - anything else (other elements, comments, ...) yields `None`.
///
/// `None` is also returned when `el` has no such preceding sibling.
fn previous_sibling_text(el: ElementRef) -> Option<String> {
    let mut cursor = el.prev_sibling();

    while let Some(node) = cursor {
        match node.value() {
            // Formatting whitespace between tags: keep walking backwards.
            Node::Text(text) if text.text.trim().is_empty() => {
                cursor = node.prev_sibling();
            }
            Node::Text(text) => return Some(text.text.to_string()),
            Node::Element(e) => {
                if !matches!(e.name(), "span" | "font" | "b" | "i" | "div" | "p") {
                    return None;
                }
                let text: String = ElementRef::wrap(node)?.text().collect();
                return (!text.trim().is_empty()).then_some(text);
            }
            _ => return None,
        }
    }

    None
}
/// Reports whether `el` looks like an Outlook-style quoted-header block.
///
/// Only `div` and `p` elements qualify. The element's concatenated descendant
/// text must contain `From:` together with at least one of `Sent:`, `Date:`
/// or `Subject:`. Matching is case-sensitive substring search.
pub fn is_outlook_separator(el: ElementRef) -> bool {
    if !matches!(el.value().name(), "div" | "p") {
        return false;
    }

    let text: String = el.text().collect();
    let mentions = |label: &str| text.contains(label);

    mentions("From:") && (mentions("Sent:") || mentions("Date:") || mentions("Subject:"))
}
#[cfg(test)]
mod tests {
    use super::*;
    use scraper::{Html, Selector};

    /// Parses `html` as a document and applies `probe` to the first element
    /// matching the CSS `selector`.
    ///
    /// Panics (failing the calling test) when the selector is invalid or
    /// matches nothing.
    fn probe_first<R>(html: &str, selector: &str, probe: impl FnOnce(ElementRef<'_>) -> R) -> R {
        let document = Html::parse_document(html);
        let matcher = Selector::parse(selector).unwrap();
        let first = document.select(&matcher).next().unwrap();
        probe(first)
    }

    // --- is_attribution_text: English and banner forms ---

    #[test]
    fn attribution_on_wrote() {
        let line = "On Mon, Jan 5, 2026 at 3:00 PM Alice <alice@example.com> wrote:";
        assert!(is_attribution_text(line));
    }

    #[test]
    fn attribution_forwarded() {
        let line = "---------- Forwarded message ----------";
        assert!(is_attribution_text(line));
    }

    #[test]
    fn attribution_original_message() {
        let line = "-----Original Message-----";
        assert!(is_attribution_text(line));
    }

    #[test]
    fn attribution_begin_forwarded() {
        let line = "Begin forwarded message:";
        assert!(is_attribution_text(line));
    }

    #[test]
    fn not_attribution() {
        for line in ["Hello, how are you?", "On the other hand, this is fine."] {
            assert!(!is_attribution_text(line));
        }
    }

    // --- is_reply_class ---

    #[test]
    fn gmail_quote_class() {
        let class = "gmail_quote";
        assert!(is_reply_class(class));
    }

    #[test]
    fn multiple_classes_with_reply() {
        let class = "some-class gmail_quote another";
        assert!(is_reply_class(class));
    }

    #[test]
    fn non_reply_class() {
        let class = "regular-div content-wrapper";
        assert!(!is_reply_class(class));
    }

    // --- is_reply_boundary: attribute-based detection ---

    #[test]
    fn type_cite_is_boundary() {
        let html = r#"<div type="cite"><p>quoted</p></div>"#;
        assert!(probe_first(html, r#"div[type="cite"]"#, is_reply_boundary));
    }

    #[test]
    fn gmail_quote_is_boundary() {
        let html = r#"<div class="gmail_quote"><p>quoted</p></div>"#;
        assert!(probe_first(html, "div.gmail_quote", is_reply_boundary));
    }

    #[test]
    fn outlook_id_is_boundary() {
        let html = r#"<div id="divRplyFwdMsg"><p>quoted</p></div>"#;
        assert!(probe_first(html, "#divRplyFwdMsg", is_reply_boundary));
    }

    #[test]
    fn plain_div_not_boundary() {
        let html = r#"<div class="content"><p>not quoted</p></div>"#;
        assert!(!probe_first(html, "div.content", is_reply_boundary));
    }

    // --- is_outlook_separator: basic cases ---

    #[test]
    fn outlook_from_sent_subject() {
        let html = "<div>From: Alice\nSent: Monday\nTo: Bob\nSubject: Hello</div>";
        assert!(probe_first(html, "div", is_outlook_separator));
    }

    #[test]
    fn regular_div_not_separator() {
        let html = "<div>Just a normal paragraph.</div>";
        assert!(!probe_first(html, "div", is_outlook_separator));
    }

    // --- is_attribution_text: per-language prefix/suffix pairing ---

    #[test]
    fn attribution_on_without_wrote_is_false() {
        let line = "On the bright side, this is fine.";
        assert!(!is_attribution_text(line));
    }

    #[test]
    fn attribution_wrote_without_on_is_false() {
        let line = "Alice wrote:";
        assert!(!is_attribution_text(line));
    }

    #[test]
    fn attribution_french_le_with_colon_space() {
        let line = "Le lundi 5 janvier 2026, Alice a écrit :";
        assert!(is_attribution_text(line));
    }

    #[test]
    fn attribution_french_le_no_space_before_colon() {
        let line = "Le lundi, Alice a écrit:";
        assert!(is_attribution_text(line));
    }

    #[test]
    fn attribution_spanish_el_with_colon_space() {
        let line = "El lunes 5 de enero, Alice a escrit :";
        assert!(is_attribution_text(line));
    }

    #[test]
    fn attribution_spanish_el_no_space_before_colon() {
        let line = "El lunes, Alice a escrit:";
        assert!(is_attribution_text(line));
    }

    #[test]
    fn attribution_french_le_without_wrote_ending_is_false() {
        let line = "Le lundi, Alice est ici.";
        assert!(!is_attribution_text(line));
    }

    #[test]
    fn attribution_starts_with_le_but_not_french_pattern() {
        let line = "Le sigh.";
        assert!(!is_attribution_text(line));
    }

    #[test]
    fn attribution_german_am_with_colon() {
        let line = "Am Montag, 5. Januar 2026, schrieb:";
        assert!(is_attribution_text(line));
    }

    #[test]
    fn attribution_german_am_with_space_colon() {
        let line = "Am Montag schrieb :";
        assert!(is_attribution_text(line));
    }

    #[test]
    fn attribution_german_am_without_schrieb_is_false() {
        let line = "Am very fine, thanks.";
        assert!(!is_attribution_text(line));
    }

    #[test]
    fn attribution_german_schrieb_without_am_is_false() {
        let line = "Bob schrieb:";
        assert!(!is_attribution_text(line));
    }

    #[test]
    fn attribution_begin_forwarded_only() {
        let line = "Begin forwarded message";
        assert!(is_attribution_text(line));
    }

    #[test]
    fn attribution_original_message_only() {
        let line = "-----Original Message-----";
        assert!(is_attribution_text(line));
    }

    // --- is_reply_id ---

    #[test]
    fn reply_id_reply_message() {
        let id = "reply-message";
        assert!(is_reply_id(id));
    }

    #[test]
    fn reply_id_olk_src_body_section() {
        let id = "OLK_SRC_BODY_SECTION";
        assert!(is_reply_id(id));
    }

    #[test]
    fn reply_id_unknown_is_false() {
        for id in ["main-content", "", "reply"] {
            assert!(!is_reply_id(id));
        }
    }

    // --- find_attribution: leading text of the element itself ---

    #[test]
    fn find_attribution_in_leading_text() {
        let html = r#"<div>On Mon, Alice wrote:<blockquote>quoted</blockquote></div>"#;
        let attribution = probe_first(html, "div", find_attribution);
        assert!(attribution.is_some());
        assert!(attribution.unwrap().contains("wrote:"));
    }

    #[test]
    fn find_attribution_none_when_no_match() {
        let html = r#"<div>Just regular text here, nothing fancy.</div>"#;
        assert!(probe_first(html, "div", find_attribution).is_none());
    }

    #[test]
    fn find_attribution_stops_at_first_element_child() {
        let html = r#"<div><span>hi</span>On Mon, Alice wrote:</div>"#;
        assert!(probe_first(html, "div", find_attribution).is_none());
    }

    #[test]
    fn find_attribution_in_preceding_sibling() {
        let html = r#"<div><p>On Mon, Alice wrote:</p><div class="quote">body</div></div>"#;
        let attribution = probe_first(html, "div.quote", find_attribution);
        assert!(attribution.is_some(), "expected attribution from preceding <p>");
    }

    // --- is_reply_boundary: attribution followed by blockquote ---

    #[test]
    fn boundary_div_with_attribution_then_blockquote() {
        let html = r#"<div>On Mon, Alice wrote:<blockquote>quoted</blockquote></div>"#;
        assert!(probe_first(html, "div", is_reply_boundary));
    }

    #[test]
    fn boundary_div_blockquote_without_attribution_is_false() {
        let html = r#"<div><blockquote>quoted</blockquote></div>"#;
        assert!(!probe_first(html, "div", is_reply_boundary));
    }

    #[test]
    fn boundary_div_attribution_no_blockquote_is_false() {
        let html = r#"<div>On Mon, Alice wrote:<p>not a quote</p></div>"#;
        assert!(!probe_first(html, "div", is_reply_boundary));
    }

    #[test]
    fn boundary_div_attribution_br_blockquote() {
        let html = r#"<div>On Mon, Alice wrote:<br><blockquote>quoted</blockquote></div>"#;
        assert!(probe_first(html, "div", is_reply_boundary));
    }

    #[test]
    fn boundary_div_non_br_element_before_attribution_is_false() {
        let html = r#"<div><p>preface</p>On Mon, Alice wrote:<blockquote>q</blockquote></div>"#;
        assert!(!probe_first(html, "div", is_reply_boundary));
    }

    // --- find_attribution: text taken from the preceding sibling ---

    #[test]
    fn prev_sibling_text_node() {
        let html = r#"<div>On Mon, Alice wrote:<div class="q">body</div></div>"#;
        assert!(probe_first(html, "div.q", find_attribution).is_some());
    }

    #[test]
    fn prev_sibling_inline_span_with_attribution() {
        let html = r#"<div><span>On Mon, Alice wrote:</span><div class="q">body</div></div>"#;
        assert!(probe_first(html, "div.q", find_attribution).is_some());
    }

    #[test]
    fn prev_sibling_inline_font_with_attribution() {
        let html = r#"<div><font>On Mon, Alice wrote:</font><div class="q">body</div></div>"#;
        assert!(probe_first(html, "div.q", find_attribution).is_some());
    }

    #[test]
    fn prev_sibling_non_inline_element_returns_none() {
        let html = r#"<div><table><tr><td>On Mon, Alice wrote:</td></tr></table><div class="q">body</div></div>"#;
        assert!(probe_first(html, "div.q", find_attribution).is_none());
    }

    #[test]
    fn prev_sibling_empty_inline_returns_none() {
        let html = r#"<div><span> </span><div class="q">body</div></div>"#;
        assert!(probe_first(html, "div.q", find_attribution).is_none());
    }

    // --- is_outlook_separator: header-label combinations ---

    #[test]
    fn outlook_from_date_subject_is_separator() {
        let html = "<div>From: Alice\nDate: Monday\nSubject: Hello</div>";
        assert!(probe_first(html, "div", is_outlook_separator));
    }

    #[test]
    fn outlook_from_sent_no_subject_is_separator() {
        let html = "<div>From: Alice\nSent: Monday</div>";
        assert!(probe_first(html, "div", is_outlook_separator));
    }

    #[test]
    fn outlook_from_subject_no_sent_is_separator() {
        let html = "<div>From: Alice\nSubject: Hello</div>";
        assert!(probe_first(html, "div", is_outlook_separator));
    }

    #[test]
    fn outlook_from_only_is_not_separator() {
        let html = "<div>From: Alice</div>";
        assert!(!probe_first(html, "div", is_outlook_separator));
    }

    #[test]
    fn outlook_sent_subject_no_from_is_not_separator() {
        let html = "<div>Sent: Monday\nSubject: Hello</div>";
        assert!(!probe_first(html, "div", is_outlook_separator));
    }
}