use once_cell::sync::Lazy;
use regex::Regex;
use scraper::{Element, ElementRef, Html, Selector};
use crate::types::CallToAction;
/// Selects `<script type="application/ld+json">` elements (embedded JSON-LD metadata).
///
/// The `unwrap` here and on the other statics below runs on first use; the patterns are
/// compile-time constants, so a panic would indicate a typo in the literal.
static SEL_JSON_LD: Lazy<Selector> =
Lazy::new(|| Selector::parse(r#"script[type="application/ld+json"]"#).unwrap());
/// Selects every `<a>` element that carries an `href` attribute.
static SEL_LINKS: Lazy<Selector> = Lazy::new(|| Selector::parse("a[href]").unwrap());
/// Case-insensitive substring match for unsubscribe / legal-notice terms (English and German).
///
/// There are no word boundaries, so e.g. `terms` also matches inside a longer word or URL path.
static RE_BLACKLIST: Lazy<Regex> = Lazy::new(|| {
Regex::new(r"(?i)unsubscribe|datenschutz|impressum|privacy|terms|abmelden").unwrap()
});
/// Matches an `http(s)` URL that has no `/` after the host other than one optional trailing slash.
///
/// NOTE(review): `(www\.)?` is redundant because `[^/]+` already accepts a `www.` prefix, and
/// `[^/]+` also accepts a query or fragment glued directly to the host (`https://host?x=1`).
static RE_BARE_HOMEPAGE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?i)^https?://(www\.)?[^/]+/?$").unwrap());
/// Matches `btn`, `button` or `cta` as a whole word, case-insensitively.
///
/// `-` counts as a word boundary (`btn-primary` matches) while `_` does not (`cta_link` does not).
static RE_BTN_CLASS: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)\b(btn|button|cta)\b").unwrap());
/// Matches action wording, case-insensitively: three German phrases (whitespace-tolerant, not
/// word-bounded) or any of the listed English verbs as a whole word.
static RE_ACTION_KW: Lazy<Regex> = Lazy::new(|| {
Regex::new(
r"(?i)geht\s+es\s+zum|hier\s+klicken|mehr\s+erfahren|\b(view|read|open|see|check|reply|continue|go|access|confirm|download|get|start|register|subscribe|buy|order|shop|learn|discover|explore|join)\b",
)
.unwrap()
});
/// Matches a shorter list of English action verbs as whole words, case-insensitively.
static RE_ARIA_ACTION: Lazy<Regex> = Lazy::new(|| {
Regex::new(r"(?i)\b(view|read|open|see|check|reply|continue|go|access|confirm)\b").unwrap()
});
/// Matches a `text-align: center` declaration inside an inline `style` attribute.
///
/// Case-insensitive, because CSS property names and keywords are ASCII case-insensitive and
/// some mail clients emit them upper-cased (`TEXT-ALIGN: center`). Whitespace around the colon
/// is optional.
static RE_TEXT_ALIGN_CENTER: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"(?i)text-align\s*:\s*center").unwrap());
/// Matches a `padding:` declaration whose first value starts with at least two digits
/// (e.g. `padding: 12px 24px`), used as a signal for generously padded, button-like cells.
///
/// Case-insensitive, because CSS property names are ASCII case-insensitive (`PADDING: 20px`).
/// Longhands such as `padding-left` do not match: the colon must directly follow `padding`
/// (optionally separated by whitespace).
static RE_PADDING_BIG: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"(?i)padding\s*:\s*\d{2,}").unwrap());
/// Extracts the most likely call-to-action link from an HTML document.
///
/// Structured JSON-LD metadata is consulted first; only when it yields nothing are the
/// document's links scored heuristically. Returns `None` when neither strategy finds a link.
pub fn extract_cta(html: &str) -> Option<CallToAction> {
    let document = Html::parse_document(html);
    extract_json_ld(&document).or_else(|| extract_heuristic(&document))
}
/// Looks for a call to action declared in embedded JSON-LD metadata.
///
/// Every `<script type="application/ld+json">` block is tried in turn. Blocks that are not
/// valid JSON are skipped rather than treated as an error, so a later block can still match.
/// The first node carrying a `potentialAction` with a non-blank string `target` wins and is
/// returned with confidence `1.0`.
///
/// JSON-LD allows a list wherever a single value may appear, so both the top-level value and
/// `potentialAction` are accepted as either one object or an array of objects.
fn extract_json_ld(doc: &Html) -> Option<CallToAction> {
    doc.select(&SEL_JSON_LD).find_map(|script| {
        let raw: String = script.text().collect();
        let json = serde_json::from_str::<serde_json::Value>(&raw).ok()?;
        json_ld_as_slice(&json)
            .iter()
            .find_map(cta_from_json_ld_node)
    })
}

/// Views a JSON value as a list: an array yields its items, anything else yields itself.
fn json_ld_as_slice(value: &serde_json::Value) -> &[serde_json::Value] {
    match value {
        serde_json::Value::Array(items) => items.as_slice(),
        other => std::slice::from_ref(other),
    }
}

/// Builds a call to action from one JSON-LD node, if it declares a usable `potentialAction`.
fn cta_from_json_ld_node(node: &serde_json::Value) -> Option<CallToAction> {
    let actions = json_ld_as_slice(node.get("potentialAction")?);
    // A blank target would otherwise be reported with full confidence and suppress the
    // heuristic fallback, so it is treated the same as a missing one.
    let (action, target) = actions.iter().find_map(|action| {
        let target = action.get("target")?.as_str()?;
        (!target.trim().is_empty()).then_some((action, target))
    })?;
    // The node's own name keeps priority; the action's name is only a fallback before the
    // generic "View" label.
    let label = node
        .get("name")
        .and_then(|name| name.as_str())
        .or_else(|| action.get("name").and_then(|name| name.as_str()))
        .unwrap_or("View");
    Some(CallToAction {
        url: target.to_string(),
        text: label.to_string(),
        confidence: 1.0,
    })
}
/// Picks the best-scoring `<a href>` in `doc` as the call to action.
///
/// A link is only considered when it
/// - points to an absolute `http://` or `https://` URL,
/// - has a non-empty label (the `title` attribute when it has content, otherwise the link's
///   own text),
/// - is not blacklisted by label or URL, is not a bare homepage URL, does not wrap a logo
///   image, and does not sit inside a header/footer region.
///
/// Links scoring below 40 are discarded. The highest score wins; on a tie the link encountered
/// first is kept. `confidence` is `score / 100`, capped at `1.0`.
fn extract_heuristic(doc: &Html) -> Option<CallToAction> {
    let mut best: Option<(i32, CallToAction)> = None;
    for link in doc.select(&SEL_LINKS) {
        let Some(href) = link.value().attr("href") else {
            continue;
        };
        // URL attributes may carry surrounding whitespace; without trimming, a padded URL
        // fails the scheme check or slips past the `$`-anchored bare-homepage filter.
        let href = href.trim();
        if !href.starts_with("http://") && !href.starts_with("https://") {
            continue;
        }
        // An empty or whitespace-only `title` must not hide the visible link text.
        let text = link
            .value()
            .attr("title")
            .map(str::trim)
            .filter(|title| !title.is_empty())
            .map(str::to_string)
            .unwrap_or_else(|| link.text().collect::<String>().trim().to_string());
        if text.is_empty() {
            continue;
        }
        if RE_BLACKLIST.is_match(&text) || RE_BLACKLIST.is_match(href) {
            continue;
        }
        if RE_BARE_HOMEPAGE.is_match(href) {
            continue;
        }
        if wraps_logo_img(&link) {
            continue;
        }
        if in_excluded_ancestor(&link) {
            continue;
        }
        let score = score_link(&link, &text);
        if score < 40 {
            continue;
        }
        // Strictly greater: an equal score never displaces an earlier link.
        if best.as_ref().is_none_or(|(top, _)| score > *top) {
            best = Some((
                score,
                CallToAction {
                    url: href.to_string(),
                    text,
                    confidence: (f64::from(score) / 100.0).min(1.0),
                },
            ));
        }
    }
    best.map(|(_, cta)| cta)
}
/// Scores how strongly `link` (with visible label `text`) looks like a call to action.
///
/// Points are additive:
/// - 30 when the link is styled like a button,
/// - 25 when the label contains an action keyword, plus 40 more when that label is at least
///   20 characters long,
/// - 20 when at least two prominence signals are present,
/// - 15 when the `aria-label` attribute contains an action verb.
///
/// The maximum is therefore 130.
fn score_link(link: &ElementRef, text: &str) -> i32 {
    let mut score = 0i32;
    if is_button_styled(link) {
        score += 30;
    }
    if RE_ACTION_KW.is_match(text) {
        score += 25;
        // Count characters, not bytes: `str::len` is a UTF-8 byte length, which would let
        // non-ASCII labels (umlauts, CJK) reach the threshold with far fewer characters.
        if text.chars().count() >= 20 {
            score += 40;
        }
    }
    if prominent_count(link, text) >= 2 {
        score += 20;
    }
    if let Some(aria) = link.value().attr("aria-label") {
        if RE_ARIA_ACTION.is_match(aria) {
            score += 15;
        }
    }
    score
}
/// Reports whether `link` is presented as a button.
///
/// Any one of these is sufficient:
/// - its `class` attribute contains `btn`, `button` or `cta` as a whole word,
/// - it has `role="button"`,
/// - its direct parent is a `<td>` whose inline style declares padding together with a
///   background or a border (the table-cell button pattern).
fn is_button_styled(link: &ElementRef) -> bool {
    let element = link.value();
    if element
        .attr("class")
        .is_some_and(|class| RE_BTN_CLASS.is_match(class))
    {
        return true;
    }
    if element.attr("role") == Some("button") {
        return true;
    }
    let Some(parent) = link.parent_element() else {
        return false;
    };
    if parent.value().name() != "td" {
        return false;
    }
    let Some(style) = parent.value().attr("style") else {
        return false;
    };
    // CSS property names are ASCII case-insensitive, so normalise before the substring
    // checks; otherwise `PADDING` / `BACKGROUND-COLOR` would be missed.
    let style = style.to_ascii_lowercase();
    style.contains("padding") && (style.contains("background") || style.contains("border"))
}
/// Counts how many visual-prominence signals apply to `link` (0 to 3).
///
/// Signals:
/// - the parent's inline style centres its text,
/// - the parent's inline style has a padding value of two or more digits,
/// - the label `text` is longer than 10 characters.
fn prominent_count(link: &ElementRef, text: &str) -> usize {
    let mut count = 0;
    if let Some(style) = link
        .parent_element()
        .and_then(|parent| parent.value().attr("style"))
    {
        if RE_TEXT_ALIGN_CENTER.is_match(style) {
            count += 1;
        }
        if RE_PADDING_BIG.is_match(style) {
            count += 1;
        }
    }
    // Count characters, not bytes: `str::len` is a UTF-8 byte length and would overstate
    // the length of non-ASCII labels.
    if text.chars().count() > 10 {
        count += 1;
    }
    count
}
/// Reports whether `link` contains an `<img>` whose `alt` text mentions "logo" or "brand"
/// (case-insensitively). Images without an `alt` attribute are ignored.
fn wraps_logo_img(link: &ElementRef) -> bool {
    static SEL_IMG: Lazy<Selector> = Lazy::new(|| Selector::parse("img").unwrap());
    link.select(&SEL_IMG)
        .filter_map(|image| image.value().attr("alt"))
        .map(str::to_lowercase)
        .any(|alt| alt.contains("logo") || alt.contains("brand"))
}
fn in_excluded_ancestor(link: &ElementRef) -> bool {
let mut current = link.parent_element();
while let Some(el) = current {
let name = el.value().name();
let role = el.value().attr("role").unwrap_or("");
if name == "header" || name == "footer" || role == "banner" || role == "contentinfo" {
return true;
}
current = el.parent_element();
}
false
}
// End-to-end tests: each case feeds a small HTML document to `extract_cta` and checks
// whether a call to action is found (and, where relevant, its URL / text / confidence).
#[cfg(test)]
mod tests {
use super::*;
// Wraps an HTML fragment in a minimal `<html><body>` document.
fn wrap(link: &str) -> String {
format!("<html><body>{}</body></html>", link)
}
// The label alone would score 65 (keyword 25 + long text 40), so the rejection here is
// attributable to the bare-homepage filter.
#[test]
fn test_bare_homepage_filtered() {
let html = wrap(r#"<a href="https://example.com/">View your account now today</a>"#);
assert!(extract_cta(&html).is_none());
}
// Same host but with a path: button class (30) + keyword "Go" (25) + long text (40).
#[test]
fn test_bare_homepage_with_path_not_filtered() {
let html = wrap(
r#"<a href="https://example.com/dashboard" class="btn">Go to your dashboard now</a>"#,
);
let cta = extract_cta(&html);
assert!(
cta.is_some(),
"link with path should not be bare-homepage-filtered"
);
}
// Blacklisted word in the link text.
// NOTE(review): this label contains no action keyword either ("Unsubscribe" is not the
// whole word "subscribe"), so the link would also score 0 — the test does not isolate
// the blacklist.
#[test]
fn test_unsubscribe_in_text_filtered() {
let html = wrap(r#"<a href="https://example.com/manage">Unsubscribe from this list</a>"#);
assert!(extract_cta(&html).is_none());
}
// Blacklisted word in the URL.
// NOTE(review): "Manage preferences" has no action keyword, so the link would also be
// dropped by the score threshold.
#[test]
fn test_unsubscribe_in_href_filtered() {
let html = wrap(r#"<a href="https://example.com/unsubscribe">Manage preferences</a>"#);
assert!(extract_cta(&html).is_none());
}
// Only `http://` / `https://` URLs are candidates; `mailto:` is rejected by the scheme check.
#[test]
fn test_mailto_filtered() {
let html = wrap(r#"<a href="mailto:support@example.com">Contact us</a>"#);
assert!(extract_cta(&html).is_none());
}
// A valid link with no button styling and no action keyword scores 0, below the cutoff of 40.
#[test]
fn test_low_score_discarded() {
let html = wrap(r#"<a href="https://example.com/page">Random link text here</a>"#);
assert!(extract_cta(&html).is_none());
}
// NOTE(review): this link has no text and no `title`, so it is rejected by the empty-label
// check, which runs before the logo check — the logo filter itself is not exercised here.
#[test]
fn test_logo_image_filtered() {
let html = wrap(
r#"<a href="https://www.company.com/home"><img src="logo.png" alt="Company Logo" /></a>"#,
);
assert!(extract_cta(&html).is_none());
}
// Link with text plus an image whose alt mentions "brand": rejected by the logo check.
// NOTE(review): "View offer" alone scores only 25 (keyword, short text), so the link would
// be discarded by the threshold even without the logo filter.
#[test]
fn test_brand_image_filtered() {
let html = wrap(
r#"<a href="https://example.com/brand"><img src="img.png" alt="Brand Image" />View offer</a>"#,
);
assert!(extract_cta(&html).is_none());
}
// Image-only link where the image has no `alt` attribute at all: the label is empty, so
// the link is skipped by the empty-label check.
#[test]
fn test_empty_alt_image_not_logo_filtered() {
let html = wrap(r#"<a href="https://example.com/view"><img src="img.png" /></a>"#);
assert!(extract_cta(&html).is_none());
}
// Button class (30) + keyword "Go" (25) = 55; the 15-character label earns no length bonus.
#[test]
fn test_button_class_detected() {
let html =
wrap(r#"<a href="https://app.example.com/dash" class="btn">Go to Dashboard</a>"#);
let cta = extract_cta(&html).expect("should find CTA with btn class");
assert_eq!(cta.url, "https://app.example.com/dash");
assert!(cta.confidence >= 0.5);
}
// German phrase "geht es zum" (25) + long text (40) = 65, with no button styling.
#[test]
fn test_de_keyword_long_text() {
let html =
wrap(r#"<a href="https://example.com/postfach">Hier geht es zum TK-Postfach</a>"#);
let cta = extract_cta(&html).expect("should find CTA with DE keyword");
assert!(
cta.confidence > 0.6,
"confidence should be > 0.6, got {}",
cta.confidence
);
assert_eq!(cta.url, "https://example.com/postfach");
}
// JSON-LD with `potentialAction.target` short-circuits the heuristic: the URL comes from
// `target`, the text from the top-level `name`, and confidence is fixed at 1.0.
#[test]
fn test_json_ld_fast_path() {
let html = r#"<html><head>
<script type="application/ld+json">
{"@type":"EmailMessage","name":"View Invoice","potentialAction":{"@type":"ViewAction","target":"https://example.com/invoice/42"}}
</script></head><body></body></html>"#;
let cta = extract_cta(html).expect("should find JSON-LD CTA");
assert_eq!(cta.url, "https://example.com/invoice/42");
assert_eq!(cta.text, "View Invoice");
assert_eq!(cta.confidence, 1.0);
}
// JSON-LD without `potentialAction` yields nothing, so the body link is picked heuristically.
#[test]
fn test_json_ld_missing_target_falls_through() {
let html = r#"<html><head>
<script type="application/ld+json">{"@type":"EmailMessage","name":"Hello"}</script>
</head><body>
<a href="https://example.com/view" class="btn">View your report now here</a>
</body></html>"#;
let cta = extract_cta(html).expect("should fall through to heuristic");
assert!(cta.url.contains("example.com/view"));
}
}