use once_cell::sync::Lazy;
use regex::Regex;
use scraper::{Html, Selector};
use serde::Serialize;
/// Structural summary of one parsed HTML page, as produced by `extract`.
///
/// Every field except `body_text` is serialized.
#[derive(Serialize)]
pub struct ContentStructure {
/// Number of whitespace-separated tokens in `body_text` that contain at least one
/// alphanumeric character.
pub word_count: usize,
/// Whitespace-collapsed text of each non-empty `<h1>` outside page chrome
/// (`nav`, `footer`, `aside`, or a `<header>` that contains a `<nav>`).
pub h1: Vec<String>,
/// Same as `h1`, for `<h2>` elements.
pub h2: Vec<String>,
/// Same as `h1`, for `<h3>` elements.
pub h3: Vec<String>,
/// Whether the TL;DR pattern matched somewhere in `body_text`.
pub has_tldr: bool,
/// Whether the FAQ pattern matched in the combined `h1`/`h2`/`h3` heading text.
pub has_faq: bool,
/// Whether the author pattern matched somewhere in `body_text`.
pub has_author: bool,
/// Whether the name-plus-credential pattern matched somewhere in `body_text`.
pub has_credentials: bool,
/// Number of `<img>` elements whose `alt` attribute is absent or blank after trimming.
pub missing_alt_count: usize,
/// Total number of `<img>` elements in the document.
pub image_count: usize,
/// Trimmed `lang` attribute of the `<html>` element; `None` when absent or blank.
pub html_lang: Option<String>,
/// Non-empty `<h1>`–`<h6>` headings outside page chrome, in document order.
pub headings_in_order: Vec<HeadingOrderEntry>,
/// Sorted, de-duplicated `hreflang` values of `<link rel="alternate" hreflang>` elements.
pub hreflangs: Vec<String>,
/// Classification of the document's `<noscript>` content.
pub noscript_kind: NoscriptKind,
/// Number of `<table>` elements outside page chrome.
pub table_count: usize,
/// Number of headings whose text is empty or whitespace-only.
pub empty_heading_count: usize,
/// Number of headings that repeat the text of an earlier heading (case-insensitive,
/// surrounding whitespace ignored); a text seen `n` times contributes `n - 1`.
pub duplicate_heading_count: usize,
/// Number of sentence-like fragments of `body_text` that are 5 to 25 words long.
pub quotable_sentence_count: usize,
/// Whitespace-normalized text of the content root; kept in memory but never serialized.
#[serde(skip_serializing)]
pub body_text: String,
}
/// One heading as it appears in document order.
#[derive(Serialize)]
pub struct HeadingOrderEntry {
/// Heading level parsed from the tag name (`h1` -> 1, ..., `h6` -> 6).
pub level: u8,
/// Heading text with runs of whitespace collapsed to single spaces.
pub text: String,
}
/// How much real content the page's `<noscript>` elements carry.
///
/// Serialized in snake_case (`absent`, `boilerplate_only`, `substantive`).
#[derive(Serialize, PartialEq, Clone, Copy)]
#[serde(rename_all = "snake_case")]
pub enum NoscriptKind {
/// The document has no `<noscript>` element.
Absent,
/// `<noscript>` elements exist, but none qualifies as substantive.
BoilerplateOnly,
/// At least one `<noscript>` has 40 or more bytes of normalized text and none of the
/// recognised "enable JavaScript"-style phrases.
Substantive,
}
/// Matches a capitalised name of one to four words followed by a professional
/// credential such as `MD`, `Ph.D.` or `MBA` (for example `Jane Doe, MD` or `John Smith PhD`).
///
/// The name and the credential must be separated by a comma (with optional surrounding
/// whitespace) or by at least one whitespace character. Allowing an empty separator would
/// let ordinary all-caps words split into "name + credential" (`TO`+`DO`, `GO`+`OD`,
/// `METH`+`OD`), producing false positives on shouted headings.
static CREDENTIAL_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r"\b[A-Z][a-zA-Z\-']+(?:\s+[A-Z][a-zA-Z\-']+){0,3}(?:\s*,\s*|\s+)(MD|Ph\.?D\.?|MBA|MSc|MPH|DDS|DMD|JD|RN|DO|DPM|OD|PharmD|DVM|EdD|PsyD)\b",
    )
    .unwrap()
});
/// Case-insensitive matcher for a "TL;DR" marker: the semicolon and a trailing colon are
/// both optional, and any whitespace that follows is included in the match.
static TLDR_RE: Lazy<Regex> = Lazy::new(|| {
    let pattern = r"(?i)TL;?DR:?\s*";
    Regex::new(pattern).unwrap()
});
/// Case-insensitive matcher for "FAQ" or "Frequently Asked Questions" (the words may be
/// separated by any amount of whitespace).
static FAQ_HEADING_RE: Lazy<Regex> = Lazy::new(|| {
    let pattern = r"(?i)FAQ|Frequently\s+Asked\s+Questions";
    Regex::new(pattern).unwrap()
});
/// Heuristic matcher for an author attribution: the word "author", the phrase
/// "written by", or a byline of the form "by <Capitalised word>".
///
/// Case-insensitivity is scoped to the keywords only. With a pattern-wide `(?i)` the
/// `[A-Z]` class would also accept lowercase letters, so ordinary prose such as
/// "by the way" would count as a byline. The `\b` before "by" likewise keeps words that
/// merely end in "by" ("nearby Alice") from matching.
static AUTHOR_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i:author|written\s+by)|\b(?i:by)\s+[A-Z]").unwrap()
});
pub fn extract(doc: &Html) -> ContentStructure {
let body_text = extract_body_text(doc);
let h1 = headings(doc, "h1");
let h2 = headings(doc, "h2");
let h3 = headings(doc, "h3");
let heading_blob = format!("{} {} {}", h1.join(" "), h2.join(" "), h3.join(" "));
let (image_count, missing_alt_count) = count_images(doc);
let html_lang = extract_html_lang(doc);
let headings_in_order = extract_headings_in_order(doc);
let hreflangs = extract_hreflangs(doc);
let noscript_kind = classify_noscript(doc);
let table_count = count_tables(doc);
let (empty_heading_count, duplicate_heading_count) =
analyze_heading_quality(&headings_in_order);
let quotable_sentence_count = count_quotable_sentences(&body_text);
ContentStructure {
word_count: count_words(&body_text),
has_tldr: TLDR_RE.is_match(&body_text),
has_faq: FAQ_HEADING_RE.is_match(&heading_blob),
has_author: AUTHOR_RE.is_match(&body_text),
has_credentials: CREDENTIAL_RE.is_match(&body_text),
h1,
h2,
h3,
image_count,
missing_alt_count,
html_lang,
headings_in_order,
hreflangs,
noscript_kind,
table_count,
empty_heading_count,
duplicate_heading_count,
quotable_sentence_count,
body_text,
}
}
/// Counts the `<table>` elements that are not nested inside page chrome.
fn count_tables(doc: &Html) -> usize {
    let table_selector = Selector::parse("table").unwrap();
    let mut total = 0;
    for table in doc.select(&table_selector) {
        if !is_in_chrome(table) {
            total += 1;
        }
    }
    total
}
/// Returns `(empty, duplicates)` for a list of headings.
///
/// * `empty` is the number of headings whose text is empty or whitespace-only.
/// * `duplicates` is the number of non-empty headings that repeat an earlier heading's
///   text, ignoring surrounding whitespace and letter case; a text seen `n` times
///   contributes `n - 1`.
///
/// Case folding uses Unicode lowercasing so that non-ASCII headings such as
/// "über uns" / "ÜBER UNS" are recognised as duplicates too.
fn analyze_heading_quality(headings: &[HeadingOrderEntry]) -> (usize, usize) {
    let mut empty = 0;
    let mut duplicates = 0;
    let mut seen = std::collections::HashSet::with_capacity(headings.len());
    for heading in headings {
        let text = heading.text.trim();
        if text.is_empty() {
            empty += 1;
        } else if !seen.insert(text.to_lowercase()) {
            // `insert` returns false when the key was already present: a repeat.
            duplicates += 1;
        }
    }
    (empty, duplicates)
}
/// Sentence delimiter: one or more of `.`, `!`, `?` followed by at least one whitespace
/// character.
static QUOTABLE_SPLIT: Lazy<Regex> = Lazy::new(|| {
    let pattern = r"[.!?]+\s+";
    Regex::new(pattern).unwrap()
});
/// Counts the fragments of `body_text`, split at sentence-ending punctuation, that
/// contain between 5 and 25 whitespace-separated words (inclusive).
fn count_quotable_sentences(body_text: &str) -> usize {
    let mut quotable = 0;
    for fragment in QUOTABLE_SPLIT.split(body_text) {
        let word_total = fragment.split_whitespace().count();
        if matches!(word_total, 5..=25) {
            quotable += 1;
        }
    }
    quotable
}
/// Collects every `<h1>`–`<h6>` outside page chrome, in document order.
///
/// Each heading's text is whitespace-collapsed; headings whose text ends up empty are
/// left out of the result.
fn extract_headings_in_order(doc: &Html) -> Vec<HeadingOrderEntry> {
    let heading_selector = Selector::parse("h1, h2, h3, h4, h5, h6").unwrap();
    let mut ordered = Vec::new();
    for heading in doc.select(&heading_selector) {
        if is_in_chrome(heading) {
            continue;
        }
        // The level is the digit after the leading `h` of the tag name.
        let parsed_level = heading
            .value()
            .name()
            .strip_prefix('h')
            .and_then(|digits| digits.parse::<u8>().ok());
        let Some(level) = parsed_level else {
            continue;
        };
        let raw: String = heading.text().collect();
        let text = raw.split_whitespace().collect::<Vec<_>>().join(" ");
        if !text.is_empty() {
            ordered.push(HeadingOrderEntry { level, text });
        }
    }
    ordered
}
fn is_in_chrome(el: scraper::ElementRef<'_>) -> bool {
let mut node = el.parent();
while let Some(n) = node {
if let scraper::Node::Element(elem) = n.value() {
match elem.name() {
"nav" | "footer" | "aside" => return true,
"header" => {
if let Some(eref) = scraper::ElementRef::wrap(n)
&& header_is_banner(eref)
{
return true;
}
}
_ => {}
}
}
node = n.parent();
}
false
}
/// Returns the distinct, trimmed, non-empty `hreflang` values found on
/// `<link rel="alternate" hreflang=...>` elements, sorted ascending.
fn extract_hreflangs(doc: &Html) -> Vec<String> {
    let link_selector = Selector::parse("link[rel=\"alternate\"][hreflang]").unwrap();
    // An ordered set yields the values both unique and sorted.
    let mut codes = std::collections::BTreeSet::new();
    for link in doc.select(&link_selector) {
        if let Some(raw) = link.value().attr("hreflang") {
            let code = raw.trim();
            if !code.is_empty() {
                codes.insert(code.to_string());
            }
        }
    }
    codes.into_iter().collect()
}
/// Classifies the document's `<noscript>` elements.
///
/// Returns `Absent` when there are none, `Substantive` when at least one has 40 or more
/// bytes of lowercased, whitespace-collapsed text that contains none of the known
/// "enable JavaScript"-style phrases, and `BoilerplateOnly` otherwise.
fn classify_noscript(doc: &Html) -> NoscriptKind {
    const BOILERPLATE_PHRASES: [&str; 4] = [
        "enable javascript",
        "requires javascript",
        "turn on javascript",
        "javascript is disabled",
    ];

    let noscript_selector = Selector::parse("noscript").unwrap();
    let mut blocks = doc.select(&noscript_selector).peekable();
    if blocks.peek().is_none() {
        return NoscriptKind::Absent;
    }

    let has_substantive = blocks.any(|block| {
        let lowered = block.text().collect::<String>().to_ascii_lowercase();
        let normalized = lowered.split_whitespace().collect::<Vec<_>>().join(" ");
        let boilerplate = BOILERPLATE_PHRASES
            .iter()
            .any(|phrase| normalized.contains(*phrase));
        !boilerplate && normalized.len() >= 40
    });

    if has_substantive {
        NoscriptKind::Substantive
    } else {
        NoscriptKind::BoilerplateOnly
    }
}
/// Returns `(total, missing)`: the number of `<img>` elements in the document and how
/// many of them have an `alt` attribute that is absent or blank after trimming.
fn count_images(doc: &Html) -> (usize, usize) {
    let img_selector = Selector::parse("img").unwrap();
    doc.select(&img_selector)
        .fold((0, 0), |(total, missing), img| {
            let lacks_alt = img
                .value()
                .attr("alt")
                .is_none_or(|alt| alt.trim().is_empty());
            (total + 1, missing + usize::from(lacks_alt))
        })
}
/// Returns the trimmed `lang` attribute of the first `<html>` element, or `None` when
/// the element or attribute is missing or the value is blank.
fn extract_html_lang(doc: &Html) -> Option<String> {
    let html_selector = Selector::parse("html").unwrap();
    let root = doc.select(&html_selector).next()?;
    let lang = root.value().attr("lang")?.trim();
    if lang.is_empty() {
        None
    } else {
        Some(lang.to_string())
    }
}
/// Returns the whitespace-collapsed text of every element matching the selector `tag`
/// that lies outside page chrome, skipping elements whose text is empty.
///
/// Panics if `tag` is not a valid CSS selector.
fn headings(doc: &Html, tag: &str) -> Vec<String> {
    let tag_selector = Selector::parse(tag).unwrap();
    let mut texts = Vec::new();
    for element in doc.select(&tag_selector) {
        if is_in_chrome(element) {
            continue;
        }
        let raw: String = element.text().collect();
        let normalized = raw.split_whitespace().collect::<Vec<_>>().join(" ");
        if !normalized.is_empty() {
            texts.push(normalized);
        }
    }
    texts
}
fn extract_body_text(doc: &Html) -> String {
let root_ref = pick_content_root(doc);
let mut buf = String::with_capacity(8 * 1024);
let mut stack: Vec<_> = root_ref.children().rev().collect();
while let Some(node) = stack.pop() {
match node.value() {
scraper::Node::Text(t) => {
buf.push_str(t);
buf.push(' ');
}
scraper::Node::Element(el) => {
if skip_element(el.name()) {
continue;
}
if el.name() == "header"
&& let Some(eref) = scraper::ElementRef::wrap(node)
&& header_is_banner(eref)
{
continue;
}
for child in node.children().rev() {
stack.push(child);
}
}
_ => {}
}
}
buf.split_whitespace().collect::<Vec<_>>().join(" ")
}
/// Treats an element as a site banner when it has at least one `<nav>` descendant.
fn header_is_banner(el: scraper::ElementRef<'_>) -> bool {
    let nav_selector = Selector::parse("nav").unwrap();
    let mut nested_navs = el.select(&nav_selector);
    nested_navs.next().is_some()
}
/// Chooses the element whose subtree is treated as the page's main content.
///
/// Preference order: the first `<main>`, then the first `<body>`, then the document's
/// root element.
fn pick_content_root(doc: &Html) -> scraper::ElementRef<'_> {
    let main_sel = Selector::parse("main").unwrap();
    if let Some(main) = doc.select(&main_sel).next() {
        return main;
    }
    let body_sel = Selector::parse("body").unwrap();
    // Resolve the root-element fallback lazily: it is only needed when no `<body>` exists.
    doc.select(&body_sel)
        .next()
        .unwrap_or_else(|| doc.root_element())
}
/// Returns `true` for tag names whose subtrees are excluded from body-text extraction:
/// non-visible or embedded content and page chrome. The comparison is case-sensitive.
fn skip_element(name: &str) -> bool {
    const SKIPPED_TAGS: [&str; 11] = [
        "script", "style", "noscript", "template", "svg", "iframe", "object", "embed", "nav",
        "footer", "aside",
    ];
    SKIPPED_TAGS.contains(&name)
}
/// Counts the whitespace-separated tokens of `s` that contain at least one alphanumeric
/// character, so tokens made purely of punctuation are not counted as words.
fn count_words(s: &str) -> usize {
    let mut total = 0;
    for token in s.split_whitespace() {
        if token.chars().any(|c| c.is_alphanumeric()) {
            total += 1;
        }
    }
    total
}