use scraper::ElementRef;
/// Structural signals extracted from a single DOM subtree by [`compute`].
#[derive(Debug, Clone, Default)]
pub struct DomFeatures {
    /// Text characters per element, mapped through
    /// [`normalized_text_density`].
    pub text_density: f64,
    /// Share of the subtree's text characters that sit inside `a` elements,
    /// clamped to `[0.0, 1.0]`; `0.0` when the subtree has no text.
    pub link_ratio: f64,
    /// Tag name of the element the features were computed for.
    pub primary_root_tag: String,
}
/// Derives the [`DomFeatures`] of the subtree rooted at `root`.
///
/// * `text_density` is the number of text characters divided by the element
///   count, passed through [`normalized_text_density`].
/// * `link_ratio` is the number of characters found inside `a` elements
///   divided by the total number of text characters, clamped to `[0.0, 1.0]`.
///   It is `0.0` when the subtree contains no text at all.
/// * `primary_root_tag` is the tag name of `root`.
pub fn compute(root: ElementRef<'_>) -> DomFeatures {
    let primary_root_tag = root.value().name().to_string();

    let text_len: usize = root.text().map(|chunk| chunk.chars().count()).sum();
    // A floor of one keeps the division below well-defined.
    let elements = count_descendant_elements(root).max(1);
    let text_density = normalized_text_density(text_len as f64 / elements as f64);

    let linked_len = anchor_text_chars(root);
    let link_ratio = match text_len {
        0 => 0.0,
        len => (linked_len as f64 / len as f64).clamp(0.0, 1.0),
    };

    DomFeatures {
        text_density,
        link_ratio,
        primary_root_tag,
    }
}
/// Maps a raw "characters per element" density onto a score in `[0.0, 1.0]`.
///
/// The mapping is piecewise linear, continuous and non-decreasing:
///
/// | `raw`          | result                                   |
/// |----------------|------------------------------------------|
/// | NaN or `<= 0`  | `0.0`                                    |
/// | `(0, 10]`      | `raw / 25` (reaches `0.4` at 10)         |
/// | `(10, 25]`     | `0.4 + (raw - 10) / 75` (`0.6` at 25)    |
/// | `> 25`         | `0.6 + (raw - 25) / 100`, capped at `1.0`|
///
/// The cap is reached at `raw == 125`; larger values (including `+inf`)
/// return `1.0`.
pub fn normalized_text_density(raw: f64) -> f64 {
    // NaN fails every `<=` comparison, so without this guard it would reach
    // the last branch, where `f64::min` drops the NaN operand and the result
    // would be the maximum score of 1.0. Treat it as "no signal" instead.
    if raw.is_nan() || raw <= 0.0 {
        return 0.0;
    }

    if raw <= 10.0 {
        raw / 25.0
    } else if raw <= 25.0 {
        0.4 + (raw - 10.0) / 75.0
    } else {
        0.6 + ((raw - 25.0) / 100.0).min(0.4)
    }
}
/// Counts the element nodes yielded by `el.descendants()`.
///
/// Nodes that are not elements are skipped.
///
/// NOTE(review): whether `descendants()` also yields `el` itself depends on
/// the underlying tree crate — confirm before relying on the exact count.
fn count_descendant_elements(el: ElementRef<'_>) -> usize {
    el.descendants()
        .filter(|node| node.value().is_element())
        .count()
}
/// Totals the text characters (Unicode scalar values) inside every element
/// that `root.select` matches for the `a` selector.
fn anchor_text_chars(root: ElementRef<'_>) -> usize {
    let anchors = scraper::Selector::parse("a").unwrap();

    let mut total = 0;
    for anchor in root.select(&anchors) {
        for fragment in anchor.text() {
            total += fragment.chars().count();
        }
    }
    total
}
#[cfg(test)]
mod tests {
    use super::*;
    use scraper::{Html, Selector};

    /// Parses `html`, takes the first element matching `sel`, and returns the
    /// features computed for it. Panics if nothing matches.
    fn first_root(html: &str, sel: &str) -> DomFeatures {
        let document = Html::parse_document(html);
        let selector = Selector::parse(sel).unwrap();
        let root = document.select(&selector).next().expect("missing root");
        compute(root)
    }

    /// The normalized score never decreases as the raw density grows and
    /// never exceeds 1.0.
    #[test]
    fn density_piecewise_monotonic() {
        let samples = [0.0, 5.0, 10.0, 15.0, 25.0, 50.0, 100.0, 125.0, 200.0];
        let scores: Vec<f64> = samples
            .iter()
            .copied()
            .map(normalized_text_density)
            .collect();
        for w in scores.windows(2) {
            assert!(w[1] >= w[0], "non-monotonic at {w:?}");
        }
        assert!(scores.last().unwrap() <= &1.0);
    }

    /// The cap of 1.0 is hit at a raw density of 125 and holds beyond it.
    #[test]
    fn density_caps_near_one() {
        let at_cap = normalized_text_density(125.0);
        assert!((at_cap - 1.0).abs() < 1e-9);

        let far_beyond = normalized_text_density(1000.0);
        assert!(far_beyond <= 1.0);
    }

    /// Long paragraphs without links: dense text, almost no anchor text.
    #[test]
    fn article_root_has_high_density_low_links() {
        let paragraph = "<p>Researchers studying migration patterns observed that flocks travel surprising distances each season. The findings suggest environmental cues guide navigation across continents.</p>";

        let mut html = String::from("<!doctype html><html><body><article>");
        html.push_str(&paragraph.repeat(6));
        html.push_str("</article></body></html>");

        let features = first_root(&html, "article");
        assert!(features.text_density > 0.6, "got {}", features.text_density);
        assert!(features.link_ratio < 0.1);
    }

    /// A grid of linked cards: little text per element, nearly all of it
    /// inside anchors.
    #[test]
    fn listing_root_has_low_density_high_links() {
        let mut html = String::from("<!doctype html><html><body><div class=\"cards\">");
        html.extend((0..12).map(|i| {
            format!(
                "<article class=\"card\"><a href=\"/p/{i}\"><h3>Title {i}</h3></a></article>"
            )
        }));
        html.push_str("</div></body></html>");

        let features = first_root(&html, "div.cards");
        assert!(features.link_ratio > 0.5, "got {}", features.link_ratio);
        assert!(features.text_density < 0.5, "got {}", features.text_density);
    }
}