use scraper::{ElementRef, Selector};
use std::collections::HashSet;
/// Thresholds that decide which text blocks survive extraction.
///
/// A block is kept when it has at least `min_block_words` words and its link
/// density does not exceed `max_block_link_density`.
#[derive(Debug, Clone, Copy)]
pub struct ExtractionParams {
    /// Minimum number of whitespace-separated words a block needs to be kept.
    pub min_block_words: usize,
    /// Maximum link density a block may have and still be kept.
    pub max_block_link_density: f32,
}

impl Default for ExtractionParams {
    /// Returns the default thresholds: 5 words and a link density of 0.5.
    fn default() -> Self {
        Self {
            min_block_words: 5,
            max_block_link_density: 0.5,
        }
    }
}

impl ExtractionParams {
    /// Maps a normalized parameter vector onto concrete thresholds.
    ///
    /// * `params[0]` in `[-1, 1]` maps linearly onto `min_block_words` in `1..=40`.
    /// * `params[1]` in `[-1, 1]` maps linearly onto `max_block_link_density`
    ///   in `0.1..=0.9`.
    ///
    /// Values outside `[-1, 1]` (including infinities) are clamped. Missing
    /// entries and NaN are treated as `0.0`, the midpoint of the range. Extra
    /// entries are ignored.
    pub fn from_normalized(params: &[f32]) -> Self {
        // `f32::clamp` passes NaN through unchanged. Left alone, NaN would
        // become `min_block_words == 0` via the `as usize` cast and a NaN
        // density threshold that no block can satisfy, so neutralise it here.
        let unit = |index: usize| -> f32 {
            let raw = params.get(index).copied().unwrap_or(0.0);
            if raw.is_nan() {
                0.0
            } else {
                raw.clamp(-1.0, 1.0)
            }
        };
        let p0 = unit(0);
        let p1 = unit(1);

        // The value is finite and clamped to [1, 40], so the cast is lossless.
        let min_block_words = (1.0 + (p0 + 1.0) * 19.5).round().clamp(1.0, 40.0) as usize;
        let max_block_link_density = (0.1 + (p1 + 1.0) * 0.4).clamp(0.1, 0.9);

        Self {
            min_block_words,
            max_block_link_density,
        }
    }
}
/// Fixed-size feature vector describing one candidate DOM node.
///
/// Every field is an `f32`; the values produced by `extract_features` are
/// clamped to `[0, 1]`.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct NodeFeatures {
    /// Log-scaled word count.
    pub word_count_norm: f32,
    /// Log-scaled character count.
    pub char_count_norm: f32,
    /// Share of the node's text characters that sit inside `<a>` elements.
    pub link_density: f32,
    /// Share of words found in the stopword set.
    pub stopword_ratio: f32,
    /// Scaled count of `<p>` descendants.
    pub p_count_norm: f32,
    /// Scaled words-per-element ratio.
    pub text_tag_ratio: f32,
    /// Scaled depth of the node in the element tree.
    pub depth_norm: f32,
    /// Scaled commas-per-word ratio.
    pub comma_density: f32,
    /// One-hot flag: the node is an `<article>`.
    pub tag_article: f32,
    /// One-hot flag: the node is a `<main>`.
    pub tag_main: f32,
    /// One-hot flag: the node is a `<section>`.
    pub tag_section: f32,
    /// One-hot flag: the node is a `<div>`.
    pub tag_div: f32,
    /// One-hot flag: the node is any other tag.
    pub tag_other: f32,
    /// Score for content-like class/id hints.
    pub class_positive: f32,
    /// Score for boilerplate-like class/id hints.
    pub class_negative: f32,
    /// Share of distinct (case-insensitive) words among all words.
    pub unique_word_ratio: f32,
}

impl NodeFeatures {
    /// Number of features, i.e. the length of the vector from [`NodeFeatures::to_vec`].
    pub const DIM: usize = 16;

    /// Returns a feature set with every field set to `0.0`.
    pub fn zeros() -> Self {
        let zero = 0.0_f32;
        Self {
            word_count_norm: zero,
            char_count_norm: zero,
            link_density: zero,
            stopword_ratio: zero,
            p_count_norm: zero,
            text_tag_ratio: zero,
            depth_norm: zero,
            comma_density: zero,
            tag_article: zero,
            tag_main: zero,
            tag_section: zero,
            tag_div: zero,
            tag_other: zero,
            class_positive: zero,
            class_negative: zero,
            unique_word_ratio: zero,
        }
    }

    /// Flattens the features into a vector, in field declaration order.
    pub fn to_vec(&self) -> Vec<f32> {
        // The array type ties the element count to `DIM` at compile time.
        let ordered: [f32; Self::DIM] = [
            self.word_count_norm,
            self.char_count_norm,
            self.link_density,
            self.stopword_ratio,
            self.p_count_norm,
            self.text_tag_ratio,
            self.depth_norm,
            self.comma_density,
            self.tag_article,
            self.tag_main,
            self.tag_section,
            self.tag_div,
            self.tag_other,
            self.class_positive,
            self.class_negative,
            self.unique_word_ratio,
        ];
        ordered.to_vec()
    }

    /// Hand-tuned content score in `[0, 1]`.
    ///
    /// Rewards word count, paragraph count, low link density, positive
    /// class/id hints and `<article>`/`<main>` tags. Penalises negative
    /// class/id hints and link density.
    pub fn heuristic_content_score(&self) -> f32 {
        // (weight, feature) pairs, summed in this exact order. Negative
        // weights are the penalty terms.
        let weighted_terms: [(f32, f32); 8] = [
            (0.40, self.word_count_norm),
            (0.20, self.p_count_norm),
            (0.15, 1.0 - self.link_density),
            (0.10, self.class_positive),
            (0.10, self.tag_article),
            (0.05, self.tag_main),
            (-0.40, self.class_negative),
            (-0.20, self.link_density),
        ];
        let raw = weighted_terms
            .iter()
            .fold(0.0_f32, |acc, (weight, value)| acc + weight * value);
        raw.clamp(0.0, 1.0)
    }
}
/// Lowercase class/id fragments that suggest a node holds main content.
///
/// `class_id_hint_scores` checks these against the lowercased `class` and
/// `id` attributes of an element.
const POSITIVE_HINTS: &[&str] = &[
    "article",
    "content",
    "post",
    "story",
    "body",
    "entry",
    "main",
    "text",
    "blog",
    "page",
];
/// Lowercase class/id fragments that suggest a node is boilerplate
/// (navigation, ads, sharing widgets and similar).
///
/// `class_id_hint_scores` checks these against the lowercased `class` and
/// `id` attributes of an element.
const NEGATIVE_HINTS: &[&str] = &[
    "comment",
    "sidebar",
    "footer",
    "header",
    "nav",
    "menu",
    "ad",
    "advert",
    "promo",
    "share",
    "social",
    "related",
    "widget",
    "banner",
    "popup",
    "cookie",
    "newsletter",
    "subscribe",
    "breadcrumb",
];
/// Builds the CSS selector that matches `<p>` elements.
///
/// # Panics
///
/// Panics if the literal selector `"p"` fails to parse.
fn p_selector() -> Selector {
    let parsed = Selector::parse("p");
    parsed.unwrap()
}
/// Builds the CSS selector that matches `<a>` elements.
///
/// # Panics
///
/// Panics if the literal selector `"a"` fails to parse.
fn a_selector() -> Selector {
    let parsed = Selector::parse("a");
    parsed.unwrap()
}
/// Returns the text of `el` and its descendants with whitespace normalised.
///
/// Text nodes are treated as separate chunks: words never span two text
/// nodes. All words are joined by single spaces, with no leading or trailing
/// whitespace.
pub fn node_text(el: &ElementRef) -> String {
    // Tokenising each text node on its own yields the same word sequence as
    // joining the nodes with spaces first and then splitting the result.
    let words: Vec<&str> = el.text().flat_map(str::split_whitespace).collect();
    words.join(" ")
}
/// One paragraph-level block of text, with the statistics used to filter it.
#[derive(Debug, Clone)]
pub struct TextBlock {
    /// Whitespace-normalised text of the block.
    pub text: String,
    /// Number of whitespace-separated words in `text`.
    pub words: usize,
    /// Link density of the block.
    pub link_density: f32,
}
/// Extractable content of one candidate node: its paragraph blocks plus the
/// node's full text, used as a fallback.
#[derive(Debug, Clone)]
pub struct CandidateContent {
    /// Paragraph-level blocks found under the node.
    pub blocks: Vec<TextBlock>,
    /// Whitespace-normalised text of the whole node.
    pub full_text: String,
    /// Link density of the whole node.
    pub full_link_density: f32,
}

impl CandidateContent {
    /// Produces the extracted text for the given thresholds.
    ///
    /// Blocks with at least `params.min_block_words` words and a link density
    /// of at most `params.max_block_link_density` are kept and joined with a
    /// blank line. If no block qualifies, the node's full text is returned
    /// when it meets the same two thresholds; otherwise the result is empty.
    pub fn extract(&self, params: &ExtractionParams) -> String {
        // One predicate serves both the per-block filter and the fallback.
        let passes = |words: usize, density: f32| -> bool {
            words >= params.min_block_words && density <= params.max_block_link_density
        };

        let mut kept: Vec<&str> = Vec::new();
        for block in &self.blocks {
            if passes(block.words, block.link_density) {
                kept.push(block.text.as_str());
            }
        }
        if !kept.is_empty() {
            return kept.join("\n\n");
        }

        // Fallback: judge the node as a single block.
        let full_words = self.full_text.split_whitespace().count();
        if passes(full_words, self.full_link_density) {
            self.full_text.clone()
        } else {
            String::new()
        }
    }
}
/// Collects the extractable content of `el`.
///
/// Each `<p>` element selected under `el` becomes one [`TextBlock`] holding
/// its normalised text, word count and link density. The node's own
/// normalised text and link density are recorded alongside the blocks.
pub fn node_content(el: &ElementRef) -> CandidateContent {
    let paragraphs = p_selector();

    let mut blocks: Vec<TextBlock> = Vec::new();
    for paragraph in el.select(&paragraphs) {
        let text = node_text(&paragraph);
        let words = text.split_whitespace().count();
        let density = link_density(&paragraph);
        blocks.push(TextBlock {
            text,
            words,
            link_density: density,
        });
    }

    let full_text = node_text(el);
    let full_link_density = link_density(el);
    CandidateContent {
        blocks,
        full_text,
        full_link_density,
    }
}
/// Convenience wrapper: gathers the content of `el` with [`node_content`] and
/// filters it with [`CandidateContent::extract`] using `params`.
pub fn extract_node_text(el: &ElementRef, params: &ExtractionParams) -> String {
    let content = node_content(el);
    content.extract(params)
}
/// Fraction of the text of `el` that sits inside `<a>` elements.
///
/// Both numerator and denominator are character counts of whitespace-
/// normalised text. Returns `0.0` for a node with no text; otherwise the
/// ratio is clamped to `[0, 1]`.
pub fn link_density(el: &ElementRef) -> f32 {
    let total_chars = node_text(el).chars().count();
    if total_chars == 0 {
        return 0.0;
    }

    let anchors = a_selector();
    let mut anchor_chars: usize = 0;
    for anchor in el.select(&anchors) {
        anchor_chars += node_text(&anchor).chars().count();
    }

    let ratio = anchor_chars as f32 / total_chars as f32;
    ratio.clamp(0.0, 1.0)
}
/// Scores how content-like and how boilerplate-like the `class` and `id`
/// attributes of `el` look.
///
/// Returns `(positive, negative)`. Each score is the number of matching
/// entries from `POSITIVE_HINTS` / `NEGATIVE_HINTS` divided by two and
/// clamped to `[0, 1]`, so two or more hits saturate the score. An element
/// with neither attribute scores `(0.0, 0.0)`.
///
/// Matching is case-insensitive. Hints of three or more characters match
/// anywhere inside the attribute text. Shorter hints (currently only `"ad"`)
/// must equal a whole token, optionally with a plural `s`. Tokens are the
/// runs of alphanumeric characters in the attribute text. Without this rule
/// `"ad"` would fire inside unrelated names such as `header`, `headline`,
/// `breadcrumb`, `loading` or `shadow`, and would double-count `header` and
/// `advert`.
fn class_id_hint_scores(el: &ElementRef) -> (f32, f32) {
    /// Hints at least this long are matched as plain substrings.
    const MIN_SUBSTRING_HINT_LEN: usize = 3;

    let mut haystack = String::new();
    if let Some(class) = el.value().attr("class") {
        haystack.push_str(&class.to_lowercase());
        // Separator so a hint cannot match across the class/id boundary.
        haystack.push(' ');
    }
    if let Some(id) = el.value().attr("id") {
        haystack.push_str(&id.to_lowercase());
    }
    if haystack.is_empty() {
        return (0.0, 0.0);
    }

    // "sidebar-ads top_nav" -> ["sidebar", "ads", "top", "nav"]
    let tokens: Vec<&str> = haystack
        .split(|c: char| !c.is_alphanumeric())
        .filter(|token| !token.is_empty())
        .collect();

    let matches_hint = |hint: &str| -> bool {
        if hint.len() >= MIN_SUBSTRING_HINT_LEN {
            haystack.contains(hint)
        } else {
            tokens
                .iter()
                .any(|token| *token == hint || token.strip_suffix('s') == Some(hint))
        }
    };

    let score = |hints: &[&str]| -> f32 {
        let hits = hints.iter().filter(|hint| matches_hint(**hint)).count();
        (hits as f32 / 2.0).clamp(0.0, 1.0)
    };

    (score(POSITIVE_HINTS), score(NEGATIVE_HINTS))
}
/// Counts `el` plus its chain of element ancestors.
///
/// The walk goes parent by parent and stops at the first parent that is
/// missing or is not an element, so the result is always at least 1.
fn node_depth(el: &ElementRef) -> usize {
    std::iter::successors(Some(*el), |node| node.parent().and_then(ElementRef::wrap)).count()
}
/// Computes the [`NodeFeatures`] vector for `el`.
///
/// `stopwords` is compared against lowercased whitespace-separated tokens,
/// so its entries are expected to be lowercase. Tokens are not stripped of
/// punctuation before the lookup. Every feature is clamped to `[0, 1]`.
pub fn extract_features(el: &ElementRef, stopwords: &HashSet<String>) -> NodeFeatures {
    let text = node_text(el);
    // Lowercase every token once; the stopword and uniqueness ratios share it.
    let lowered: Vec<String> = text.split_whitespace().map(str::to_lowercase).collect();
    let word_count = lowered.len();
    let char_count = text.chars().count();

    // ln(count + 1) / ln(cap), clamped. A count of zero gives exactly 0.0.
    let log_norm = |count: usize, cap: f32| -> f32 {
        ((count as f32 + 1.0).ln() / cap.ln()).clamp(0.0, 1.0)
    };
    // count / word_count, clamped. Defined as 0.0 for a node with no words.
    let per_word = |count: usize| -> f32 {
        if word_count == 0 {
            0.0
        } else {
            (count as f32 / word_count as f32).clamp(0.0, 1.0)
        }
    };

    let word_count_norm = log_norm(word_count, 5000.0);
    let char_count_norm = log_norm(char_count, 40000.0);
    let density = link_density(el);

    let stopword_hits = lowered
        .iter()
        .filter(|word| stopwords.contains(word.as_str()))
        .count();
    let stopword_ratio = per_word(stopword_hits);

    let paragraph_count = el.select(&p_selector()).count();
    let p_count_norm = (paragraph_count as f32 / 30.0).clamp(0.0, 1.0);

    // At least 1, so the division below is always defined.
    let element_count = el
        .descendants()
        .filter(|node| node.value().is_element())
        .count()
        .max(1);
    let words_per_element = word_count as f32 / element_count as f32;
    let text_tag_ratio = (words_per_element / 50.0).clamp(0.0, 1.0);

    let depth_norm = (node_depth(el) as f32 / 30.0).clamp(0.0, 1.0);

    let comma_count = text.chars().filter(|ch| *ch == ',').count();
    let comma_density = if word_count == 0 {
        0.0
    } else {
        // One comma per five words already saturates the feature.
        ((comma_count as f32 / word_count as f32) / 0.2).clamp(0.0, 1.0)
    };

    // One-hot encoding of the tag name; anything unlisted lands in `tag_other`.
    let tag = el.value().name().to_lowercase();
    let one_hot = |name: &str| -> f32 {
        if tag.as_str() == name {
            1.0
        } else {
            0.0
        }
    };
    let tag_article = one_hot("article");
    let tag_main = one_hot("main");
    let tag_section = one_hot("section");
    let tag_div = one_hot("div");
    let tag_other = if matches!(tag.as_str(), "article" | "main" | "section" | "div") {
        0.0
    } else {
        1.0
    };

    let (class_positive, class_negative) = class_id_hint_scores(el);

    let distinct_words = lowered
        .iter()
        .map(String::as_str)
        .collect::<HashSet<&str>>()
        .len();
    let unique_word_ratio = per_word(distinct_words);

    NodeFeatures {
        word_count_norm,
        char_count_norm,
        link_density: density,
        stopword_ratio,
        p_count_norm,
        text_tag_ratio,
        depth_norm,
        comma_density,
        tag_article,
        tag_main,
        tag_section,
        tag_div,
        tag_other,
        class_positive,
        class_negative,
        unique_word_ratio,
    }
}
// Unit tests for feature extraction, heuristic scoring and block filtering.
#[cfg(test)]
mod tests {
use super::*;
use scraper::Html;
// Small lowercase stopword set shared by the feature tests.
fn stopwords() -> HashSet<String> {
["the", "a", "is", "of", "and", "to", "in", "with", "this", "for"]
.into_iter()
.map(|s| s.to_string())
.collect()
}
// Returns the first element in `doc` matching the CSS selector `sel`.
// Panics if the selector is invalid or nothing matches.
fn first_matching<'a>(doc: &'a Html, sel: &str) -> ElementRef<'a> {
doc.select(&Selector::parse(sel).unwrap()).next().unwrap()
}
// `to_vec` must emit exactly `DIM` values.
#[test]
fn to_vec_len_matches_dim() {
assert_eq!(NodeFeatures::zeros().to_vec().len(), NodeFeatures::DIM);
}
// A prose-heavy <article> must outscore a link-only sidebar <div>.
#[test]
fn content_node_scores_higher_than_boilerplate() {
let html = r#"
<html><body>
<article class="post-content">
<p>This is the real article body with a lot of meaningful text content.</p>
<p>It has multiple paragraphs describing important information in detail here.</p>
<p>Readers expect substantial prose and varied vocabulary throughout the piece.</p>
</article>
<div class="sidebar-ads">
<a href="/1">Link one</a> <a href="/2">Link two</a> <a href="/3">Link three</a>
</div>
</body></html>
"#;
let doc = Html::parse_document(html);
let sw = stopwords();
let article = first_matching(&doc, "article");
let sidebar = first_matching(&doc, "div");
let art_feat = extract_features(&article, &sw);
let side_feat = extract_features(&sidebar, &sw);
// Individual features must point the expected way before comparing scores.
assert!(art_feat.link_density < side_feat.link_density);
assert!(art_feat.word_count_norm > side_feat.word_count_norm);
assert!(art_feat.class_positive > 0.0);
assert!(side_feat.class_negative > 0.0);
assert!(
art_feat.heuristic_content_score() > side_feat.heuristic_content_score(),
"article {} should beat sidebar {}",
art_feat.heuristic_content_score(),
side_feat.heuristic_content_score()
);
}
// Structurally different nodes must not collapse to the same feature vector.
#[test]
fn different_nodes_yield_different_features() {
let html = r#"
<html><body>
<article><p>Alpha beta gamma delta epsilon zeta eta theta iota kappa.</p></article>
<div class="nav"><a href="/x">x</a></div>
</body></html>
"#;
let doc = Html::parse_document(html);
let sw = stopwords();
let a = extract_features(&first_matching(&doc, "article"), &sw);
let d = extract_features(&first_matching(&doc, "div"), &sw);
assert_ne!(a.to_vec(), d.to_vec(), "distinct nodes must have distinct features");
}
// Raising `min_block_words` must drop the two-word paragraph from the output.
#[test]
fn extraction_params_change_extracted_text() {
let html = r#"
<html><body>
<article>
<p>Short one.</p>
<p>This paragraph is clearly long enough to survive a high minimum word threshold filter.</p>
</article>
</body></html>
"#;
let doc = Html::parse_document(html);
let article = first_matching(&doc, "article");
let lenient = ExtractionParams { min_block_words: 1, max_block_link_density: 0.9 };
let strict = ExtractionParams { min_block_words: 6, max_block_link_density: 0.9 };
let lenient_text = extract_node_text(&article, &lenient);
let strict_text = extract_node_text(&article, &strict);
assert!(lenient_text.contains("Short one"));
assert!(!strict_text.contains("Short one"));
assert_ne!(lenient_text, strict_text);
}
// The extreme normalized inputs must map inside the documented ranges,
// and a larger input must give a larger word threshold.
#[test]
fn normalized_params_map_into_range() {
let lo = ExtractionParams::from_normalized(&[-1.0, -1.0]);
let hi = ExtractionParams::from_normalized(&[1.0, 1.0]);
assert!(lo.min_block_words >= 1);
assert!(hi.min_block_words <= 40);
assert!(lo.max_block_link_density >= 0.1);
assert!(hi.max_block_link_density <= 0.9);
assert!(hi.min_block_words > lo.min_block_words);
}
}