use ego_tree::NodeId;
use scraper::ElementRef;
use std::collections::HashMap;
use xxhash_rust::xxh3::xxh3_64;
pub trait ElementChildren<'a> {
fn element_children(self) -> Box<dyn Iterator<Item = ElementRef<'a>> + 'a>;
}
impl<'a> ElementChildren<'a> for ElementRef<'a> {
fn element_children(self) -> Box<dyn Iterator<Item = ElementRef<'a>> + 'a> {
Box::new(self.children().filter_map(ElementRef::wrap))
}
}
pub fn text_char_len(el: ElementRef<'_>) -> usize {
el.text().map(|s| s.chars().count()).sum()
}
pub fn link_char_len(el: ElementRef<'_>) -> usize {
let sel = scraper::Selector::parse("a").unwrap();
el.select(&sel)
.map(|a| a.text().map(|s| s.chars().count()).sum::<usize>())
.sum()
}
pub fn non_link_char_ratio(el: ElementRef<'_>) -> f64 {
let total = text_char_len(el);
if total == 0 {
return 1.0;
}
let link = link_char_len(el);
let non = total.saturating_sub(link);
(non as f64 / total as f64).clamp(0.0, 1.0)
}
pub fn node_signature(el: ElementRef<'_>, depth: u8, memo: &mut HashMap<NodeId, u64>) -> u64 {
let id = el.id();
if let Some(cached) = memo.get(&id) {
return *cached;
}
let txt: String = el.text().collect();
if txt.trim().is_empty() {
memo.insert(id, 0);
return 0;
}
let kids: Vec<&str> = el.element_children().map(|c| c.value().name()).collect();
let bucket: u8 = match kids.len() {
0 => 0,
1 => 1,
2..=4 => 2,
5..=9 => 3,
_ => 4,
};
let mut buf = String::with_capacity(64 + kids.len() * 8);
buf.push_str(el.value().name());
buf.push('|');
for k in &kids {
buf.push_str(k);
buf.push(',');
}
buf.push('|');
buf.push((b'0' + bucket) as char);
let mut sig = xxh3_64(buf.as_bytes());
if depth > 0 {
for (idx, child) in el.element_children().enumerate() {
let rot = (((idx as u64).wrapping_mul(11)) ^ 7) % 64;
let csig = node_signature(child, depth - 1, memo);
sig ^= csig.rotate_left(rot as u32);
}
}
memo.insert(id, sig);
sig
}
pub fn count_repeating_shape_siblings(el: ElementRef<'_>) -> usize {
let mut memo: HashMap<NodeId, u64> = HashMap::new();
let mut groups: HashMap<u64, usize> = HashMap::new();
for child in el.element_children() {
let sig = node_signature(child, 2, &mut memo);
if sig == 0 {
continue;
}
*groups.entry(sig).or_insert(0) += 1;
}
groups.values().copied().max().unwrap_or(0)
}
pub fn has_paragraph_island(el: ElementRef<'_>, min_chars: usize) -> bool {
el.element_children()
.filter(|c| c.value().name().eq_ignore_ascii_case("p"))
.any(|p| {
let total = text_char_len(p);
let link = link_char_len(p);
total.saturating_sub(link) >= min_chars
})
}
pub fn in_reference_section(el: ElementRef<'_>) -> bool {
let mut cur = el.parent();
while let Some(node) = cur {
if let Some(parent_el) = ElementRef::wrap(node) {
let v = parent_el.value();
if let Some(role) = v.attr("role")
&& role.eq_ignore_ascii_case("doc-bibliography")
{
return true;
}
let class = v.attr("class").unwrap_or("");
let id = v.attr("id").unwrap_or("");
let blob = format!("{class} {id}").to_ascii_lowercase();
if blob.contains("references")
|| blob.contains("ref-list")
|| blob.contains("citation-list")
|| blob.contains("bibliography")
{
return true;
}
if v.name() == "section"
&& let Some(h) = parent_el
.select(&scraper::Selector::parse("h1,h2,h3,h4").unwrap())
.next()
{
let head: String = h.text().collect::<String>().trim().to_lowercase();
const KW: &[&str] = &[
"references",
"reference",
"bibliography",
"works cited",
"citations",
"further reading",
"参考文献",
"引用文献",
"참고문헌",
"참고 자료",
"литература",
"библиография",
"literatur",
"literaturverzeichnis",
"références",
"bibliographie",
"参考资料",
];
if KW.iter().any(|k| head.starts_with(k) || head == *k) {
return true;
}
}
}
cur = node.parent();
}
false
}
pub fn is_listing_container(_el: ElementRef<'_>) -> bool {
false
}
pub fn detach_repeating_subtrees(container: ElementRef<'_>) -> Option<String> {
let mut memo: HashMap<NodeId, u64> = HashMap::new();
let mut groups: HashMap<u64, Vec<ElementRef<'_>>> = HashMap::new();
for child in container.element_children() {
let sig = node_signature(child, 2, &mut memo);
if sig == 0 {
continue;
}
groups.entry(sig).or_default().push(child);
}
let dominant = groups.into_values().max_by_key(|v| v.len())?;
if dominant.len() < 5 {
return None;
}
let mut html = container.html();
for el in dominant {
let snippet = el.html();
html = html.replacen(&snippet, "", 1);
}
Some(html)
}
#[cfg(test)]
mod tests {
use super::*;
use scraper::{Html, Selector};
fn root(html: &str, sel: &str) -> Html {
let _ = sel;
Html::parse_document(html)
}
#[test]
fn paragraph_island_detected_above_threshold() {
let html = format!(
"<!doctype html><html><body><div><p>{}</p></div></body></html>",
"long ".repeat(120)
);
let doc = root(&html, "div");
let div = doc.select(&Selector::parse("div").unwrap()).next().unwrap();
assert!(has_paragraph_island(div, 400));
}
#[test]
fn paragraph_island_skipped_when_short() {
let html = "<!doctype html><html><body><div><p>hi</p></div></body></html>";
let doc = root(html, "div");
let div = doc.select(&Selector::parse("div").unwrap()).next().unwrap();
assert!(!has_paragraph_island(div, 400));
}
#[test]
fn detach_repeating_returns_shorter_html() {
let mut html = String::from("<!doctype html><html><body><section>");
for i in 0..10 {
html.push_str(&format!(
"<article class=\"card\"><a href=\"/p/{i}\"><h3>Title {i} of the day</h3></a></article>"
));
}
html.push_str("</section></body></html>");
let doc = Html::parse_document(&html);
let sec = doc
.select(&Selector::parse("section").unwrap())
.next()
.unwrap();
let detached = detach_repeating_subtrees(sec).expect("should detach");
assert!(detached.len() < sec.html().len());
assert!(!detached.contains("Title 5"));
}
}