use tl::{ParserOptions, VDom};
use super::selectors::SEL_MAIN_CONTENT;
/// Removes every element matched by any selector in `tags` from `html`.
///
/// The input is parsed with `tl` and the removal is carried out on the DOM's
/// re-serialized markup (`dom.outer_html()`), so the returned string is that
/// serialized form with the matched elements cut out.
///
/// Returns `html` unchanged when parsing fails or when no matched element could be
/// located in the serialized markup.
///
/// Elements are located by searching the serialized document for their own
/// serialized markup. Byte-identical elements matched by the same selector are
/// mapped to successive occurrences, and overlapping ranges (an element nested
/// inside another removed element, or one element matched by several selectors)
/// are merged before anything is removed.
///
/// NOTE(review): because lookup is textual, a selector that matches only some of
/// several byte-identical elements may remove an earlier, unmatched twin instead.
pub(crate) fn apply_remove_tags(html: &str, tags: &[String]) -> String {
    let Ok(dom) = tl::parse(html, ParserOptions::default()) else {
        return html.to_owned();
    };
    let parser = dom.parser();
    let serialized = dom.outer_html();

    let mut ranges: Vec<(usize, usize)> = Vec::new();
    for tag in tags {
        let Some(matches) = dom.query_selector(tag) else {
            continue;
        };
        // Per selector: where to resume searching for each distinct fragment, so that
        // repeated identical elements resolve to successive occurrences instead of all
        // collapsing onto the first one.
        let mut resume_at: std::collections::HashMap<String, usize> =
            std::collections::HashMap::new();
        for handle in matches {
            let Some(node) = handle.get(parser) else {
                continue;
            };
            let fragment = node.outer_html(parser).to_string();
            if fragment.is_empty() {
                // An empty fragment would describe an empty range; nothing to remove.
                continue;
            }
            let from = resume_at.get(&fragment).copied().unwrap_or(0);
            let Some(offset) = serialized
                .get(from..)
                .and_then(|rest| rest.find(&fragment))
            else {
                continue;
            };
            let start = from + offset;
            let end = start + fragment.len();
            ranges.push((start, end));
            resume_at.insert(fragment, end);
        }
    }

    if ranges.is_empty() {
        return html.to_owned();
    }

    // Walk the ranges in ascending order and copy only the gaps between them.
    // Tracking the furthest `end` seen so far merges nested/overlapping ranges, so a
    // removed child never shifts or extends the range of its removed ancestor.
    ranges.sort_unstable();
    let mut output = String::with_capacity(serialized.len());
    let mut cursor = 0;
    for (start, end) in ranges {
        if start > cursor {
            // Both bounds come from `str::find` offsets plus a fragment length, so
            // they always fall on char boundaries inside `serialized`.
            output.push_str(&serialized[cursor..start]);
        }
        cursor = cursor.max(end);
    }
    output.push_str(&serialized[cursor..]);
    output
}
/// Counts the whitespace-separated words in the text of the document's `<body>`.
///
/// Only the first node matched by the `body` selector is considered. Returns `0`
/// when the selector yields no node or the matched handle cannot be resolved.
pub(crate) fn compute_word_count(dom: &VDom<'_>) -> usize {
    let parser = dom.parser();

    let Some(mut bodies) = dom.query_selector("body") else {
        return 0;
    };
    let Some(body) = bodies.next().and_then(|handle| handle.get(parser)) else {
        return 0;
    };

    let text = body.inner_text(parser);
    text.split_whitespace().count()
}
/// Extracts the markup of the first element matching `SEL_MAIN_CONTENT`.
///
/// Falls back to returning `html` unchanged when parsing fails, when the selector
/// matches nothing, or when the first match does not resolve to a tag node.
pub(crate) fn extract_main_content(html: &str) -> String {
    let Ok(dom) = tl::parse(html, ParserOptions::default()) else {
        return html.to_owned();
    };
    let parser = dom.parser();

    // Resolve the first selector hit down to a tag, bailing out to `None` at any step.
    let main_tag = dom
        .query_selector(SEL_MAIN_CONTENT)
        .and_then(|mut candidates| candidates.next())
        .and_then(|handle| handle.get(parser))
        .and_then(|node| node.as_tag());

    match main_tag {
        Some(tag) => tag.outer_html(parser),
        None => html.to_owned(),
    }
}