use crate::dom::{Document, NodeId};
use crate::options::{ExtractionFocus, Options};
use crate::selector::query_all;
use crate::selector::Rule;
use crate::settings::{
ALLOWED_ATTRIBUTES, ELEMENT_WITH_SIZE_ATTR, EMPTY_TAGS_TO_REMOVE, TAGS_TO_CLEAN, TAGS_TO_STRIP,
XML_GRAPHIC_TAGS, XML_LB_TAGS, XML_QUOTE_TAGS,
};
use crate::utils::is_image_element;
use crate::utils::lru::LruCache;
use crate::utils::text::{duplicate_test, text_chars_test, text_filter};
use crate::utils::trim;
/// Document-level cleaning pass: strips unwanted tags, removes comment
/// nodes, and prunes empty elements, honoring the extraction options.
pub(crate) fn doc_cleaning(doc: &mut Document, opts: &Options) {
    let mut tags_to_clean: std::collections::HashSet<&'static str> = TAGS_TO_CLEAN.clone();
    let mut tags_to_strip: std::collections::HashSet<&'static str> = TAGS_TO_STRIP.clone();
    if opts.exclude_tables {
        // Tables are unwanted: schedule every table-related tag for cleaning.
        tags_to_clean.extend(["table", "td", "th", "tr"]);
    } else {
        // Tables are kept: demote <figure> wrappers that contain a table to
        // plain <div> so figure cleaning does not sweep the table away.
        for figure_id in doc.query_selector_all(doc.root(), "figure") {
            let wraps_table = doc
                .get_elements_by_tag_name(figure_id, "*")
                .iter()
                .any(|&descendant| doc.tag_name(descendant) == "table");
            if wraps_table {
                doc.set_tag_name(figure_id, "div");
            }
        }
    }
    if opts.include_images {
        // Image-bearing containers must survive when images are requested.
        for tag in ["figure", "picture", "source"] {
            tags_to_clean.remove(tag);
        }
        tags_to_strip.remove("img");
    }
    let strip_list: Vec<&str> = tags_to_strip.iter().copied().collect();
    doc.strip_tags(doc.root(), &strip_list);
    let clean_list: Vec<&str> = tags_to_clean.iter().copied().collect();
    if opts.focus == ExtractionFocus::FavorRecall
        && !doc.get_elements_by_tag_name(doc.root(), "p").is_empty()
    {
        // Recall mode: roll the cleaning back if it wiped out every <p>.
        let backup = doc.clone_document();
        doc.strip_elements(doc.root(), false, &clean_list);
        if doc.get_elements_by_tag_name(doc.root(), "p").is_empty() {
            *doc = backup;
        }
    } else {
        doc.strip_elements(doc.root(), false, &clean_list);
    }
    remove_html_comment_nodes(doc);
    prune_html(doc, opts);
}
pub(crate) fn remove_html_comment_nodes(doc: &mut Document) {
let comment_ids: Vec<NodeId> = doc.collect_comment_nodes(doc.root());
for id in comment_ids.into_iter().rev() {
doc.remove_comment(id);
}
}
/// Removes childless elements whose tag is listed in `EMPTY_TAGS_TO_REMOVE`.
/// The tail text of a removed node is preserved except in precision mode.
pub(crate) fn prune_html(doc: &mut Document, opts: &Options) {
    let preserve_tail = !matches!(opts.focus, ExtractionFocus::FavorPrecision);
    let candidates = doc.get_elements_by_tag_name(doc.root(), "*");
    // Walk deepest-last elements first so removals don't invalidate pending ids.
    for &node in candidates.iter().rev() {
        if EMPTY_TAGS_TO_REMOVE.contains(doc.tag_name(node)) && doc.child_nodes(node).is_empty() {
            doc.remove(node, preserve_tail);
        }
    }
}
/// Returns a copy of `doc` with every element matching `rules` removed.
///
/// The tail text of each removed node is re-attached to its previous element
/// sibling (or, failing that, its parent) so no inter-element text is lost.
/// With `with_backup`, the untouched original is returned instead whenever
/// pruning would delete more than 6/7 of the document's text.
pub(crate) fn prune_unwanted_nodes(doc: &Document, rules: &[Rule], with_backup: bool) -> Document {
    // Snapshot the document and its text length before mutating, so an
    // over-aggressive prune can be rolled back.
    let mut snapshot: Option<(Document, usize)> = None;
    if with_backup {
        let original_len = doc.iter_text(doc.root(), " ").chars().count();
        snapshot = Some((doc.clone_document(), original_len));
    }
    let mut pruned = doc.clone_document();
    let root = pruned.root();
    let targets = query_all(&pruned, root, rules);
    for &node in targets.iter().rev() {
        // Re-home the node's tail text before deleting the node itself.
        let tail = pruned.tail(node);
        if !tail.is_empty() {
            let anchor = pruned.prev_element_sibling(node).or_else(|| pruned.parent(node));
            if let Some(anchor_id) = anchor {
                let existing = pruned.tail(anchor_id);
                let merged = if existing.is_empty() {
                    tail
                } else {
                    format!("{existing} {tail}")
                };
                pruned.set_tail(anchor_id, &merged);
            }
        }
        pruned.remove(node, false);
    }
    if let Some((original, original_len)) = snapshot {
        let remaining_len = pruned.iter_text(pruned.root(), " ").chars().count();
        // Keep the original when pruning removed the vast majority of text.
        if remaining_len <= original_len / 7 {
            return original;
        }
    }
    pruned
}
/// Normalizes a text-bearing node in place and decides whether to keep it.
///
/// Returns `Some(id)` when the node should be kept and `None` when it should
/// be dropped (empty, rejected by the text filter, or a detected duplicate).
/// Elements in `XML_GRAPHIC_TAGS` that pass `is_image_element` are always
/// kept. Elements in `XML_LB_TAGS` are kept with only their tail trimmed,
/// unless `fix_comments` is set.
pub(crate) fn handle_text_node(
    doc: &mut Document,
    id: NodeId,
    cache: &mut LruCache,
    fix_comments: bool,
    preserve_spaces: bool,
    opts: &Options,
) -> Option<NodeId> {
    let tag = doc.tag_name(id).to_string();
    // Genuine image elements are kept unconditionally.
    if XML_GRAPHIC_TAGS.contains(tag.as_str()) && is_image_element(doc, id) {
        return Some(id);
    }
    let text = doc.text(id);
    let tail = doc.tail(id);
    let children = doc.children(id);
    // "done" presumably marks nodes already consumed elsewhere — TODO confirm;
    // nodes with no children, no text and no tail carry nothing either way.
    if tag == "done" || (children.is_empty() && text.is_empty() && tail.is_empty()) {
        return None;
    }
    if !fix_comments && XML_LB_TAGS.contains(tag.as_str()) {
        // Line-break-style elements only need their tail whitespace trimmed.
        if !preserve_spaces {
            let trimmed = trim(&tail);
            doc.set_tail(id, &trimmed);
        }
        return Some(id);
    }
    // No mutation has happened on this path (both earlier branches return),
    // so `text`/`tail` read above are still current — no need to re-read.
    // Promote a tail-only leaf node's tail into its text slot.
    if text.is_empty() && children.is_empty() {
        doc.set_text(id, &tail);
        doc.set_tail(id, "");
        // In comment-fixing mode, a line break that now carries text becomes
        // a paragraph.
        if fix_comments && XML_LB_TAGS.contains(tag.as_str()) {
            doc.set_tag_name(id, "p");
        }
    }
    if !preserve_spaces {
        let t = trim(&doc.text(id));
        doc.set_text(id, &t);
        let tl = trim(&doc.tail(id));
        doc.set_tail(id, &tl);
    }
    // Drop nodes whose (possibly trimmed) text is empty and that the generic
    // text filter rejects.
    let text = doc.text(id);
    if text.is_empty() && text_filter(doc, id) {
        return None;
    }
    // Optionally drop content already seen (deduplication cache).
    if opts.deduplicate && duplicate_test(doc, id, cache, opts) {
        return None;
    }
    Some(id)
}
/// Heuristically decides whether `element` is link-dominated (likely
/// navigation/boilerplate rather than content).
///
/// Returns the element's non-empty `<a>` descendants together with a flag
/// that is true when the text is mostly link text.
pub(crate) fn link_density_test(
    doc: &Document,
    element: NodeId,
    opts: &Options,
) -> (Vec<NodeId>, bool) {
    let links = doc.get_elements_by_tag_name(element, "a");
    if links.is_empty() {
        return (Vec::new(), false);
    }
    let text_length = trim(&doc.text_content(element)).chars().count();
    if links.len() == 1 {
        // Single-link shortcut: one long link covering >90% of the text
        // flags the element straight away.
        let threshold: usize = match opts.focus {
            ExtractionFocus::FavorPrecision => 10,
            _ => 100,
        };
        let link_len = trim(&doc.text_content(links[0])).chars().count();
        if link_len > threshold && link_len as f64 > text_length as f64 * 0.9 {
            return (Vec::new(), true);
        }
    }
    // Only short elements get the full density check; the length cap depends
    // on the tag and on whether the element is the last of its siblings.
    let is_last_sibling = doc.next_element_sibling(element).is_none();
    let limit_length: usize = match (doc.tag_name(element) == "p", is_last_sibling) {
        (true, true) => 60,
        (true, false) => 30,
        (false, true) => 300,
        (false, false) => 100,
    };
    if text_length < limit_length {
        let (link_length, n_short_links, non_empty_links) = collect_link_info(doc, &links);
        let n_non_empty = non_empty_links.len();
        if n_non_empty == 0 {
            // Every link is empty: clearly not content.
            return (non_empty_links, true);
        }
        tracing::debug!("list link text/total: {}/{}", link_length, text_length);
        tracing::debug!("short elems/total: {}/{}", n_short_links, n_non_empty);
        let mostly_link_text = link_length as f64 > text_length as f64 * 0.8;
        let mostly_short_links =
            n_non_empty > 1 && n_short_links as f64 / n_non_empty as f64 > 0.8;
        if mostly_link_text || mostly_short_links {
            return (non_empty_links, true);
        }
    }
    (Vec::new(), false)
}
/// Table-specific link-density check: true when the table's text is long
/// enough to matter (≥200 chars) but dominated by link text.
pub(crate) fn link_density_test_tables(doc: &Document, table: NodeId, _opts: &Options) -> bool {
    let links = doc.get_elements_by_tag_name(table, "a");
    if links.is_empty() {
        return false;
    }
    let text_length = trim(&doc.text_content(table)).chars().count();
    // Very short tables are never flagged.
    if text_length < 200 {
        return false;
    }
    let (link_length, _, non_empty_links) = collect_link_info(doc, &links);
    if non_empty_links.is_empty() {
        // Every link is empty: treat the table as boilerplate.
        return true;
    }
    tracing::debug!("table link text: {} / total: {}", link_length, text_length);
    // Longer tables tolerate a higher share of link text.
    let ratio = if text_length < 1000 { 0.8 } else { 0.5 };
    link_length as f64 > text_length as f64 * ratio
}
/// Gathers link statistics: total visible link-text length, the number of
/// short links (<10 chars), and the subset of links that carry any text.
pub(crate) fn collect_link_info(doc: &Document, links: &[NodeId]) -> (usize, usize, Vec<NodeId>) {
    let mut total_length = 0usize;
    let mut short_count = 0usize;
    let mut non_empty = Vec::new();
    for &link in links {
        let len = trim(&doc.text_content(link)).chars().count();
        // Links without visible text contribute nothing.
        if len == 0 {
            continue;
        }
        total_length += len;
        short_count += usize::from(len < 10);
        non_empty.push(link);
    }
    (total_length, short_count, non_empty)
}
/// Trims a node's text and tail in place, relocates tail-only content into
/// the text slot (except for `XML_LB_TAGS`), then applies the text filter
/// and the duplicate check. Returns `None` when the node should be dropped.
pub(crate) fn process_node(
    doc: &mut Document,
    id: NodeId,
    cache: &mut LruCache,
    opts: &Options,
) -> Option<NodeId> {
    let tag = doc.tag_name(id).to_string();
    let raw_text = doc.text(id);
    let raw_tail = doc.tail(id);
    // "done" presumably marks nodes consumed elsewhere — TODO confirm;
    // completely empty nodes are discarded outright.
    if tag == "done"
        || (doc.children(id).is_empty() && raw_text.is_empty() && raw_tail.is_empty())
    {
        return None;
    }
    // Track the trimmed values in locals; they mirror what is written back
    // to the document, so no re-reads are needed.
    let mut text = trim(&raw_text);
    let mut tail = trim(&raw_tail);
    doc.set_text(id, &text);
    doc.set_tail(id, &tail);
    // For non-line-break tags, promote tail-only content into the text slot.
    if !XML_LB_TAGS.contains(tag.as_str()) && text.is_empty() && !tail.is_empty() {
        doc.set_text(id, &tail);
        doc.set_tail(id, "");
        text = tail;
        tail = String::new();
    }
    if !text.is_empty() || !tail.is_empty() {
        if text_filter(doc, id) {
            return None;
        }
        if opts.deduplicate && duplicate_test(doc, id, cache, opts) {
            return None;
        }
    }
    Some(id)
}
/// Final cleanup: strips empty non-void elements, then prunes attributes
/// down to an allow-list (size attributes survive only on tags listed in
/// `ELEMENT_WITH_SIZE_ATTR`).
pub(crate) fn post_cleaning(doc: &mut Document) {
    // Pass 1: drop childless, textless, non-void elements, deepest-last first.
    let all_elements = doc.get_elements_by_tag_name(doc.root(), "*");
    for &id in all_elements.iter().rev() {
        let removable = doc.children(id).is_empty()
            && !text_chars_test(&doc.text(id))
            && !doc.is_void_element(id);
        if removable {
            doc.strip(id);
        }
    }
    // Pass 2: re-query (pass 1 mutated the tree) and prune attributes.
    let surviving = doc.get_elements_by_tag_name(doc.root(), "*");
    for &id in &surviving {
        let tag = doc.tag_name(id).to_string();
        let size_allowed = ELEMENT_WITH_SIZE_ATTR.contains(tag.as_str());
        for attr in doc.attribute_names(id) {
            let keep = match attr.as_str() {
                // Presentational attributes are always removed.
                "id" | "class" | "align" | "background" | "bgcolor" | "border" | "cellpadding"
                | "cellspacing" | "frame" | "hspace" | "rules" | "style" | "valign"
                | "vspace" => false,
                // Dimensions survive only on tags that legitimately size.
                "width" | "height" => size_allowed,
                name => ALLOWED_ATTRIBUTES.contains(name),
            };
            if !keep {
                doc.remove_attribute(id, &attr);
            }
        }
    }
}
/// Deletes elements under `subtree` whose link density marks them as
/// navigation/boilerplate.
///
/// With `backtracking`, short elements (below `threshold` chars) that have
/// non-empty links and at least `n_child_limit` children are deleted too.
/// `tag_names` restricts the scan to those tags; empty means all elements.
pub(crate) fn delete_by_link_density(
    doc: &mut Document,
    subtree: NodeId,
    opts: &Options,
    backtracking: bool,
    tag_names: &[&str],
) {
    // Precision mode is stricter: higher text cap, lower child requirement.
    let (threshold, n_child_limit): (usize, usize) =
        if opts.focus == ExtractionFocus::FavorPrecision {
            (200, 1)
        } else {
            (100, 3)
        };
    // When `tag_names` is empty, passing it through is literally passing an
    // empty slice, so the old `if tag_names.is_empty() { doc.iter(.., &[]) }`
    // special case was redundant.
    let elements = doc.iter(subtree, tag_names);
    let mut to_delete: Vec<NodeId> = Vec::new();
    for &elem in &elements {
        let (non_empty_links, is_high_density) = link_density_test(doc, elem, opts);
        if is_high_density {
            to_delete.push(elem);
        } else if backtracking && !non_empty_links.is_empty() {
            // Backtracking: also delete short, link-bearing, multi-child nodes.
            let text_length = trim(&doc.text_content(elem)).chars().count();
            if text_length > 0
                && text_length < threshold
                && doc.children(elem).len() >= n_child_limit
            {
                to_delete.push(elem);
            }
        }
    }
    // Remove in reverse document order so pending ids stay valid.
    for &id in to_delete.iter().rev() {
        doc.remove(id, false);
    }
}
pub(crate) fn convert_tags(doc: &mut Document, opts: &Options) {
if !opts.include_links {
let css_selector = if opts.exclude_tables {
"div a, ul a, ol a, dl a, p a".to_string()
} else {
"div a, ul a, ol a, dl a, p a, table a".to_string()
};
let important_links = doc.query_selector_all(doc.root(), &css_selector);
for &id in &important_links {
doc.set_tag_name(id, "protected-a");
}
doc.strip_tags(doc.root(), &["a"]);
for &id in &important_links {
doc.set_tag_name(id, "a");
}
} else {
let links = doc.query_selector_all(doc.root(), "a");
for &id in &links {
let href = trim(&doc.get_attribute(id, "href").unwrap_or_default());
let target = trim(&doc.get_attribute(id, "target").unwrap_or_default());
doc.clear_attributes(id);
if !href.is_empty() {
let abs_href =
crate::utils::url::create_absolute_url(&href, opts.original_url.as_ref());
doc.set_attribute(id, "href", &abs_href);
}
if !target.is_empty() {
let abs_target =
crate::utils::url::create_absolute_url(&target, opts.original_url.as_ref());
doc.set_attribute(id, "target", &abs_target);
}
}
}
let quote_tags: Vec<&str> = XML_QUOTE_TAGS.iter().copied().collect();
let quote_elems = doc.iter(doc.root(), "e_tags);
for &id in "e_elems {
let mut code_flag = false;
if doc.tag_name(id) == "pre" {
let ch = doc.children(id);
if ch.len() == 1 && doc.tag_name(ch[0]) == "span" {
code_flag = true;
}
}
let hljs_elems = doc.query_selector_all(id, r#"span[class*=" hljs"], span[class^="hljs"]"#);
if !hljs_elems.is_empty() {
code_flag = true;
for &hljs_id in &hljs_elems {
doc.clear_attributes(hljs_id);
}
}
if code_flag {
doc.set_tag_name(id, "code");
}
}
}
#[cfg(test)]
mod tests {
    //! Unit tests for the cleaning/pruning pipeline.
    use super::*;
    use crate::dom::Document;
    use crate::options::Options;

    // Helper so each test can parse an inline HTML fixture.
    fn parse(html: &str) -> Document {
        Document::parse(html)
    }

    // <script> is cleaned away; surrounding content survives.
    #[test]
    fn test_doc_cleaning_removes_script() {
        let mut doc = parse(r#"<html><body><script>alert(1)</script><p>text</p></body></html>"#);
        doc_cleaning(&mut doc, &Options::default());
        assert!(doc.query_selector(doc.root(), "script").is_none());
        assert!(doc.query_selector(doc.root(), "p").is_some());
    }

    // <footer> is in the cleaning list by default.
    #[test]
    fn test_doc_cleaning_removes_footer() {
        let mut doc = parse(r#"<html><body><footer>nav</footer><p>article</p></body></html>"#);
        doc_cleaning(&mut doc, &Options::default());
        assert!(doc.query_selector(doc.root(), "footer").is_none());
    }

    // Stripped tags lose the element but keep their text content.
    #[test]
    fn test_doc_cleaning_strips_abbr() {
        let mut doc = parse(r#"<html><body><p><abbr>stuff</abbr></p></body></html>"#);
        doc_cleaning(&mut doc, &Options::default());
        assert!(doc.query_selector(doc.root(), "abbr").is_none());
        let body = doc.body().unwrap();
        assert!(doc.text_content(body).contains("stuff"));
    }

    // include_images removes "figure" from the cleaning list.
    #[test]
    fn test_doc_cleaning_include_images_keeps_figure() {
        let opts = Options {
            include_images: true,
            ..Options::default()
        };
        let mut doc = parse(r#"<html><body><figure><img src="x.jpg"/></figure></body></html>"#);
        doc_cleaning(&mut doc, &opts);
        assert!(doc.query_selector(doc.root(), "figure").is_some());
    }

    // exclude_tables adds all table tags to the cleaning list.
    #[test]
    fn test_doc_cleaning_exclude_tables() {
        let opts = Options {
            exclude_tables: true,
            ..Options::default()
        };
        let mut doc =
            parse(r#"<html><body><table><tr><td>data</td></tr></table><p>text</p></body></html>"#);
        doc_cleaning(&mut doc, &opts);
        assert!(doc.query_selector(doc.root(), "table").is_none());
    }

    // Childless <div> is an empty removable element.
    #[test]
    fn test_prune_html_removes_empty_div() {
        let mut doc = parse(r#"<html><body><div></div><p>text</p></body></html>"#);
        prune_html(&mut doc, &Options::default());
        assert!(doc.query_selector(doc.root(), "div").is_none());
    }

    // Discard rules remove matching nodes and leave the rest intact.
    #[test]
    fn test_prune_unwanted_nodes_removes_matched() {
        use crate::selector::discard::OVERALL_DISCARDED_CONTENT;
        let doc = parse(r#"<html><body><div class="footer">foot</div><p>text</p></body></html>"#);
        let result = prune_unwanted_nodes(&doc, OVERALL_DISCARDED_CONTENT, false);
        assert!(result.query_selector(result.root(), "div").is_none());
        assert!(result.query_selector(result.root(), "p").is_some());
    }

    // Tail text following a pruned node must be re-homed, not lost.
    #[test]
    fn test_prune_unwanted_nodes_preserves_tail_text() {
        use crate::selector::discard::OVERALL_DISCARDED_CONTENT;
        let doc = parse(
            r#"<html><body><p>before</p><div class="footer">nav</div>after footer</body></html>"#,
        );
        let result = prune_unwanted_nodes(&doc, OVERALL_DISCARDED_CONTENT, false);
        let body = result.body().unwrap();
        assert!(result.text_content(body).contains("after footer"));
    }

    // Pruning everything triggers the 1/7-text rollback to the backup.
    #[test]
    fn test_prune_unwanted_nodes_backup_restored_on_large_removal() {
        use crate::selector::discard::OVERALL_DISCARDED_CONTENT;
        let doc = parse(
            r#"<html><body><div class="footer">a lot of text here that will be counted as content and removed by the discard rules because it has footer class</div></body></html>"#,
        );
        let result = prune_unwanted_nodes(&doc, OVERALL_DISCARDED_CONTENT, true);
        assert!(result.query_selector(result.root(), "div").is_some());
    }

    // Disallowed attributes are stripped by the allow-list pass.
    #[test]
    fn test_post_cleaning_strips_disallowed_attrs() {
        let mut doc =
            parse(r#"<html><body><p class="foo" style="color:red">text</p></body></html>"#);
        post_cleaning(&mut doc);
        let p = doc.query_selector(doc.root(), "p").unwrap();
        assert!(doc.get_attribute(p, "class").is_none());
        assert!(doc.get_attribute(p, "style").is_none());
    }

    // Empty non-void elements are stripped.
    #[test]
    fn test_post_cleaning_removes_empty_span() {
        let mut doc = parse(r#"<html><body><p><span></span>text</p></body></html>"#);
        post_cleaning(&mut doc);
        assert!(doc.query_selector(doc.root(), "span").is_none());
    }

    // A paragraph consisting solely of one link is link-dominated.
    #[test]
    fn test_link_density_test_high_density() {
        let doc = parse(r#"<html><body><p><a href="/1">link</a></p></body></html>"#);
        let body = doc.body().unwrap();
        let p = doc.query_selector(body, "p").unwrap();
        let (_, is_dense) = link_density_test(&doc, p, &Options::default());
        assert!(is_dense);
    }

    // A minor inline link in a long paragraph is not flagged.
    #[test]
    fn test_link_density_test_normal() {
        let doc = parse(
            r#"<html><body><p>This is a long paragraph with some text and <a href="x">one link</a> that is not dominant.</p></body></html>"#,
        );
        let body = doc.body().unwrap();
        let p = doc.query_selector(body, "p").unwrap();
        let (_, is_dense) = link_density_test(&doc, p, &Options::default());
        assert!(!is_dense);
    }

    // Without include_links: bare links are stripped (text kept), but links
    // inside protected containers like <p> survive via the rename trick.
    #[test]
    fn test_convert_tags_strips_links_when_not_included() {
        let opts = Options {
            include_links: false,
            ..Options::default()
        };
        let mut doc = parse(
            r#"<html><body><p><a href="x">inline</a></p><a href="nav">nav</a></body></html>"#,
        );
        convert_tags(&mut doc, &opts);
        let body = doc.body().unwrap();
        let p = doc.query_selector(body, "p").unwrap();
        assert!(
            doc.query_selector(p, "a").is_some(),
            "<a> inside <p> should be preserved"
        );
        let text = doc.text_content(body);
        assert!(text.contains("inline"));
        assert!(text.contains("nav"));
        let direct_links = doc.get_elements_by_tag_name(body, "a");
        assert_eq!(direct_links.len(), 1);
    }

    // Comment removal leaves element content untouched.
    #[test]
    fn test_remove_html_comment_nodes() {
        let mut doc = parse(r#"<html><body><!-- comment --><p>text</p></body></html>"#);
        remove_html_comment_nodes(&mut doc);
        assert!(doc.query_selector(doc.root(), "p").is_some());
    }

    // with_backup=false never rolls back, even when everything is removed.
    #[test]
    fn test_prune_no_backup_removes_all_matching_even_if_drastic() {
        use crate::selector::discard::OVERALL_DISCARDED_CONTENT;
        let doc =
            parse(r#"<html><body><div class="footer">all the text lives here</div></body></html>"#);
        let result = prune_unwanted_nodes(&doc, OVERALL_DISCARDED_CONTENT, false);
        assert!(result.query_selector(result.root(), "div").is_none());
    }

    // Small removals stay below the rollback threshold: no restore.
    #[test]
    fn test_prune_backup_not_restored_when_removal_is_small() {
        use crate::selector::discard::OVERALL_DISCARDED_CONTENT;
        let doc = parse(
            r#"<html><body>
            <p>This is a long article body with plenty of text content here.</p>
            <p>Another paragraph with substantial content to ensure we stay well above the threshold.</p>
            <div class="footer">footer</div>
            </body></html>"#,
        );
        let result = prune_unwanted_nodes(&doc, OVERALL_DISCARDED_CONTENT, true);
        assert!(
            result.query_selector(result.root(), "div").is_none(),
            "footer must be pruned"
        );
        assert!(
            result.query_selector(result.root(), "p").is_some(),
            "paragraphs must survive"
        );
    }

    // A rollback must yield the original structure, not the pruned one.
    #[test]
    fn test_prune_backup_restored_preserves_original_structure() {
        use crate::selector::discard::OVERALL_DISCARDED_CONTENT;
        let doc = parse(
            r#"<html><body>
            <div class="nav">nav text one two three four five six seven eight nine ten eleven</div>
            </body></html>"#,
        );
        let result = prune_unwanted_nodes(&doc, OVERALL_DISCARDED_CONTENT, true);
        assert!(
            result.query_selector(result.root(), "div").is_some(),
            "document must be restored when too much text would be lost"
        );
    }

    // The backup must be taken before any node is removed.
    #[test]
    fn test_prune_backup_captures_pre_modification_state() {
        use crate::selector::discard::OVERALL_DISCARDED_CONTENT;
        let doc = parse(
            r#"<html><body>
            <div class="nav">nav-only-content one two three four five six seven</div>
            <div class="footer">footer-only-content</div>
            </body></html>"#,
        );
        let result = prune_unwanted_nodes(&doc, OVERALL_DISCARDED_CONTENT, true);
        let body = result.body().unwrap();
        let divs = result.get_elements_by_tag_name(body, "div");
        assert_eq!(
            divs.len(),
            2,
            "backup must contain both divs from original document"
        );
    }

    // Smoke test: empty documents must not panic in either mode.
    #[test]
    fn test_prune_unwanted_nodes_empty_document() {
        use crate::selector::discard::OVERALL_DISCARDED_CONTENT;
        let doc_no_backup = parse("<html><body></body></html>");
        let _ = prune_unwanted_nodes(&doc_no_backup, OVERALL_DISCARDED_CONTENT, false);
        let doc_with_backup = parse("<html><body></body></html>");
        let _ = prune_unwanted_nodes(&doc_with_backup, OVERALL_DISCARDED_CONTENT, true);
    }
}