use ego_tree::NodeId;
use scraper::{Html, Node};
use crate::dom;
use crate::patterns;
/// Strips a date/byline-style metadata element that follows the first `h1`.
///
/// The first `h1` match found by `dom::select_within` under `main_content` is
/// used as the anchor. If `find_metadata_sibling` picks one of the element
/// siblings after that heading, the chosen node is passed to
/// `dom::remove_node`. When there is no `h1`, or no sibling qualifies, the
/// document is left untouched.
pub fn remove_metadata_block(html: &mut Html, main_content: NodeId) {
    let first_heading = dom::select_within(html, main_content, "h1")
        .first()
        .copied();

    // Resolve the node to drop before mutating the document.
    let metadata = first_heading.and_then(|heading| find_metadata_sibling(html, heading));

    if let Some(node) = metadata {
        dom::remove_node(html, node);
    }
}
/// Looks for a metadata-like element among the siblings following `h1_id`.
///
/// At most the first three element siblings after the heading are inspected,
/// in document order. A sibling qualifies when its trimmed text is non-empty,
/// is shorter than 300 bytes (`str::len`), and `has_date` accepts it. Returns
/// the id of the first qualifying sibling, or `None` if none qualifies.
fn find_metadata_sibling(html: &Html, h1_id: NodeId) -> Option<NodeId> {
    // How many element siblings after the heading are considered.
    const MAX_CANDIDATES: usize = 3;
    // Exclusive upper bound on the trimmed text length, measured in bytes.
    const MAX_TEXT_BYTES: usize = 300;

    std::iter::successors(next_element_sibling(html, h1_id), |&current| {
        next_element_sibling(html, current)
    })
    .take(MAX_CANDIDATES)
    .find(|&candidate| {
        let text = dom::text_content(html, candidate);
        let text = text.trim();
        !text.is_empty() && text.len() < MAX_TEXT_BYTES && has_date(html, candidate, text)
    })
}
/// Reports whether a node reads as date metadata.
///
/// `text` (the caller-supplied text of the node) is tested first with
/// `patterns::is_date_metadata_block`. If that fails, the trimmed text of
/// every `p` match and then every `time` match that `dom::select_within`
/// finds under `node_id` is tested the same way; the first hit wins.
fn has_date(html: &Html, node_id: NodeId, text: &str) -> bool {
    patterns::is_date_metadata_block(text)
        || ["p", "time"].iter().any(|sel| {
            dom::select_within(html, node_id, sel)
                .iter()
                .any(|&cid| patterns::is_date_metadata_block(dom::text_content(html, cid).trim()))
        })
}
/// Returns the id of the nearest following sibling that is an element node.
///
/// Non-element siblings (anything that is not `Node::Element`) are skipped.
/// Yields `None` when `node_id` is not present in the tree or when no element
/// sibling follows it.
fn next_element_sibling(html: &Html, node_id: NodeId) -> Option<NodeId> {
    html.tree
        .get(node_id)?
        .next_siblings()
        .find(|candidate| matches!(candidate.value(), Node::Element(_)))
        .map(|element| element.id())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `doc` and returns the document together with the id of the
    /// first `body` match reported by `dom::select_ids`.
    fn parse_with_body(doc: &str) -> (Html, NodeId) {
        let html = Html::parse_document(doc);
        let body = dom::select_ids(&html, "body")[0];
        (html, body)
    }

    /// A short byline/date `div` directly after the heading is removed while
    /// the article paragraph is kept.
    #[test]
    fn removes_date_block_after_h1() {
        let doc = r#"<html><body>
<h1>Article Title</h1>
<div class="meta">By Author ยท March 24, 2026</div>
<p>Article content here</p>
</body></html>"#;
        let (mut html, body) = parse_with_body(doc);

        remove_metadata_block(&mut html, body);

        let rendered = dom::inner_html(&html, body);
        assert!(!rendered.contains("March 24, 2026"));
        assert!(rendered.contains("Article content here"));
    }

    /// A date nested inside a child `time` element still marks the wrapper
    /// for removal.
    #[test]
    fn removes_date_in_child_time_element() {
        let doc = r#"<html><body>
<h1>Title</h1>
<div><time datetime="2026-01-15">January 15, 2026</time></div>
<p>Content</p>
</body></html>"#;
        let (mut html, body) = parse_with_body(doc);

        remove_metadata_block(&mut html, body);

        let rendered = dom::inner_html(&html, body);
        assert!(!rendered.contains("January 15, 2026"));
        assert!(rendered.contains("Content"));
    }

    /// Ordinary paragraphs without a date are left in place.
    #[test]
    fn preserves_content_after_h1_without_date() {
        let doc = r"<html><body>
<h1>Title</h1>
<p>Introduction paragraph</p>
<p>More content</p>
</body></html>";
        let (mut html, body) = parse_with_body(doc);

        remove_metadata_block(&mut html, body);

        let rendered = dom::inner_html(&html, body);
        assert!(rendered.contains("Introduction paragraph"));
    }

    /// An element whose text exceeds the length limit is kept even though it
    /// contains a date.
    #[test]
    fn skips_long_elements() {
        let long_text = "a".repeat(400);
        let doc = format!(
            r"<html><body>
<h1>Title</h1>
<div>{long_text} January 1, 2026</div>
<p>Content</p>
</body></html>"
        );
        let (mut html, body) = parse_with_body(&doc);

        remove_metadata_block(&mut html, body);

        let rendered = dom::inner_html(&html, body);
        assert!(rendered.contains("January 1, 2026"));
    }

    /// A document without any `h1` passes through unchanged.
    #[test]
    fn no_h1_is_safe() {
        let doc = "<html><body><p>No heading here</p></body></html>";
        let (mut html, body) = parse_with_body(doc);

        remove_metadata_block(&mut html, body);

        let rendered = dom::inner_html(&html, body);
        assert!(rendered.contains("No heading here"));
    }
}