use crate::document::{Document, Node, SourceType};
use anyhow::Result;
use scraper::{Html, Node as HtmlNode, Selector};
use std::path::Path;
/// Parser for HTML files (`html` / `htm` extensions).
///
/// Stateless unit type; its `Parser` implementation turns an HTML document
/// into a `Document` whose structure is derived from the `<h1>`–`<h6>`
/// headings found in the markup.
pub struct HtmlParser;
impl super::Parser for HtmlParser {
    /// File extensions (without the leading dot) handled by this parser.
    fn extensions(&self) -> &[&str] {
        &["html", "htm"]
    }

    /// Source type recorded on every document produced by this parser.
    fn source_type(&self) -> SourceType {
        SourceType::Html
    }

    /// Parses `content` as an HTML document and builds a heading outline.
    ///
    /// * `path` supplies the document id (the full path, lossily converted)
    ///   and the file name passed to `Document::new`.
    /// * The text of the first `<title>` element, if any, becomes
    ///   `doc_description`.
    /// * Each `<h1>`–`<h6>` element starts a new node whose title is the
    ///   heading's text; a heading of a deeper level nests under the closest
    ///   preceding heading of a shallower level.
    /// * Text outside headings is appended to the most recent heading's
    ///   node, or to an untitled root node when no heading precedes it.
    /// * Text inside `<script>`, `<style>` and `<title>` elements is not
    ///   treated as section content.
    ///
    /// Blank input yields a document with an empty structure. The visible
    /// code has no failing path, so this currently always returns `Ok`.
    fn parse(&self, path: &Path, content: &str) -> Result<Document> {
        let file_name = path
            .file_name()
            .map(|n| n.to_string_lossy().to_string())
            .unwrap_or_default();
        let doc_id = path.to_string_lossy().to_string();
        let mut doc = Document::new(&doc_id, &file_name, SourceType::Html);

        if content.trim().is_empty() {
            doc.assign_node_ids();
            return Ok(doc);
        }

        let html = Html::parse_document(content);

        if let Ok(sel) = Selector::parse("title") {
            if let Some(title_el) = html.select(&sel).next() {
                let title_text: String = title_el.text().collect::<Vec<_>>().join(" ");
                doc.doc_description = title_text.trim().to_string();
            }
        }

        // `stack` holds the chain of currently open headings (level, node),
        // outermost first; `roots` receives finished top-level nodes.
        let mut stack: Vec<(u8, Node)> = Vec::new();
        let mut roots: Vec<Node> = Vec::new();
        let mut current_text = String::new();

        // Walk the tree in document order. `Tree::nodes()` iterates in
        // insertion order instead, which can differ from document order when
        // the HTML parser relocates nodes while recovering from malformed
        // markup, and may also yield nodes detached from the tree.
        for node_ref in html.tree.root().descendants() {
            match node_ref.value() {
                HtmlNode::Element(el) => {
                    if let Some(level) = heading_level(el.name()) {
                        // Text gathered so far belongs to the previous section.
                        flush_text(&mut stack, &mut roots, &mut current_text);
                        // Close every open heading at the same or deeper level.
                        collapse_stack(&mut stack, &mut roots, level);
                        let mut title = String::new();
                        for child in node_ref.children() {
                            collect_all_text(child, &mut title);
                        }
                        stack.push((level, Node::new("", title.trim())));
                    }
                }
                HtmlNode::Text(text) => {
                    // Script/style bodies are not prose, and the <title> text
                    // is already stored in `doc_description`.
                    let in_non_content = node_ref.ancestors().any(|ancestor| {
                        match ancestor.value() {
                            HtmlNode::Element(el) => {
                                matches!(el.name(), "script" | "style" | "title")
                            }
                            _ => false,
                        }
                    });
                    // Heading text was already captured as the node title.
                    if in_non_content || is_inside_heading(&html, node_ref.id()) {
                        continue;
                    }
                    let t = text.trim();
                    if !t.is_empty() {
                        if !current_text.is_empty() {
                            current_text.push(' ');
                        }
                        current_text.push_str(t);
                    }
                }
                _ => {}
            }
        }

        flush_text(&mut stack, &mut roots, &mut current_text);
        // Level 0 is below every heading level, so this closes all open nodes.
        collapse_stack(&mut stack, &mut roots, 0);

        // Nothing was collected: fall back to a single node named after the
        // file that carries whatever text the document contains.
        if roots.is_empty() {
            let body_text = extract_body_text(&html);
            if !body_text.is_empty() {
                let mut node = Node::new("", &file_name);
                node.text = body_text;
                roots.push(node);
            }
        }

        doc.structure = roots;
        doc.assign_node_ids();
        Ok(doc)
    }
}
/// Maps an HTML heading tag name to its level.
///
/// Returns `Some(1)` through `Some(6)` for exactly `"h1"` through `"h6"`
/// (lowercase only) and `None` for every other string.
fn heading_level(tag: &str) -> Option<u8> {
    match tag.as_bytes() {
        // Exactly two bytes: a lowercase 'h' followed by an ASCII digit 1-6.
        [b'h', digit @ b'1'..=b'6'] => Some(digit - b'0'),
        _ => None,
    }
}
fn is_inside_heading(html: &Html, node_id: ego_tree::NodeId) -> bool {
let mut current = Some(node_id);
while let Some(id) = current {
if let Some(node) = html.tree.get(id) {
if let HtmlNode::Element(el) = node.value() {
if heading_level(el.name()).is_some() {
return true;
}
}
current = node.parent().map(|p| p.id());
} else {
break;
}
}
false
}
/// Appends the contents of every text node in the subtree rooted at
/// `node_ref` (the node itself included) to `out`, in document order.
///
/// Text is appended verbatim: no trimming is applied and no separators are
/// inserted between adjacent text nodes.
fn collect_all_text(node_ref: ego_tree::NodeRef<'_, scraper::Node>, out: &mut String) {
    // `descendants()` performs an iterative pre-order walk starting at
    // `node_ref`, so arbitrarily deep markup cannot exhaust the call stack
    // the way a recursive walk over children could.
    for node in node_ref.descendants() {
        if let HtmlNode::Text(t) = node.value() {
            out.push_str(t);
        }
    }
}
/// Collects the document's text as a single space-separated string.
///
/// Text is taken from the first `<body>` element. If that yields nothing
/// (no body, or a body without non-blank text), every text node reachable
/// from the document root is used instead. Each piece is trimmed and blank
/// pieces are dropped; the result is empty when no text is found.
fn extract_body_text(html: &Html) -> String {
    let mut parts: Vec<String> = Vec::new();

    if let Ok(sel) = Selector::parse("body") {
        if let Some(body) = html.select(&sel).next() {
            parts.extend(
                body.text()
                    .map(str::trim)
                    .filter(|piece| !piece.is_empty())
                    .map(String::from),
            );
        }
    }

    if parts.is_empty() {
        // Traverse from the root so pieces come out in document order;
        // `Tree::nodes()` yields insertion order and may include nodes that
        // are no longer attached to the tree.
        for node in html.tree.root().descendants() {
            if let HtmlNode::Text(t) = node.value() {
                let t = t.trim();
                if !t.is_empty() {
                    parts.push(t.to_string());
                }
            }
        }
    }

    parts.join(" ")
}
/// Closes every open heading whose level is `target_level` or deeper.
///
/// Entries are popped from the top of `stack` while their level is greater
/// than or equal to `target_level`. Each popped node is attached as a child
/// of the entry that is then on top of the stack, or pushed onto `roots`
/// when the stack has become empty.
fn collapse_stack(stack: &mut Vec<(u8, Node)>, roots: &mut Vec<Node>, target_level: u8) {
    while stack
        .last()
        .map_or(false, |(level, _)| *level >= target_level)
    {
        if let Some((_, finished)) = stack.pop() {
            match stack.last_mut() {
                Some((_, parent)) => parent.children.push(finished),
                None => roots.push(finished),
            }
        }
    }
}
/// Moves the buffered `text` into the outline and empties the buffer.
///
/// The trimmed buffer is appended to the node on top of `stack`, separated
/// from any text that node already has by a blank line (`"\n\n"`). When the
/// stack is empty, a new node created with empty strings is pushed onto
/// `roots` to hold the text. A blank buffer adds nothing; in every case
/// `text` is cleared before returning.
fn flush_text(stack: &mut Vec<(u8, Node)>, roots: &mut Vec<Node>, text: &mut String) {
    let body = text.trim();
    if !body.is_empty() {
        match stack.last_mut() {
            Some((_, owner)) => {
                if !owner.text.is_empty() {
                    owner.text.push_str("\n\n");
                }
                owner.text.push_str(body);
            }
            None => {
                let mut orphan = Node::new("", "");
                orphan.text = body.to_string();
                roots.push(orphan);
            }
        }
    }
    text.clear();
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::Parser;

    /// Runs `HtmlParser` on `content` under the fixed path `test.html`,
    /// panicking if parsing reports an error.
    fn parse(content: &str) -> Document {
        HtmlParser
            .parse(Path::new("test.html"), content)
            .expect("parse failed")
    }

    /// Empty input produces a document without any structure.
    #[test]
    fn test_empty() {
        let parsed = parse("");
        assert!(parsed.structure.is_empty());
    }

    /// One heading followed by a paragraph gives one titled node with text.
    #[test]
    fn test_single_heading() {
        let parsed = parse("<h1>Hello</h1><p>World</p>");
        assert_eq!(parsed.structure.len(), 1);
        let only = &parsed.structure[0];
        assert_eq!(only.title, "Hello");
        assert!(only.text.contains("World"));
    }

    /// Deeper headings nest under the closest shallower heading.
    #[test]
    fn test_nested_headings() {
        let markup = r#"
<h1>Top</h1>
<p>Intro</p>
<h2>Section A</h2>
<p>Text A</p>
<h3>Sub A1</h3>
<p>Deep</p>
<h2>Section B</h2>
<p>Text B</p>
"#;
        let parsed = parse(markup);
        assert_eq!(parsed.structure.len(), 1);

        let top = &parsed.structure[0];
        assert_eq!(top.title, "Top");
        assert_eq!(top.children.len(), 2);

        let (section_a, section_b) = (&top.children[0], &top.children[1]);
        assert_eq!(section_a.title, "Section A");
        assert_eq!(section_b.title, "Section B");
        assert_eq!(section_a.children.len(), 1);
        assert_eq!(section_a.children[0].title, "Sub A1");
    }

    /// Markup without headings still yields a single node carrying the text.
    #[test]
    fn test_no_headings() {
        let parsed = parse("<p>Just text</p><p>More text</p>");
        assert_eq!(parsed.structure.len(), 1);
        assert!(parsed.structure[0].text.contains("Just text"));
    }

    /// Node ids are assigned sequentially, parent before children.
    #[test]
    fn test_node_ids_assigned() {
        let parsed = parse("<h1>A</h1><h2>B</h2><h2>C</h2>");
        let top = &parsed.structure[0];
        assert_eq!(top.node_id, "0");
        assert_eq!(top.children[0].node_id, "1");
        assert_eq!(top.children[1].node_id, "2");
    }

    /// The document records the HTML source type and the path as its id.
    #[test]
    fn test_source_type() {
        let parsed = parse("<h1>Hi</h1>");
        assert_eq!(parsed.source_type, SourceType::Html);
        assert_eq!(parsed.doc_id, "test.html");
    }

    /// A complete page exposes its `<title>` as the document description.
    #[test]
    fn test_full_html_document() {
        let markup = r#"<!DOCTYPE html>
<html>
<head><title>Test Page</title></head>
<body>
<h1>Main Title</h1>
<p>Content here.</p>
</body>
</html>"#;
        let parsed = parse(markup);
        assert_eq!(parsed.doc_description, "Test Page");
        assert!(!parsed.structure.is_empty());
    }

    /// Sibling top-level headings become separate root nodes, in order.
    #[test]
    fn test_multiple_h1() {
        let parsed = parse("<h1>First</h1><p>A</p><h1>Second</h1><p>B</p>");
        assert_eq!(parsed.structure.len(), 2);
        assert_eq!(parsed.structure[0].title, "First");
        assert_eq!(parsed.structure[1].title, "Second");
    }

    /// Inline markup inside a heading contributes its text to the title.
    #[test]
    fn test_heading_with_nested_tags() {
        let parsed = parse("<h1>Hello <strong>World</strong></h1><p>Body</p>");
        assert_eq!(parsed.structure[0].title, "Hello World");
    }

    /// `heading_level` recognises heading tags and rejects other tags.
    #[test]
    fn test_heading_level_fn() {
        assert_eq!(heading_level("h1"), Some(1));
        assert_eq!(heading_level("h6"), Some(6));
        assert_eq!(heading_level("p"), None);
        assert_eq!(heading_level("div"), None);
    }
}