use crate::error::{HtmlError, Result};
use scraper::{ElementRef, Html, Selector};
use ucm_core::{Block, BlockId, Content, Document, MediaSource};
/// Strategy used to pick the level recorded in a heading block's role.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum HeadingStrategy {
/// Keep the level of the source tag (`h1`..`h6`). This is the default variant.
#[default]
AsIs,
/// Give every heading the same level; the value is clamped to `1..=6` when the role is built.
Flatten(usize),
/// Currently treated exactly like [`HeadingStrategy::AsIs`] by the parser in this file.
InferFromNesting,
}
/// Options controlling how [`HtmlParser`] turns HTML into document blocks.
#[derive(Debug, Clone)]
pub struct HtmlParserConfig {
/// When `false`, extracted text is trimmed / has its whitespace runs collapsed;
/// when `true`, text is kept as it appears in the source.
pub preserve_whitespace: bool,
/// When `false`, `<img>` elements produce no block.
pub extract_images: bool,
/// When `true`, anchors with a non-empty `href` are rendered as `[text](href)`;
/// otherwise only the anchor text is kept.
pub extract_links: bool,
/// How heading levels are mapped onto heading roles.
pub heading_strategy: HeadingStrategy,
/// Maximum nesting depth; elements nested deeper than this are not processed.
pub max_depth: usize,
/// Block budget: child processing fails with a resource-limit error once the
/// document's block count is greater than this value.
pub max_blocks: usize,
/// Minimum length, measured with `str::len` (bytes), for text to become a block.
pub min_text_length: usize,
}
impl Default for HtmlParserConfig {
fn default() -> Self {
Self {
preserve_whitespace: false,
extract_images: true,
extract_links: true,
heading_strategy: HeadingStrategy::AsIs,
max_depth: 50,
max_blocks: 10000,
min_text_length: 1,
}
}
}
/// HTML parser that converts markup into a `Document` of blocks.
pub struct HtmlParser {
// Settings consulted by every processing step.
config: HtmlParserConfig,
}
impl HtmlParser {
/// Creates a parser that uses [`HtmlParserConfig::default`].
pub fn new() -> Self {
    let config = HtmlParserConfig::default();
    Self { config }
}
pub fn with_config(config: HtmlParserConfig) -> Self {
Self { config }
}
/// Parses `html` and returns a new document whose blocks hang off its root.
///
/// The children of the first `body` element are converted. If no `body`
/// element is found, the first child of the root element is used instead,
/// provided it is an element.
///
/// # Errors
///
/// Propagates any error reported while converting child nodes (resource
/// limits, failures to add a block).
pub fn parse(&self, html: &str) -> Result<Document> {
    let mut doc = Document::create();
    let root = doc.root;
    let fragment = Html::parse_document(html);
    let body_selector = Selector::parse("body").unwrap();

    match fragment.select(&body_selector).next() {
        Some(body_element) => {
            self.process_children(&mut doc, &root, body_element, 0)?;
        }
        None => {
            let fallback = fragment
                .root_element()
                .first_child()
                .and_then(ElementRef::wrap);
            if let Some(element) = fallback {
                self.process_children(&mut doc, &root, element, 0)?;
            }
        }
    }

    Ok(doc)
}
/// Converts the direct child nodes of `element` into blocks.
///
/// Heading elements open a new section: a heading is attached to the nearest
/// preceding heading of a strictly lower level (or to `parent_id` when there
/// is none), and every following non-heading child is attached to the most
/// recently created heading. Text nodes become `"text"` blocks when they meet
/// `min_text_length`; other elements are handed to `process_element` with
/// `depth + 1`.
///
/// A heading that produces no block (for example because it has no text)
/// leaves the section structure untouched.
///
/// # Errors
///
/// Returns `HtmlError::ResourceLimit` when `depth` is greater than
/// `max_depth` or the document's block count is greater than `max_blocks`,
/// and propagates errors from nested processing and from adding blocks.
fn process_children(
    &self,
    doc: &mut Document,
    parent_id: &BlockId,
    element: ElementRef,
    depth: usize,
) -> Result<()> {
    if depth > self.config.max_depth {
        return Err(HtmlError::ResourceLimit(format!(
            "Maximum nesting depth {} exceeded",
            self.config.max_depth
        )));
    }
    if doc.block_count() > self.config.max_blocks {
        return Err(HtmlError::ResourceLimit(format!(
            "Maximum block count {} exceeded",
            self.config.max_blocks
        )));
    }

    // Parent for non-heading content: the latest heading block, or `parent_id`.
    let mut current_heading_parent = *parent_id;
    // Open headings as (level, block id). Levels strictly increase from bottom
    // to top; the level-0 entry stands for `parent_id` and is never removed.
    let mut heading_stack: Vec<(usize, BlockId)> = vec![(0, *parent_id)];

    for child in element.children() {
        if let Some(child_element) = ElementRef::wrap(child) {
            let tag_name = child_element.value().name();
            if let Some(level) = self.parse_heading_level(tag_name) {
                // Number of stack entries that stay open: everything up to and
                // including the nearest heading with a lower level.
                let keep = heading_stack
                    .iter()
                    .rposition(|(open_level, _)| *open_level < level)
                    .map_or(1, |idx| idx + 1);
                let heading_parent = heading_stack
                    .get(keep - 1)
                    .map(|(_, id)| *id)
                    .unwrap_or(*parent_id);
                let heading_id =
                    self.process_heading(doc, &heading_parent, child_element, level)?;
                // Only a heading that actually became a block may close
                // sections; otherwise it would silently re-parent what follows.
                if let Some(id) = heading_id {
                    heading_stack.truncate(keep);
                    heading_stack.push((level, id));
                    current_heading_parent = id;
                }
            } else {
                self.process_element(doc, &current_heading_parent, child_element, depth + 1)?;
            }
        } else if let Some(text_node) = child.value().as_text() {
            let text = if self.config.preserve_whitespace {
                text_node.to_string()
            } else {
                text_node.trim().to_string()
            };
            if text.len() >= self.config.min_text_length {
                let block = Block::new(Content::text(&text), Some("text"));
                doc.add_block(block, &current_heading_parent)?;
            }
        }
    }
    Ok(())
}
/// Converts a single element into a block, dispatching on its tag name.
///
/// Returns the id of the block that was created, or `None` when the element
/// produced no block of its own (skipped tags, containers whose children were
/// processed in place, empty content, or `depth` beyond `max_depth`).
///
/// # Errors
///
/// Propagates errors from the tag-specific handlers and from adding blocks.
fn process_element(
    &self,
    doc: &mut Document,
    parent_id: &BlockId,
    element: ElementRef,
    depth: usize,
) -> Result<Option<BlockId>> {
    // Elements nested too deeply are dropped without raising an error.
    if depth > self.config.max_depth {
        return Ok(None);
    }

    let tag = element.value().name();

    // Tags that never contribute content.
    if matches!(
        tag,
        "script" | "style" | "meta" | "link" | "head" | "noscript" | "br" | "hr"
    ) {
        return Ok(None);
    }

    if let Some(level) = self.parse_heading_level(tag) {
        return self.process_heading(doc, parent_id, element, level);
    }

    match tag {
        "p" => self.process_paragraph(doc, parent_id, element),
        "ul" | "ol" => self.process_list(doc, parent_id, element),
        "pre" => self.process_code_block(doc, parent_id, element),
        "blockquote" => self.process_blockquote(doc, parent_id, element),
        "img" => self.process_image(doc, parent_id, element),
        "a" => self.process_link(doc, parent_id, element),
        "table" => self.process_table(doc, parent_id, element),
        // A `code` element reached here is emitted as a backtick-wrapped text block.
        "code" => {
            let raw: String = element.text().collect();
            if raw.trim().is_empty() {
                return Ok(None);
            }
            let wrapped = format!("`{}`", raw);
            let block = Block::new(Content::text(&wrapped), Some("code"));
            Ok(Some(doc.add_block(block, parent_id)?))
        }
        // Structural containers: their children are attached to `parent_id`.
        "div" | "section" | "article" | "main" | "aside" | "nav" | "header" | "footer"
        | "span" | "figure" | "figcaption" => {
            self.process_children(doc, parent_id, element, depth)?;
            Ok(None)
        }
        // Unknown tags: emit their text if there is enough of it, otherwise
        // look at their children.
        _ => {
            let text = self.extract_text_content(element);
            if text.is_empty() || text.len() < self.config.min_text_length {
                self.process_children(doc, parent_id, element, depth)?;
                Ok(None)
            } else {
                let block = Block::new(Content::text(&text), Some("text"));
                Ok(Some(doc.add_block(block, parent_id)?))
            }
        }
    }
}
/// Adds a heading block for `element` under `parent_id`.
///
/// The block's role is `heading<N>`, where `N` is `level` (or the target of
/// `HeadingStrategy::Flatten`) clamped to `1..=6`. Returns `None` without
/// adding anything when the heading has no text.
///
/// # Errors
///
/// Propagates the error raised when the block cannot be added.
fn process_heading(
    &self,
    doc: &mut Document,
    parent_id: &BlockId,
    element: ElementRef,
    level: usize,
) -> Result<Option<BlockId>> {
    let text = self.extract_text_content(element);
    if text.is_empty() {
        return Ok(None);
    }

    let effective_level = match self.config.heading_strategy {
        HeadingStrategy::Flatten(target) => target,
        // Both remaining strategies keep the level taken from the tag.
        HeadingStrategy::AsIs | HeadingStrategy::InferFromNesting => level,
    };
    let role = format!("heading{}", effective_level.clamp(1, 6));

    let heading = Block::new(Content::text(&text), Some(&role));
    let heading_id = doc.add_block(heading, parent_id)?;
    Ok(Some(heading_id))
}
/// Adds a `"paragraph"` block holding the formatted text of `element`.
///
/// Returns `None` when the text is empty or shorter than `min_text_length`.
///
/// # Errors
///
/// Propagates the error raised when the block cannot be added.
fn process_paragraph(
    &self,
    doc: &mut Document,
    parent_id: &BlockId,
    element: ElementRef,
) -> Result<Option<BlockId>> {
    let text = self.extract_formatted_text(element);
    // Empty text is rejected even when `min_text_length` is zero.
    let long_enough = text.len() >= self.config.min_text_length;
    if !text.is_empty() && long_enough {
        let paragraph = Block::new(Content::text(&text), Some("paragraph"));
        Ok(Some(doc.add_block(paragraph, parent_id)?))
    } else {
        Ok(None)
    }
}
/// Adds a single `"list"` block whose text is the non-empty `li` items of
/// `element`, one per line.
///
/// Returns `None` when no item has any text.
///
/// NOTE(review): the `li` selector is run against the whole subtree, so items
/// of nested lists presumably show up both inside their parent item's text and
/// as separate lines — confirm whether that is intended.
///
/// # Errors
///
/// Propagates the error raised when the block cannot be added.
fn process_list(
    &self,
    doc: &mut Document,
    parent_id: &BlockId,
    element: ElementRef,
) -> Result<Option<BlockId>> {
    let li_selector = Selector::parse("li").unwrap();

    let mut items: Vec<String> = Vec::new();
    for li in element.select(&li_selector) {
        let rendered = self.extract_formatted_text(li);
        if !rendered.is_empty() {
            items.push(rendered);
        }
    }
    if items.is_empty() {
        return Ok(None);
    }

    let joined = items.join("\n");
    let list_block = Block::new(Content::text(&joined), Some("list"));
    Ok(Some(doc.add_block(list_block, parent_id)?))
}
/// Adds a `"code"` block for a `<pre>` element.
///
/// The text comes from the first nested `code` element, or from `element`
/// itself when there is none. The language is read from a `language-*` or
/// `lang-*` class token on that code element, then on `element`, and defaults
/// to `"text"`. Returns `None` when the code is blank.
///
/// # Errors
///
/// Propagates the error raised when the block cannot be added.
fn process_code_block(
    &self,
    doc: &mut Document,
    parent_id: &BlockId,
    element: ElementRef,
) -> Result<Option<BlockId>> {
    /// Returns the first non-empty language named by a `language-*` /
    /// `lang-*` token in the node's `class` attribute.
    fn declared_language<'a>(node: ElementRef<'a>) -> Option<&'a str> {
        node.value()
            .attr("class")?
            .split_whitespace()
            .find_map(|token| {
                // `strip_prefix` removes the prefix exactly once, so a name
                // that itself starts with `lang-` is not stripped again.
                token
                    .strip_prefix("language-")
                    .or_else(|| token.strip_prefix("lang-"))
                    .filter(|name| !name.is_empty())
            })
    }

    let code_selector = Selector::parse("code").unwrap();
    let code_element = element.select(&code_selector).next().unwrap_or(element);
    let code_text = code_element.text().collect::<String>();
    if code_text.trim().is_empty() {
        return Ok(None);
    }

    // Prefer the class on the code element; fall back to the enclosing
    // element so `<pre class="language-x"><code>` is recognised too.
    let language = declared_language(code_element)
        .or_else(|| declared_language(element))
        .unwrap_or("text");

    let block = Block::new(Content::code(language, &code_text), Some("code"));
    Ok(Some(doc.add_block(block, parent_id)?))
}
/// Adds a `"quote"` block holding the formatted text of `element`.
///
/// Returns `None` when the quote has no text.
///
/// # Errors
///
/// Propagates the error raised when the block cannot be added.
fn process_blockquote(
    &self,
    doc: &mut Document,
    parent_id: &BlockId,
    element: ElementRef,
) -> Result<Option<BlockId>> {
    let quoted = self.extract_formatted_text(element);
    if quoted.is_empty() {
        Ok(None)
    } else {
        let quote_block = Block::new(Content::text(&quoted), Some("quote"));
        Ok(Some(doc.add_block(quote_block, parent_id)?))
    }
}
/// Adds an `"image"` media block for an `<img>` element.
///
/// Returns `None` when image extraction is disabled or the `src` attribute is
/// missing or empty. A `data:` URI flagged `;base64` is stored as
/// `MediaSource::Base64` carrying everything after the first comma; any other
/// `src` (including `data:` URIs that are not base64-encoded) is stored
/// unchanged as `MediaSource::Url`. The `alt` attribute (empty when absent)
/// is attached as alternative text.
///
/// # Errors
///
/// Propagates the error raised when the block cannot be added.
fn process_image(
    &self,
    doc: &mut Document,
    parent_id: &BlockId,
    element: ElementRef,
) -> Result<Option<BlockId>> {
    if !self.config.extract_images {
        return Ok(None);
    }
    let src = element.value().attr("src").unwrap_or("");
    let alt = element.value().attr("alt").unwrap_or("");
    if src.is_empty() {
        return Ok(None);
    }

    // data:[<mediatype>][;base64],<data> — only the part before the first
    // comma is the header; the payload itself may legally contain commas.
    let base64_payload = src
        .strip_prefix("data:")
        .and_then(|rest| rest.split_once(','))
        .filter(|(header, _)| {
            header
                .rsplit_once(';')
                .map_or(false, |(_, flag)| flag.trim().eq_ignore_ascii_case("base64"))
        })
        .map(|(_, payload)| payload);

    let media_source = match base64_payload {
        Some(payload) => MediaSource::Base64(payload.to_string()),
        None => MediaSource::Url(src.to_string()),
    };

    let media = ucm_core::Media::image(media_source).with_alt(alt);
    let block = Block::new(Content::Media(media), Some("image"));
    Ok(Some(doc.add_block(block, parent_id)?))
}
/// Adds a block for an `<a>` element.
///
/// With link extraction enabled and a non-empty `href`, the block is a
/// `"link"` containing `[text](href)`; otherwise it is a plain `"text"` block
/// with the anchor text. Returns `None` when the anchor has no text.
///
/// # Errors
///
/// Propagates the error raised when the block cannot be added.
fn process_link(
    &self,
    doc: &mut Document,
    parent_id: &BlockId,
    element: ElementRef,
) -> Result<Option<BlockId>> {
    let text = self.extract_text_content(element);
    let href = element.value().attr("href").unwrap_or("");
    if text.is_empty() {
        return Ok(None);
    }

    let as_link = self.config.extract_links && !href.is_empty();
    let (body, role) = if as_link {
        (format!("[{}]({})", text, href), "link")
    } else {
        (text, "text")
    };

    let block = Block::new(Content::text(&body), Some(role));
    Ok(Some(doc.add_block(block, parent_id)?))
}
/// Adds a `"table"` block built from the `tr` rows of `element`.
///
/// Each row is the list of texts of its `td` / `th` cells; rows without any
/// cell are dropped. Returns `None` when no row remains.
///
/// # Errors
///
/// Propagates the error raised when the block cannot be added.
fn process_table(
    &self,
    doc: &mut Document,
    parent_id: &BlockId,
    element: ElementRef,
) -> Result<Option<BlockId>> {
    let row_selector = Selector::parse("tr").unwrap();
    let cell_selector = Selector::parse("td, th").unwrap();

    let mut rows: Vec<Vec<String>> = Vec::new();
    for row in element.select(&row_selector) {
        let cells: Vec<String> = row
            .select(&cell_selector)
            .map(|cell| self.extract_text_content(cell))
            .collect();
        if !cells.is_empty() {
            rows.push(cells);
        }
    }

    if rows.is_empty() {
        Ok(None)
    } else {
        let table_block = Block::new(Content::table(rows), Some("table"));
        Ok(Some(doc.add_block(table_block, parent_id)?))
    }
}
/// Maps the tag names `h1`..`h6` to the levels 1..6; any other name yields `None`.
fn parse_heading_level(&self, tag_name: &str) -> Option<usize> {
    match tag_name.as_bytes() {
        // Exactly two bytes: a lowercase `h` followed by a digit from 1 to 6.
        [b'h', digit @ b'1'..=b'6'] => Some(usize::from(*digit - b'0')),
        _ => None,
    }
}
/// Returns the concatenated text of `element` and its descendants.
///
/// Unless `preserve_whitespace` is set, the text is split on whitespace and
/// the pieces are rejoined with single spaces, which also removes leading and
/// trailing whitespace.
fn extract_text_content(&self, element: ElementRef) -> String {
    let raw: String = element.text().collect();
    if self.config.preserve_whitespace {
        return raw;
    }

    let mut collapsed = String::with_capacity(raw.len());
    for word in raw.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
/// Renders the content of `element` as text with light Markdown markup.
///
/// `strong`/`b` become `**text**`, `em`/`i` become `*text*`, `code` becomes
/// `` `text` ``, `br` becomes a newline and, when link extraction is enabled,
/// anchors with a non-empty `href` become `[text](href)`. Every other element
/// contributes its own rendered content. Emphasis and code elements without
/// any text contribute nothing.
///
/// Unless `preserve_whitespace` is set, whitespace runs inside text nodes are
/// collapsed to one space, and whitespace at the edge of a text node is kept
/// as a single separating space so that words do not fuse with neighbouring
/// inline elements. The final result is trimmed.
fn extract_formatted_text(&self, element: ElementRef) -> String {
    let mut result = String::new();
    for child in element.children() {
        if let Some(child_element) = ElementRef::wrap(child) {
            let tag_name = child_element.value().name();
            let child_text = self.extract_formatted_text(child_element);
            match tag_name {
                "strong" | "b" => {
                    // Skip empty emphasis so no bare `****` marker is emitted.
                    if !child_text.is_empty() {
                        result.push_str("**");
                        result.push_str(&child_text);
                        result.push_str("**");
                    }
                }
                "em" | "i" => {
                    if !child_text.is_empty() {
                        result.push('*');
                        result.push_str(&child_text);
                        result.push('*');
                    }
                }
                "code" => {
                    if !child_text.is_empty() {
                        result.push('`');
                        result.push_str(&child_text);
                        result.push('`');
                    }
                }
                "a" if self.config.extract_links => {
                    let href = child_element.value().attr("href").unwrap_or("");
                    if !href.is_empty() {
                        result.push('[');
                        result.push_str(&child_text);
                        result.push_str("](");
                        result.push_str(href);
                        result.push(')');
                    } else {
                        result.push_str(&child_text);
                    }
                }
                "br" => {
                    result.push('\n');
                }
                _ => {
                    result.push_str(&child_text);
                }
            }
        } else if let Some(text_node) = child.value().as_text() {
            if self.config.preserve_whitespace {
                result.push_str(&text_node.to_string());
            } else {
                let leading_gap = text_node.starts_with(char::is_whitespace);
                let trailing_gap = text_node.ends_with(char::is_whitespace);
                let collapsed = text_node.split_whitespace().collect::<Vec<_>>().join(" ");

                // Keep one separator for whitespace that preceded the words,
                // unless the output is empty or already ends in whitespace
                // (for example right after a `br` newline).
                if leading_gap
                    && !result.is_empty()
                    && !result.ends_with(char::is_whitespace)
                {
                    result.push(' ');
                }
                result.push_str(&collapsed);
                // Likewise keep one separator for whitespace after the words;
                // a whitespace-only node was already handled above.
                if trailing_gap && !collapsed.is_empty() {
                    result.push(' ');
                }
            }
        }
    }
    result.trim().to_string()
}
}
impl Default for HtmlParser {
fn default() -> Self {
Self::new()
}
}
// Smoke tests for the HTML parser.
#[cfg(test)]
mod tests {
use super::*;
// Parsing a page with nested headings must attach at least one block to the root.
#[test]
fn test_heading_hierarchy() {
let html = r#"<html><body>
<h1>Main</h1>
<p>Intro</p>
<h2>Sub 1</h2>
<p>Content 1</p>
<h2>Sub 2</h2>
<p>Content 2</p>
</body></html>"#;
let doc = HtmlParser::new().parse(html).unwrap();
let root_children = doc.children(&doc.root);
assert!(!root_children.is_empty());
}
// A `<pre><code class="language-rust">` snippet must yield at least two blocks
// (presumably the root plus the code block — confirm against `Document::create`).
#[test]
fn test_code_language_extraction() {
let html = r#"<pre><code class="language-rust">fn main() {}</code></pre>"#;
let doc = HtmlParser::new().parse(html).unwrap();
assert!(doc.block_count() >= 2);
}
// Nesting deeper than `max_depth` must either parse successfully or fail with
// a resource-limit error; both outcomes are accepted here.
#[test]
fn test_max_depth_limit() {
let config = HtmlParserConfig {
max_depth: 2,
..Default::default()
};
let parser = HtmlParser::with_config(config);
let html = "<div><div><div><div><div><p>Deep</p></div></div></div></div></div>";
let result = parser.parse(html);
assert!(result.is_ok() || matches!(result, Err(HtmlError::ResourceLimit(_))));
}
// With `Flatten(3)`, every block whose role category starts with "heading"
// must have the category "heading3".
#[test]
fn test_heading_strategy_flatten() {
let config = HtmlParserConfig {
heading_strategy: HeadingStrategy::Flatten(3),
..Default::default()
};
let parser = HtmlParser::with_config(config);
let html = "<h1>Title</h1><h2>Subtitle</h2>";
let doc = parser.parse(html).unwrap();
for block in doc.blocks.values() {
if let Some(ref role) = block.metadata.semantic_role {
if role.category.as_str().starts_with("heading") {
assert_eq!(role.category.as_str(), "heading3");
}
}
}
}
}