1use crate::error::{HtmlError, Result};
4use scraper::{ElementRef, Html, Selector};
5use ucm_core::{Block, BlockId, Content, Document, MediaSource};
6
/// Policy for turning the level of an `<h1>`..`<h6>` tag into the level used
/// in the emitted heading role.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum HeadingStrategy {
    /// Use the level of the source tag unchanged. This is the default.
    #[default]
    AsIs,
    /// Give every heading the supplied level, regardless of its source tag.
    Flatten(usize),
    /// Reserved for nesting-based inference; the parser currently resolves it
    /// to the level of the source tag, exactly like [`HeadingStrategy::AsIs`].
    InferFromNesting,
}
18
19#[derive(Debug, Clone)]
21pub struct HtmlParserConfig {
22 pub preserve_whitespace: bool,
24 pub extract_images: bool,
26 pub extract_links: bool,
28 pub heading_strategy: HeadingStrategy,
30 pub max_depth: usize,
32 pub max_blocks: usize,
34 pub min_text_length: usize,
36}
37
38impl Default for HtmlParserConfig {
39 fn default() -> Self {
40 Self {
41 preserve_whitespace: false,
42 extract_images: true,
43 extract_links: true,
44 heading_strategy: HeadingStrategy::AsIs,
45 max_depth: 50,
46 max_blocks: 10000,
47 min_text_length: 1,
48 }
49 }
50}
51
52pub struct HtmlParser {
54 config: HtmlParserConfig,
55}
56
57impl HtmlParser {
58 pub fn new() -> Self {
60 Self {
61 config: HtmlParserConfig::default(),
62 }
63 }
64
65 pub fn with_config(config: HtmlParserConfig) -> Self {
67 Self { config }
68 }
69
70 pub fn parse(&self, html: &str) -> Result<Document> {
72 let mut doc = Document::create();
73 let root = doc.root;
74
75 let fragment = Html::parse_document(html);
77
78 let body_selector = Selector::parse("body").unwrap();
80 let body = fragment.select(&body_selector).next();
81
82 if let Some(body_element) = body {
83 self.process_children(&mut doc, &root, body_element, 0)?;
84 } else {
85 if let Some(root_element) = fragment.root_element().first_child() {
87 if let Some(element) = ElementRef::wrap(root_element) {
88 self.process_children(&mut doc, &root, element, 0)?;
89 }
90 }
91 }
92
93 Ok(doc)
94 }
95
96 fn process_children(
98 &self,
99 doc: &mut Document,
100 parent_id: &BlockId,
101 element: ElementRef,
102 depth: usize,
103 ) -> Result<()> {
104 if depth > self.config.max_depth {
105 return Err(HtmlError::ResourceLimit(format!(
106 "Maximum nesting depth {} exceeded",
107 self.config.max_depth
108 )));
109 }
110
111 if doc.block_count() > self.config.max_blocks {
112 return Err(HtmlError::ResourceLimit(format!(
113 "Maximum block count {} exceeded",
114 self.config.max_blocks
115 )));
116 }
117
118 let mut current_heading_parent = *parent_id;
119 let mut heading_stack: Vec<(usize, BlockId)> = vec![(0, *parent_id)];
120
121 for child in element.children() {
122 if let Some(child_element) = ElementRef::wrap(child) {
123 let tag_name = child_element.value().name();
124
125 if let Some(level) = self.parse_heading_level(tag_name) {
127 while heading_stack.len() > 1
129 && heading_stack
130 .last()
131 .map(|(l, _)| *l >= level)
132 .unwrap_or(false)
133 {
134 heading_stack.pop();
135 }
136
137 let heading_parent = heading_stack
138 .last()
139 .map(|(_, id)| *id)
140 .unwrap_or(*parent_id);
141
142 let heading_id =
143 self.process_heading(doc, &heading_parent, child_element, level)?;
144
145 if let Some(id) = heading_id {
146 heading_stack.push((level, id));
147 current_heading_parent = id;
148 }
149 } else {
150 self.process_element(doc, ¤t_heading_parent, child_element, depth + 1)?;
152 }
153 } else if let Some(text_node) = child.value().as_text() {
154 let text = if self.config.preserve_whitespace {
155 text_node.to_string()
156 } else {
157 text_node.trim().to_string()
158 };
159
160 if text.len() >= self.config.min_text_length {
161 let block = Block::new(Content::text(&text), Some("text"));
162 doc.add_block(block, ¤t_heading_parent)?;
163 }
164 }
165 }
166
167 Ok(())
168 }
169
170 fn process_element(
172 &self,
173 doc: &mut Document,
174 parent_id: &BlockId,
175 element: ElementRef,
176 depth: usize,
177 ) -> Result<Option<BlockId>> {
178 if depth > self.config.max_depth {
179 return Ok(None);
180 }
181
182 let tag_name = element.value().name();
183
184 match tag_name {
185 "script" | "style" | "meta" | "link" | "head" | "noscript" => Ok(None),
187
188 "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
190 let level = self.parse_heading_level(tag_name).unwrap_or(1);
191 self.process_heading(doc, parent_id, element, level)
192 }
193
194 "p" => self.process_paragraph(doc, parent_id, element),
196
197 "ul" | "ol" => self.process_list(doc, parent_id, element),
199
200 "pre" => self.process_code_block(doc, parent_id, element),
202 "code" => {
203 let code_text = element.text().collect::<String>();
205 if !code_text.trim().is_empty() {
206 let formatted = format!("`{}`", code_text);
207 let block = Block::new(Content::text(&formatted), Some("code"));
208 Ok(Some(doc.add_block(block, parent_id)?))
209 } else {
210 Ok(None)
211 }
212 }
213
214 "blockquote" => self.process_blockquote(doc, parent_id, element),
216
217 "img" => self.process_image(doc, parent_id, element),
219
220 "a" => self.process_link(doc, parent_id, element),
222
223 "table" => self.process_table(doc, parent_id, element),
225
226 "div" | "section" | "article" | "main" | "aside" | "nav" | "header" | "footer"
228 | "span" | "figure" | "figcaption" => {
229 self.process_children(doc, parent_id, element, depth)?;
230 Ok(None)
231 }
232
233 "br" | "hr" => Ok(None),
235
236 _ => {
238 let text = self.extract_text_content(element);
239 if !text.is_empty() && text.len() >= self.config.min_text_length {
240 let block = Block::new(Content::text(&text), Some("text"));
241 Ok(Some(doc.add_block(block, parent_id)?))
242 } else {
243 self.process_children(doc, parent_id, element, depth)?;
245 Ok(None)
246 }
247 }
248 }
249 }
250
251 fn process_heading(
253 &self,
254 doc: &mut Document,
255 parent_id: &BlockId,
256 element: ElementRef,
257 level: usize,
258 ) -> Result<Option<BlockId>> {
259 let text = self.extract_text_content(element);
260 if text.is_empty() {
261 return Ok(None);
262 }
263
264 let adjusted_level = match self.config.heading_strategy {
265 HeadingStrategy::AsIs => level,
266 HeadingStrategy::Flatten(target) => target,
267 HeadingStrategy::InferFromNesting => level, };
269
270 let role = format!("heading{}", adjusted_level.clamp(1, 6));
271 let block = Block::new(Content::text(&text), Some(&role));
272 let block_id = doc.add_block(block, parent_id)?;
273
274 Ok(Some(block_id))
275 }
276
277 fn process_paragraph(
279 &self,
280 doc: &mut Document,
281 parent_id: &BlockId,
282 element: ElementRef,
283 ) -> Result<Option<BlockId>> {
284 let text = self.extract_formatted_text(element);
285 if text.is_empty() || text.len() < self.config.min_text_length {
286 return Ok(None);
287 }
288
289 let block = Block::new(Content::text(&text), Some("paragraph"));
290 Ok(Some(doc.add_block(block, parent_id)?))
291 }
292
293 fn process_list(
295 &self,
296 doc: &mut Document,
297 parent_id: &BlockId,
298 element: ElementRef,
299 ) -> Result<Option<BlockId>> {
300 let li_selector = Selector::parse("li").unwrap();
301 let items: Vec<String> = element
302 .select(&li_selector)
303 .map(|li| self.extract_formatted_text(li))
304 .filter(|s| !s.is_empty())
305 .collect();
306
307 if items.is_empty() {
308 return Ok(None);
309 }
310
311 let list_content = items.join("\n");
312 let block = Block::new(Content::text(&list_content), Some("list"));
313 Ok(Some(doc.add_block(block, parent_id)?))
314 }
315
316 fn process_code_block(
318 &self,
319 doc: &mut Document,
320 parent_id: &BlockId,
321 element: ElementRef,
322 ) -> Result<Option<BlockId>> {
323 let code_selector = Selector::parse("code").unwrap();
324 let code_element = element.select(&code_selector).next().unwrap_or(element);
325
326 let code_text = code_element.text().collect::<String>();
327 if code_text.trim().is_empty() {
328 return Ok(None);
329 }
330
331 let language = code_element
333 .value()
334 .attr("class")
335 .and_then(|class| {
336 class
337 .split_whitespace()
338 .find(|c| c.starts_with("language-") || c.starts_with("lang-"))
339 .map(|c| {
340 c.trim_start_matches("language-")
341 .trim_start_matches("lang-")
342 })
343 })
344 .unwrap_or("text");
345
346 let block = Block::new(Content::code(language, &code_text), Some("code"));
347 Ok(Some(doc.add_block(block, parent_id)?))
348 }
349
350 fn process_blockquote(
352 &self,
353 doc: &mut Document,
354 parent_id: &BlockId,
355 element: ElementRef,
356 ) -> Result<Option<BlockId>> {
357 let text = self.extract_formatted_text(element);
358 if text.is_empty() {
359 return Ok(None);
360 }
361
362 let block = Block::new(Content::text(&text), Some("quote"));
363 Ok(Some(doc.add_block(block, parent_id)?))
364 }
365
366 fn process_image(
368 &self,
369 doc: &mut Document,
370 parent_id: &BlockId,
371 element: ElementRef,
372 ) -> Result<Option<BlockId>> {
373 if !self.config.extract_images {
374 return Ok(None);
375 }
376
377 let src = element.value().attr("src").unwrap_or("");
378 let alt = element.value().attr("alt").unwrap_or("");
379
380 if src.is_empty() {
381 return Ok(None);
382 }
383
384 let media_source = if src.starts_with("data:") {
386 let base64_data = src.split(',').nth(1).unwrap_or("").to_string();
388 MediaSource::Base64(base64_data)
389 } else {
390 MediaSource::Url(src.to_string())
391 };
392
393 let media = ucm_core::Media::image(media_source).with_alt(alt);
394 let block = Block::new(Content::Media(media), Some("image"));
395 Ok(Some(doc.add_block(block, parent_id)?))
396 }
397
398 fn process_link(
400 &self,
401 doc: &mut Document,
402 parent_id: &BlockId,
403 element: ElementRef,
404 ) -> Result<Option<BlockId>> {
405 let text = self.extract_text_content(element);
406 let href = element.value().attr("href").unwrap_or("");
407
408 if text.is_empty() {
409 return Ok(None);
410 }
411
412 if self.config.extract_links && !href.is_empty() {
413 let link_text = format!("[{}]({})", text, href);
415 let block = Block::new(Content::text(&link_text), Some("link"));
416 Ok(Some(doc.add_block(block, parent_id)?))
417 } else {
418 let block = Block::new(Content::text(&text), Some("text"));
420 Ok(Some(doc.add_block(block, parent_id)?))
421 }
422 }
423
424 fn process_table(
426 &self,
427 doc: &mut Document,
428 parent_id: &BlockId,
429 element: ElementRef,
430 ) -> Result<Option<BlockId>> {
431 let row_selector = Selector::parse("tr").unwrap();
432 let cell_selector = Selector::parse("td, th").unwrap();
433
434 let rows: Vec<Vec<String>> = element
435 .select(&row_selector)
436 .map(|row| {
437 row.select(&cell_selector)
438 .map(|cell| self.extract_text_content(cell))
439 .collect()
440 })
441 .filter(|row: &Vec<String>| !row.is_empty())
442 .collect();
443
444 if rows.is_empty() {
445 return Ok(None);
446 }
447
448 let block = Block::new(Content::table(rows), Some("table"));
449 Ok(Some(doc.add_block(block, parent_id)?))
450 }
451
452 fn parse_heading_level(&self, tag_name: &str) -> Option<usize> {
454 match tag_name {
455 "h1" => Some(1),
456 "h2" => Some(2),
457 "h3" => Some(3),
458 "h4" => Some(4),
459 "h5" => Some(5),
460 "h6" => Some(6),
461 _ => None,
462 }
463 }
464
465 fn extract_text_content(&self, element: ElementRef) -> String {
467 let text: String = element.text().collect();
468 if self.config.preserve_whitespace {
469 text
470 } else {
471 text.split_whitespace().collect::<Vec<_>>().join(" ")
473 }
474 }
475
476 fn extract_formatted_text(&self, element: ElementRef) -> String {
478 let mut result = String::new();
479
480 for child in element.children() {
481 if let Some(child_element) = ElementRef::wrap(child) {
482 let tag_name = child_element.value().name();
483 let child_text = self.extract_formatted_text(child_element);
484
485 match tag_name {
486 "strong" | "b" => {
487 result.push_str("**");
488 result.push_str(&child_text);
489 result.push_str("**");
490 }
491 "em" | "i" => {
492 result.push('*');
493 result.push_str(&child_text);
494 result.push('*');
495 }
496 "code" => {
497 result.push('`');
498 result.push_str(&child_text);
499 result.push('`');
500 }
501 "a" if self.config.extract_links => {
502 let href = child_element.value().attr("href").unwrap_or("");
503 if !href.is_empty() {
504 result.push_str(&format!("[{}]({})", child_text, href));
505 } else {
506 result.push_str(&child_text);
507 }
508 }
509 "br" => {
510 result.push('\n');
511 }
512 _ => {
513 result.push_str(&child_text);
514 }
515 }
516 } else if let Some(text_node) = child.value().as_text() {
517 let text = if self.config.preserve_whitespace {
518 text_node.to_string()
519 } else {
520 text_node.split_whitespace().collect::<Vec<_>>().join(" ")
521 };
522 result.push_str(&text);
523 }
524 }
525
526 result.trim().to_string()
527 }
528}
529
530impl Default for HtmlParser {
531 fn default() -> Self {
532 Self::new()
533 }
534}
535
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a parser whose configuration differs from the default only in
    /// the fields set by the caller.
    fn parser_with(config: HtmlParserConfig) -> HtmlParser {
        HtmlParser::with_config(config)
    }

    /// A document with nested headings yields at least one block under the
    /// document root.
    #[test]
    fn test_heading_hierarchy() {
        let html = r#"<html><body>
            <h1>Main</h1>
            <p>Intro</p>
            <h2>Sub 1</h2>
            <p>Content 1</p>
            <h2>Sub 2</h2>
            <p>Content 2</p>
        </body></html>"#;

        let parser = HtmlParser::new();
        let doc = parser.parse(html).unwrap();

        let top_level = doc.children(&doc.root);
        assert!(!top_level.is_empty());
    }

    /// A `<pre><code>` snippet adds at least one block beyond what an empty
    /// document already holds.
    #[test]
    fn test_code_language_extraction() {
        let html = r#"<pre><code class="language-rust">fn main() {}</code></pre>"#;

        let parser = HtmlParser::new();
        let doc = parser.parse(html).unwrap();

        assert!(doc.block_count() >= 2);
    }

    /// Markup nested deeper than `max_depth` either parses successfully or
    /// fails with a resource-limit error; any other error is a failure.
    #[test]
    fn test_max_depth_limit() {
        let parser = parser_with(HtmlParserConfig {
            max_depth: 2,
            ..Default::default()
        });

        let html = "<div><div><div><div><div><p>Deep</p></div></div></div></div></div>";
        let result = parser.parse(html);

        assert!(matches!(result, Ok(_) | Err(HtmlError::ResourceLimit(_))));
    }

    /// With `Flatten(3)`, every heading block carries the `heading3` role.
    #[test]
    fn test_heading_strategy_flatten() {
        let parser = parser_with(HtmlParserConfig {
            heading_strategy: HeadingStrategy::Flatten(3),
            ..Default::default()
        });

        let html = "<h1>Title</h1><h2>Subtitle</h2>";
        let doc = parser.parse(html).unwrap();

        let heading_categories = doc
            .blocks
            .values()
            .filter_map(|block| block.metadata.semantic_role.as_ref())
            .map(|role| role.category.as_str())
            .filter(|category| category.starts_with("heading"));

        for category in heading_categories {
            assert_eq!(category, "heading3");
        }
    }
}