Skip to main content

ucp_translator_html/
parser.rs

1//! HTML parser implementation.
2
3use crate::error::{HtmlError, Result};
4use scraper::{ElementRef, Html, Selector};
5use ucm_core::{Block, BlockId, Content, Document, MediaSource};
6
/// How heading levels found in the HTML are mapped onto heading roles.
///
/// The chosen level is always clamped to `1..=6` before the `headingN` role
/// is produced.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum HeadingStrategy {
    /// Keep the level of the source tag (`h1` → 1 … `h6` → 6).
    #[default]
    AsIs,
    /// Give every heading the same, caller-chosen level.
    Flatten(usize),
    /// Intended to derive levels from DOM nesting depth; the parser currently
    /// treats this exactly like [`HeadingStrategy::AsIs`].
    InferFromNesting,
}
18
19/// Configuration for HTML parsing
20#[derive(Debug, Clone)]
21pub struct HtmlParserConfig {
22    /// Whether to preserve whitespace in text nodes
23    pub preserve_whitespace: bool,
24    /// Whether to extract images as media blocks
25    pub extract_images: bool,
26    /// Whether to extract links and store href in edges
27    pub extract_links: bool,
28    /// Strategy for handling heading levels
29    pub heading_strategy: HeadingStrategy,
30    /// Maximum nesting depth to process
31    pub max_depth: usize,
32    /// Maximum number of blocks to create
33    pub max_blocks: usize,
34    /// Minimum text length to create a block (filters noise)
35    pub min_text_length: usize,
36}
37
38impl Default for HtmlParserConfig {
39    fn default() -> Self {
40        Self {
41            preserve_whitespace: false,
42            extract_images: true,
43            extract_links: true,
44            heading_strategy: HeadingStrategy::AsIs,
45            max_depth: 50,
46            max_blocks: 10000,
47            min_text_length: 1,
48        }
49    }
50}
51
52/// HTML to UCM document parser
53pub struct HtmlParser {
54    config: HtmlParserConfig,
55}
56
57impl HtmlParser {
58    /// Create a new parser with default configuration
59    pub fn new() -> Self {
60        Self {
61            config: HtmlParserConfig::default(),
62        }
63    }
64
65    /// Create a parser with custom configuration
66    pub fn with_config(config: HtmlParserConfig) -> Self {
67        Self { config }
68    }
69
70    /// Parse HTML string into a UCM Document
71    pub fn parse(&self, html: &str) -> Result<Document> {
72        let mut doc = Document::create();
73        let root = doc.root;
74
75        // Parse HTML
76        let fragment = Html::parse_document(html);
77
78        // Find body or use root element
79        let body_selector = Selector::parse("body").unwrap();
80        let body = fragment.select(&body_selector).next();
81
82        if let Some(body_element) = body {
83            self.process_children(&mut doc, &root, body_element, 0)?;
84        } else {
85            // No body tag, process entire document
86            if let Some(root_element) = fragment.root_element().first_child() {
87                if let Some(element) = ElementRef::wrap(root_element) {
88                    self.process_children(&mut doc, &root, element, 0)?;
89                }
90            }
91        }
92
93        Ok(doc)
94    }
95
96    /// Process all children of an element
97    fn process_children(
98        &self,
99        doc: &mut Document,
100        parent_id: &BlockId,
101        element: ElementRef,
102        depth: usize,
103    ) -> Result<()> {
104        if depth > self.config.max_depth {
105            return Err(HtmlError::ResourceLimit(format!(
106                "Maximum nesting depth {} exceeded",
107                self.config.max_depth
108            )));
109        }
110
111        if doc.block_count() > self.config.max_blocks {
112            return Err(HtmlError::ResourceLimit(format!(
113                "Maximum block count {} exceeded",
114                self.config.max_blocks
115            )));
116        }
117
118        let mut current_heading_parent = *parent_id;
119        let mut heading_stack: Vec<(usize, BlockId)> = vec![(0, *parent_id)];
120
121        for child in element.children() {
122            if let Some(child_element) = ElementRef::wrap(child) {
123                let tag_name = child_element.value().name();
124
125                // Handle headings specially for hierarchy
126                if let Some(level) = self.parse_heading_level(tag_name) {
127                    // Pop stack until we find a heading with lower level
128                    while heading_stack.len() > 1
129                        && heading_stack
130                            .last()
131                            .map(|(l, _)| *l >= level)
132                            .unwrap_or(false)
133                    {
134                        heading_stack.pop();
135                    }
136
137                    let heading_parent = heading_stack
138                        .last()
139                        .map(|(_, id)| *id)
140                        .unwrap_or(*parent_id);
141
142                    let heading_id =
143                        self.process_heading(doc, &heading_parent, child_element, level)?;
144
145                    if let Some(id) = heading_id {
146                        heading_stack.push((level, id));
147                        current_heading_parent = id;
148                    }
149                } else {
150                    // Non-heading elements go under current heading
151                    self.process_element(doc, &current_heading_parent, child_element, depth + 1)?;
152                }
153            } else if let Some(text_node) = child.value().as_text() {
154                let text = if self.config.preserve_whitespace {
155                    text_node.to_string()
156                } else {
157                    text_node.trim().to_string()
158                };
159
160                if text.len() >= self.config.min_text_length {
161                    let block = Block::new(Content::text(&text), Some("text"));
162                    doc.add_block(block, &current_heading_parent)?;
163                }
164            }
165        }
166
167        Ok(())
168    }
169
170    /// Process a single HTML element
171    fn process_element(
172        &self,
173        doc: &mut Document,
174        parent_id: &BlockId,
175        element: ElementRef,
176        depth: usize,
177    ) -> Result<Option<BlockId>> {
178        if depth > self.config.max_depth {
179            return Ok(None);
180        }
181
182        let tag_name = element.value().name();
183
184        match tag_name {
185            // Skip script, style, meta, etc.
186            "script" | "style" | "meta" | "link" | "head" | "noscript" => Ok(None),
187
188            // Headings (handled separately in process_children for hierarchy)
189            "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
190                let level = self.parse_heading_level(tag_name).unwrap_or(1);
191                self.process_heading(doc, parent_id, element, level)
192            }
193
194            // Paragraphs
195            "p" => self.process_paragraph(doc, parent_id, element),
196
197            // Lists
198            "ul" | "ol" => self.process_list(doc, parent_id, element),
199
200            // Code blocks
201            "pre" => self.process_code_block(doc, parent_id, element),
202            "code" => {
203                // Inline code - treat as text
204                let code_text = element.text().collect::<String>();
205                if !code_text.trim().is_empty() {
206                    let formatted = format!("`{}`", code_text);
207                    let block = Block::new(Content::text(&formatted), Some("code"));
208                    Ok(Some(doc.add_block(block, parent_id)?))
209                } else {
210                    Ok(None)
211                }
212            }
213
214            // Blockquotes
215            "blockquote" => self.process_blockquote(doc, parent_id, element),
216
217            // Images
218            "img" => self.process_image(doc, parent_id, element),
219
220            // Links
221            "a" => self.process_link(doc, parent_id, element),
222
223            // Tables
224            "table" => self.process_table(doc, parent_id, element),
225
226            // Container elements - process children
227            "div" | "section" | "article" | "main" | "aside" | "nav" | "header" | "footer"
228            | "span" | "figure" | "figcaption" => {
229                self.process_children(doc, parent_id, element, depth)?;
230                Ok(None)
231            }
232
233            // Line breaks
234            "br" | "hr" => Ok(None),
235
236            // Default: try to extract text content
237            _ => {
238                let text = self.extract_text_content(element);
239                if !text.is_empty() && text.len() >= self.config.min_text_length {
240                    let block = Block::new(Content::text(&text), Some("text"));
241                    Ok(Some(doc.add_block(block, parent_id)?))
242                } else {
243                    // Process children for unknown container elements
244                    self.process_children(doc, parent_id, element, depth)?;
245                    Ok(None)
246                }
247            }
248        }
249    }
250
251    /// Process a heading element
252    fn process_heading(
253        &self,
254        doc: &mut Document,
255        parent_id: &BlockId,
256        element: ElementRef,
257        level: usize,
258    ) -> Result<Option<BlockId>> {
259        let text = self.extract_text_content(element);
260        if text.is_empty() {
261            return Ok(None);
262        }
263
264        let adjusted_level = match self.config.heading_strategy {
265            HeadingStrategy::AsIs => level,
266            HeadingStrategy::Flatten(target) => target,
267            HeadingStrategy::InferFromNesting => level, // Could be enhanced
268        };
269
270        let role = format!("heading{}", adjusted_level.clamp(1, 6));
271        let block = Block::new(Content::text(&text), Some(&role));
272        let block_id = doc.add_block(block, parent_id)?;
273
274        Ok(Some(block_id))
275    }
276
277    /// Process a paragraph element
278    fn process_paragraph(
279        &self,
280        doc: &mut Document,
281        parent_id: &BlockId,
282        element: ElementRef,
283    ) -> Result<Option<BlockId>> {
284        let text = self.extract_formatted_text(element);
285        if text.is_empty() || text.len() < self.config.min_text_length {
286            return Ok(None);
287        }
288
289        let block = Block::new(Content::text(&text), Some("paragraph"));
290        Ok(Some(doc.add_block(block, parent_id)?))
291    }
292
293    /// Process a list (ul/ol)
294    fn process_list(
295        &self,
296        doc: &mut Document,
297        parent_id: &BlockId,
298        element: ElementRef,
299    ) -> Result<Option<BlockId>> {
300        let li_selector = Selector::parse("li").unwrap();
301        let items: Vec<String> = element
302            .select(&li_selector)
303            .map(|li| self.extract_formatted_text(li))
304            .filter(|s| !s.is_empty())
305            .collect();
306
307        if items.is_empty() {
308            return Ok(None);
309        }
310
311        let list_content = items.join("\n");
312        let block = Block::new(Content::text(&list_content), Some("list"));
313        Ok(Some(doc.add_block(block, parent_id)?))
314    }
315
316    /// Process a code block (pre/code)
317    fn process_code_block(
318        &self,
319        doc: &mut Document,
320        parent_id: &BlockId,
321        element: ElementRef,
322    ) -> Result<Option<BlockId>> {
323        let code_selector = Selector::parse("code").unwrap();
324        let code_element = element.select(&code_selector).next().unwrap_or(element);
325
326        let code_text = code_element.text().collect::<String>();
327        if code_text.trim().is_empty() {
328            return Ok(None);
329        }
330
331        // Try to extract language from class
332        let language = code_element
333            .value()
334            .attr("class")
335            .and_then(|class| {
336                class
337                    .split_whitespace()
338                    .find(|c| c.starts_with("language-") || c.starts_with("lang-"))
339                    .map(|c| {
340                        c.trim_start_matches("language-")
341                            .trim_start_matches("lang-")
342                    })
343            })
344            .unwrap_or("text");
345
346        let block = Block::new(Content::code(language, &code_text), Some("code"));
347        Ok(Some(doc.add_block(block, parent_id)?))
348    }
349
350    /// Process a blockquote
351    fn process_blockquote(
352        &self,
353        doc: &mut Document,
354        parent_id: &BlockId,
355        element: ElementRef,
356    ) -> Result<Option<BlockId>> {
357        let text = self.extract_formatted_text(element);
358        if text.is_empty() {
359            return Ok(None);
360        }
361
362        let block = Block::new(Content::text(&text), Some("quote"));
363        Ok(Some(doc.add_block(block, parent_id)?))
364    }
365
366    /// Process an image element
367    fn process_image(
368        &self,
369        doc: &mut Document,
370        parent_id: &BlockId,
371        element: ElementRef,
372    ) -> Result<Option<BlockId>> {
373        if !self.config.extract_images {
374            return Ok(None);
375        }
376
377        let src = element.value().attr("src").unwrap_or("");
378        let alt = element.value().attr("alt").unwrap_or("");
379
380        if src.is_empty() {
381            return Ok(None);
382        }
383
384        // Create media content
385        let media_source = if src.starts_with("data:") {
386            // Base64 encoded image
387            let base64_data = src.split(',').nth(1).unwrap_or("").to_string();
388            MediaSource::Base64(base64_data)
389        } else {
390            MediaSource::Url(src.to_string())
391        };
392
393        let media = ucm_core::Media::image(media_source).with_alt(alt);
394        let block = Block::new(Content::Media(media), Some("image"));
395        Ok(Some(doc.add_block(block, parent_id)?))
396    }
397
398    /// Process a link element
399    fn process_link(
400        &self,
401        doc: &mut Document,
402        parent_id: &BlockId,
403        element: ElementRef,
404    ) -> Result<Option<BlockId>> {
405        let text = self.extract_text_content(element);
406        let href = element.value().attr("href").unwrap_or("");
407
408        if text.is_empty() {
409            return Ok(None);
410        }
411
412        if self.config.extract_links && !href.is_empty() {
413            // Create link in markdown format
414            let link_text = format!("[{}]({})", text, href);
415            let block = Block::new(Content::text(&link_text), Some("link"));
416            Ok(Some(doc.add_block(block, parent_id)?))
417        } else {
418            // Just extract as text
419            let block = Block::new(Content::text(&text), Some("text"));
420            Ok(Some(doc.add_block(block, parent_id)?))
421        }
422    }
423
424    /// Process a table element
425    fn process_table(
426        &self,
427        doc: &mut Document,
428        parent_id: &BlockId,
429        element: ElementRef,
430    ) -> Result<Option<BlockId>> {
431        let row_selector = Selector::parse("tr").unwrap();
432        let cell_selector = Selector::parse("td, th").unwrap();
433
434        let rows: Vec<Vec<String>> = element
435            .select(&row_selector)
436            .map(|row| {
437                row.select(&cell_selector)
438                    .map(|cell| self.extract_text_content(cell))
439                    .collect()
440            })
441            .filter(|row: &Vec<String>| !row.is_empty())
442            .collect();
443
444        if rows.is_empty() {
445            return Ok(None);
446        }
447
448        let block = Block::new(Content::table(rows), Some("table"));
449        Ok(Some(doc.add_block(block, parent_id)?))
450    }
451
452    /// Parse heading level from tag name
453    fn parse_heading_level(&self, tag_name: &str) -> Option<usize> {
454        match tag_name {
455            "h1" => Some(1),
456            "h2" => Some(2),
457            "h3" => Some(3),
458            "h4" => Some(4),
459            "h5" => Some(5),
460            "h6" => Some(6),
461            _ => None,
462        }
463    }
464
465    /// Extract plain text content from an element
466    fn extract_text_content(&self, element: ElementRef) -> String {
467        let text: String = element.text().collect();
468        if self.config.preserve_whitespace {
469            text
470        } else {
471            // Normalize whitespace
472            text.split_whitespace().collect::<Vec<_>>().join(" ")
473        }
474    }
475
476    /// Extract text with some formatting preserved (bold, italic, etc.)
477    fn extract_formatted_text(&self, element: ElementRef) -> String {
478        let mut result = String::new();
479
480        for child in element.children() {
481            if let Some(child_element) = ElementRef::wrap(child) {
482                let tag_name = child_element.value().name();
483                let child_text = self.extract_formatted_text(child_element);
484
485                match tag_name {
486                    "strong" | "b" => {
487                        result.push_str("**");
488                        result.push_str(&child_text);
489                        result.push_str("**");
490                    }
491                    "em" | "i" => {
492                        result.push('*');
493                        result.push_str(&child_text);
494                        result.push('*');
495                    }
496                    "code" => {
497                        result.push('`');
498                        result.push_str(&child_text);
499                        result.push('`');
500                    }
501                    "a" if self.config.extract_links => {
502                        let href = child_element.value().attr("href").unwrap_or("");
503                        if !href.is_empty() {
504                            result.push_str(&format!("[{}]({})", child_text, href));
505                        } else {
506                            result.push_str(&child_text);
507                        }
508                    }
509                    "br" => {
510                        result.push('\n');
511                    }
512                    _ => {
513                        result.push_str(&child_text);
514                    }
515                }
516            } else if let Some(text_node) = child.value().as_text() {
517                let text = if self.config.preserve_whitespace {
518                    text_node.to_string()
519                } else {
520                    text_node.split_whitespace().collect::<Vec<_>>().join(" ")
521                };
522                result.push_str(&text);
523            }
524        }
525
526        result.trim().to_string()
527    }
528}
529
530impl Default for HtmlParser {
531    fn default() -> Self {
532        Self::new()
533    }
534}
535
#[cfg(test)]
mod tests {
    use super::*;

    /// Headings nest by level and content attaches to the latest heading.
    #[test]
    fn test_heading_hierarchy() {
        let html = r#"<html><body>
            <h1>Main</h1>
            <p>Intro</p>
            <h2>Sub 1</h2>
            <p>Content 1</p>
            <h2>Sub 2</h2>
            <p>Content 2</p>
        </body></html>"#;

        let doc = HtmlParser::new().parse(html).unwrap();

        // Only the h1 hangs directly off the root.
        let root_children = doc.children(&doc.root);
        assert_eq!(root_children.len(), 1);

        // The h1 owns its intro paragraph and both h2 sections.
        let h1_id = *root_children.first().expect("h1 block under root");
        assert_eq!(doc.children(&h1_id).len(), 3);
    }

    /// A `<pre><code class="language-…">` pair yields a code block.
    #[test]
    fn test_code_language_extraction() {
        let html = r#"<pre><code class="language-rust">fn main() {}</code></pre>"#;
        let doc = HtmlParser::new().parse(html).unwrap();

        // Root plus the extracted code block.
        assert!(doc.block_count() >= 2);
    }

    /// Nesting beyond `max_depth` must not crash the parser.
    #[test]
    fn test_max_depth_limit() {
        let config = HtmlParserConfig {
            max_depth: 2,
            ..Default::default()
        };
        let parser = HtmlParser::with_config(config);

        // Deeply nested HTML
        let html = "<div><div><div><div><div><p>Deep</p></div></div></div></div></div>";
        let result = parser.parse(html);

        // Either truncation or a resource-limit error is acceptable; the
        // important thing is that deep input is handled gracefully.
        assert!(result.is_ok() || matches!(result, Err(HtmlError::ResourceLimit(_))));
    }

    /// `Flatten(n)` assigns level `n` to every heading.
    #[test]
    fn test_heading_strategy_flatten() {
        let config = HtmlParserConfig {
            heading_strategy: HeadingStrategy::Flatten(3),
            ..Default::default()
        };
        let parser = HtmlParser::with_config(config);

        let html = "<h1>Title</h1><h2>Subtitle</h2>";
        let doc = parser.parse(html).unwrap();

        // Count the headings so the check cannot pass vacuously.
        let mut heading_count = 0;
        for block in doc.blocks.values() {
            if let Some(ref role) = block.metadata.semantic_role {
                if role.category.as_str().starts_with("heading") {
                    assert_eq!(role.category.as_str(), "heading3");
                    heading_count += 1;
                }
            }
        }
        assert_eq!(heading_count, 2);
    }
}