Skip to main content

ucp_translator_html/
lib.rs

//! HTML to UCM document translator.
//!
//! This crate provides translation from HTML documents to UCM's block-based
//! document model. It extracts semantic structure from HTML elements and
//! creates appropriate blocks with proper hierarchy.
//!
//! # Example
//!
//! ```
//! use ucp_translator_html::{HtmlParser, HtmlParserConfig};
//!
//! let html = r#"<html><body>
//!     <h1>Title</h1>
//!     <p>Some content here.</p>
//! </body></html>"#;
//!
//! let parser = HtmlParser::new();
//! let doc = parser.parse(html).unwrap();
//! ```
20
21mod error;
22mod parser;
23
24pub use error::{HtmlError, Result};
25pub use parser::{HeadingStrategy, HtmlParser, HtmlParserConfig};
26
27/// Parse HTML string into a UCM Document.
28///
29/// This is a convenience function that uses default configuration.
30pub fn parse_html(html: &str) -> Result<ucm_core::Document> {
31    HtmlParser::new().parse(html)
32}
33
#[cfg(test)]
mod tests {
    use super::*;

    /// A heading plus a paragraph must produce more than one block.
    #[test]
    fn test_parse_simple_html() {
        let markup = r#"<html><body>
            <h1>Hello World</h1>
            <p>This is a paragraph.</p>
        </body></html>"#;

        let block_total = parse_html(markup).unwrap().block_count();
        assert!(block_total > 1);
    }

    /// Mixed h1/h2 headings with paragraphs between them.
    #[test]
    fn test_parse_nested_structure() {
        let markup = r#"<html><body>
            <h1>Main Title</h1>
            <p>Intro paragraph</p>
            <h2>Section 1</h2>
            <p>Section 1 content</p>
            <h2>Section 2</h2>
            <p>Section 2 content</p>
        </body></html>"#;

        // Expected at minimum: root + h1 + h2 + h2 + paragraphs.
        let block_total = parse_html(markup).unwrap().block_count();
        assert!(block_total >= 5);
    }

    /// A paragraph containing an inline anchor element.
    #[test]
    fn test_parse_with_links() {
        let markup = r#"<html><body>
            <p>Check out <a href="https://example.com">this link</a>!</p>
        </body></html>"#;

        let block_total = parse_html(markup).unwrap().block_count();
        assert!(block_total >= 2);
    }

    /// Parsing with `extract_images` switched on in the parser configuration.
    #[test]
    fn test_parse_with_images() {
        let markup = r#"<html><body>
            <h1>Gallery</h1>
            <img src="https://example.com/image.jpg" alt="Test image">
        </body></html>"#;

        let parser = HtmlParser::with_config(HtmlParserConfig {
            extract_images: true,
            ..Default::default()
        });
        let block_total = parser.parse(markup).unwrap().block_count();

        assert!(block_total >= 2);
    }

    /// A `<pre><code>` element carrying a language class and multi-line text.
    #[test]
    fn test_parse_code_blocks() {
        let markup = r#"<html><body>
            <pre><code class="language-rust">fn main() {
    println!("Hello");
}</code></pre>
        </body></html>"#;

        let block_total = parse_html(markup).unwrap().block_count();
        assert!(block_total >= 2);
    }

    /// An unordered list with three items.
    #[test]
    fn test_parse_lists() {
        let markup = r#"<html><body>
            <ul>
                <li>Item 1</li>
                <li>Item 2</li>
                <li>Item 3</li>
            </ul>
        </body></html>"#;

        let block_total = parse_html(markup).unwrap().block_count();
        assert!(block_total >= 2);
    }

    /// A table with one header row and two data rows.
    #[test]
    fn test_parse_tables() {
        let markup = r#"<html><body>
            <table>
                <tr><th>Name</th><th>Age</th></tr>
                <tr><td>Alice</td><td>30</td></tr>
                <tr><td>Bob</td><td>25</td></tr>
            </table>
        </body></html>"#;

        let block_total = parse_html(markup).unwrap().block_count();
        assert!(block_total >= 2);
    }

    /// An empty body leaves only the root block.
    #[test]
    fn test_empty_html() {
        let block_total = parse_html("<html><body></body></html>")
            .unwrap()
            .block_count();
        assert_eq!(block_total, 1); // Just root
    }

    /// Unclosed tags must neither panic nor produce an error.
    #[test]
    fn test_malformed_html() {
        // Malformed input is expected to be handled gracefully.
        let outcome = parse_html("<p>Unclosed paragraph <b>bold");
        // No panic; partial parsing is acceptable as long as it is `Ok`.
        assert!(outcome.is_ok());
    }
}