ucp_translator_html/
lib.rs1mod error;
22mod parser;
23
24pub use error::{HtmlError, Result};
25pub use parser::{HeadingStrategy, HtmlParser, HtmlParserConfig};
26
27pub fn parse_html(html: &str) -> Result<ucm_core::Document> {
31 HtmlParser::new().parse(html)
32}
33
#[cfg(test)]
mod tests {
    use super::*;

    /// A heading followed by a paragraph must yield more than one block.
    #[test]
    fn test_parse_simple_html() {
        let html = r#"<html><body>
            <h1>Hello World</h1>
            <p>This is a paragraph.</p>
        </body></html>"#;

        let doc = parse_html(html).unwrap();
        assert!(doc.block_count() > 1);
    }

    /// An `h1` with two `h2` sections, each with a paragraph, must yield at
    /// least five blocks.
    #[test]
    fn test_parse_nested_structure() {
        let html = r#"<html><body>
            <h1>Main Title</h1>
            <p>Intro paragraph</p>
            <h2>Section 1</h2>
            <p>Section 1 content</p>
            <h2>Section 2</h2>
            <p>Section 2 content</p>
        </body></html>"#;

        let doc = parse_html(html).unwrap();

        assert!(doc.block_count() >= 5);
    }

    /// A paragraph containing an inline anchor must still parse and yield at
    /// least two blocks.
    #[test]
    fn test_parse_with_links() {
        let html = r#"<html><body>
            <p>Check out <a href="https://example.com">this link</a>!</p>
        </body></html>"#;

        let doc = parse_html(html).unwrap();
        assert!(doc.block_count() >= 2);
    }

    /// Parsing with `extract_images` enabled must succeed on a document that
    /// contains an `<img>` and yield at least two blocks.
    #[test]
    fn test_parse_with_images() {
        let html = r#"<html><body>
            <h1>Gallery</h1>
            <img src="https://example.com/image.jpg" alt="Test image">
        </body></html>"#;

        // Only `extract_images` is overridden; every other option keeps its default.
        let config = HtmlParserConfig {
            extract_images: true,
            ..Default::default()
        };
        let parser = HtmlParser::with_config(config);
        let doc = parser.parse(html).unwrap();

        assert!(doc.block_count() >= 2);
    }

    /// A `<pre><code>` element with a language class must parse and yield at
    /// least two blocks.
    #[test]
    fn test_parse_code_blocks() {
        // The code sample is deliberately not indented with the surrounding
        // HTML: whitespace inside `<pre>` is part of the element's content.
        let html = r#"<html><body>
            <pre><code class="language-rust">fn main() {
    println!("Hello");
}</code></pre>
        </body></html>"#;

        let doc = parse_html(html).unwrap();
        assert!(doc.block_count() >= 2);
    }

    /// An unordered list with three items must parse and yield at least two
    /// blocks.
    #[test]
    fn test_parse_lists() {
        let html = r#"<html><body>
            <ul>
                <li>Item 1</li>
                <li>Item 2</li>
                <li>Item 3</li>
            </ul>
        </body></html>"#;

        let doc = parse_html(html).unwrap();
        assert!(doc.block_count() >= 2);
    }

    /// A table with a header row and two data rows must parse and yield at
    /// least two blocks.
    #[test]
    fn test_parse_tables() {
        let html = r#"<html><body>
            <table>
                <tr><th>Name</th><th>Age</th></tr>
                <tr><td>Alice</td><td>30</td></tr>
                <tr><td>Bob</td><td>25</td></tr>
            </table>
        </body></html>"#;

        let doc = parse_html(html).unwrap();
        assert!(doc.block_count() >= 2);
    }

    /// An empty `<body>` must yield exactly one block.
    #[test]
    fn test_empty_html() {
        let html = "<html><body></body></html>";
        let doc = parse_html(html).unwrap();
        // NOTE(review): the single block is presumably the document root —
        // confirm against `ucm_core::Document`.
        assert_eq!(doc.block_count(), 1);
    }

    /// Unclosed tags must not make parsing fail.
    #[test]
    fn test_malformed_html() {
        let html = "<p>Unclosed paragraph <b>bold";
        let result = parse_html(html);
        assert!(result.is_ok());
    }
}