1use std::path::PathBuf;
7
8use scraper::{Html, Node, Selector};
9
10use oxipdf_ir::node::{ContentVariant, ImageContent, LinkContent, LinkTarget};
11use oxipdf_ir::semantic::SemanticRole;
12use oxipdf_ir::style::{Display, ResolvedStyle};
13use oxipdf_ir::tree::StyledTreeBuilder;
14use oxipdf_ir::units::Pt;
15use oxipdf_ir::{IrVersion, TextContent};
16
17use crate::css::{self, apply_declarations, parse_declarations};
18use crate::elements::{self, heading_font_size};
19use crate::error::HtmlError;
20
21use super::cascade::{
22 apply_important_stylesheet_rules, apply_matching_rules, apply_normal_stylesheet_rules,
23};
24use super::stylesheets::{collect_link_stylesheets, collect_style_rules};
25
/// Options controlling how an HTML document is converted into a styled tree.
///
/// `ConvertOptions::default()` supplies no extra CSS and no base directory.
#[derive(Clone, Debug, Default)]
pub struct ConvertOptions {
    /// Additional stylesheet text. When non-empty it is parsed and its rules
    /// are appended after the rules collected from the document itself.
    pub extra_css: String,
    /// Directory handed to the `<link>` stylesheet collector.
    // NOTE(review): presumably used to resolve relative `href`s of linked
    // stylesheets, which are skipped when this is `None` — confirm against
    // `collect_link_stylesheets`.
    pub base_dir: Option<PathBuf>,
}
35
36pub fn html_to_tree(html: &str) -> Result<oxipdf_ir::tree::StyledTree, HtmlError> {
41 html_to_tree_with_options(html, &ConvertOptions::default())
42}
43
44pub fn html_to_tree_with_css(
49 html: &str,
50 extra_css: &str,
51) -> Result<oxipdf_ir::tree::StyledTree, HtmlError> {
52 html_to_tree_with_options(
53 html,
54 &ConvertOptions {
55 extra_css: extra_css.to_string(),
56 ..Default::default()
57 },
58 )
59}
60
61pub fn html_to_tree_with_options(
66 html: &str,
67 options: &ConvertOptions,
68) -> Result<oxipdf_ir::tree::StyledTree, HtmlError> {
69 let document = Html::parse_document(html);
70
71 let mut rules = collect_link_stylesheets(&document, options.base_dir.as_deref());
73 rules.extend(collect_style_rules(&document));
74 if !options.extra_css.is_empty() {
75 rules.extend(css::parse_stylesheet(&options.extra_css));
76 }
77
78 let mut builder = StyledTreeBuilder::new(IrVersion::new(1, 0));
79
80 let body_sel = Selector::parse("body").expect("'body' is a valid CSS selector");
82 let body_node = document
83 .select(&body_sel)
84 .next()
85 .map(|el| el.id())
86 .unwrap_or(document.root_element().id());
87
88 let mut root_style = ResolvedStyle::default();
90 root_style.layout.display = Display::Block;
91 let root_id = builder.add_node(
92 ContentVariant::Container,
93 root_style,
94 Some(SemanticRole::Document),
95 None,
96 );
97
98 let body_ref = document
100 .tree
101 .get(body_node)
102 .ok_or(HtmlError::EmptyDocument)?;
103 convert_children(&document, body_ref, root_id, &rules, &mut builder)?;
104
105 if builder.len() < 2 {
106 return Err(HtmlError::EmptyDocument);
107 }
108
109 Ok(builder.build()?)
110}
111
112fn convert_children(
114 document: &Html,
115 parent_node: ego_tree::NodeRef<'_, Node>,
116 parent_id: oxipdf_ir::node::NodeId,
117 rules: &[crate::css::CssRule],
118 builder: &mut StyledTreeBuilder,
119) -> Result<(), HtmlError> {
120 for child in parent_node.children() {
121 match child.value() {
122 Node::Text(text) => {
123 let t = text.text.to_string();
124 if !t.trim().is_empty() {
125 let mut style = ResolvedStyle::default();
126 style.layout.display = Display::Inline;
127 builder.add_child(
128 parent_id,
129 ContentVariant::Text(TextContent::new(&t)),
130 style,
131 None,
132 None,
133 );
134 }
135 }
136 Node::Element(el) => {
137 convert_element(document, child, el, parent_id, rules, builder)?;
138 }
139 _ => {} }
141 }
142 Ok(())
143}
144
145fn convert_element(
147 document: &Html,
148 node_ref: ego_tree::NodeRef<'_, Node>,
149 el: &scraper::node::Element,
150 parent_id: oxipdf_ir::node::NodeId,
151 rules: &[crate::css::CssRule],
152 builder: &mut StyledTreeBuilder,
153) -> Result<(), HtmlError> {
154 let tag = el.name().to_lowercase();
155
156 if matches!(
158 tag.as_str(),
159 "script" | "style" | "meta" | "link" | "head" | "title"
160 ) {
161 return Ok(());
162 }
163
164 if matches!(
167 tag.as_str(),
168 "thead" | "tbody" | "tfoot" | "tr" | "td" | "th" | "caption" | "colgroup" | "col"
169 ) {
170 return Ok(());
171 }
172
173 if tag == "br" {
175 let mut style = ResolvedStyle::default();
176 style.layout.display = Display::Inline;
177 builder.add_child(
178 parent_id,
179 ContentVariant::Text(TextContent::new("\n")),
180 style,
181 None,
182 None,
183 );
184 return Ok(());
185 }
186
187 let info = elements::element_info(&tag);
188 let element_id = el.attr("id").map(|s| s.to_string());
189
190 let mut style = ResolvedStyle::default();
192 style.layout.display = info.default_display;
193
194 if let Some(SemanticRole::Heading { level }) = info.role {
196 style.typography.font_size = Pt::new(heading_font_size(level));
197 }
198
199 let inline_css = el.attr("style");
212
213 if inline_css.is_some() {
214 apply_normal_stylesheet_rules(document, node_ref.id(), &mut style, rules);
215 } else {
216 apply_matching_rules(document, node_ref.id(), &mut style, rules);
217 }
218
219 info.style_overrides.apply(&mut style);
221
222 if info.style_overrides.is_monospace && style.typography.font_families.is_empty() {
224 style.typography.font_families = vec!["monospace".to_string()];
225 }
226
227 if let Some(inline_css) = inline_css {
228 let decls = parse_declarations(inline_css);
229
230 let normal: Vec<_> = decls.iter().filter(|d| !d.important).cloned().collect();
232 if !normal.is_empty() {
233 apply_declarations(&mut style, &normal);
234 }
235
236 apply_important_stylesheet_rules(document, node_ref.id(), &mut style, rules);
238
239 let important: Vec<_> = decls.iter().filter(|d| d.important).cloned().collect();
241 if !important.is_empty() {
242 apply_declarations(&mut style, &important);
243 }
244 }
245
246 match tag.as_str() {
248 "table" => {
249 return super::table::convert_table(
250 document, node_ref, parent_id, style, rules, element_id, builder,
251 );
252 }
253 "img" => {
254 return convert_img(el, parent_id, style, info.role, element_id, builder);
255 }
256 "a" => {
257 return convert_link(
258 document, node_ref, el, parent_id, style, rules, element_id, builder,
259 );
260 }
261 "hr" => {
262 style.visual.border_top = oxipdf_ir::style::visual::BorderSide {
263 width: Pt::new(1.0),
264 style: oxipdf_ir::style::visual::BorderStyle::Solid,
265 color: oxipdf_ir::color::Color::rgb(0.8, 0.8, 0.8),
266 };
267 style.layout.margin_top = oxipdf_ir::Dimension::Length(Pt::new(6.0));
268 style.layout.margin_bottom = oxipdf_ir::Dimension::Length(Pt::new(6.0));
269 builder.add_child(
270 parent_id,
271 ContentVariant::Container,
272 style,
273 None,
274 element_id,
275 );
276 return Ok(());
277 }
278 _ => {}
279 }
280
281 let node_id = builder.add_child(parent_id, info.content, style, info.role, element_id);
283 convert_children(document, node_ref, node_id, rules, builder)?;
284
285 Ok(())
286}
287
288fn convert_img(
290 el: &scraper::node::Element,
291 parent_id: oxipdf_ir::node::NodeId,
292 style: ResolvedStyle,
293 role: Option<SemanticRole>,
294 element_id: Option<String>,
295 builder: &mut StyledTreeBuilder,
296) -> Result<(), HtmlError> {
297 let src = el.attr("src").unwrap_or_default();
298 let alt = el.attr("alt").map(|s| s.to_string());
299 let width = el
300 .attr("width")
301 .and_then(|w| w.parse::<f64>().ok())
302 .unwrap_or(100.0);
303 let height = el
304 .attr("height")
305 .and_then(|h| h.parse::<f64>().ok())
306 .unwrap_or(100.0);
307
308 if let Some((data, format)) = super::uri::parse_data_uri(src) {
310 let mut img = ImageContent::with_dimensions(
311 data,
312 format,
313 Pt::new(width * 0.75),
314 Pt::new(height * 0.75),
315 );
316 if let Some(alt_text) = alt {
317 img = img.with_alt_text(alt_text);
318 }
319 builder.add_child(
320 parent_id,
321 ContentVariant::Image(img),
322 style,
323 role.or(Some(SemanticRole::Figure)),
324 element_id,
325 );
326 }
327 Ok(())
330}
331
332#[allow(clippy::too_many_arguments)]
334fn convert_link(
335 document: &Html,
336 node_ref: ego_tree::NodeRef<'_, Node>,
337 el: &scraper::node::Element,
338 parent_id: oxipdf_ir::node::NodeId,
339 mut style: ResolvedStyle,
340 rules: &[crate::css::CssRule],
341 element_id: Option<String>,
342 builder: &mut StyledTreeBuilder,
343) -> Result<(), HtmlError> {
344 let href = el.attr("href").unwrap_or_default().to_string();
345 let target = if let Some(fragment) = href.strip_prefix('#') {
346 LinkTarget::Internal(fragment.to_string())
347 } else {
348 LinkTarget::External(href)
349 };
350
351 if style.typography.color == oxipdf_ir::color::Color::BLACK {
353 style.typography.color = oxipdf_ir::color::Color::rgb(0.0, 0.0, 0.8);
354 }
355 if style.typography.text_decoration == oxipdf_ir::style::typography::TextDecoration::None {
356 style.typography.text_decoration = oxipdf_ir::style::typography::TextDecoration::Underline;
357 }
358 style.layout.display = Display::Inline;
359
360 let link_id = builder.add_child(
361 parent_id,
362 ContentVariant::Link(LinkContent { target }),
363 style,
364 None,
365 element_id,
366 );
367
368 convert_children(document, node_ref, link_id, rules, builder)?;
369 Ok(())
370}
371
#[cfg(test)]
mod tests {
    //! Conversion tests. Every test that checks a property of the paragraph
    //! node also asserts that such a node was actually found, so that a
    //! missing paragraph cannot make the test pass vacuously.

    use super::*;
    use oxipdf_ir::node::LinkTarget;
    use oxipdf_ir::style::typography::FontStyle;

    #[test]
    fn simple_paragraph() {
        let tree = html_to_tree("<p>Hello world</p>").unwrap();
        // Expect at least three nodes (presumably root, paragraph, text).
        assert!(tree.node_count() >= 3);
    }

    #[test]
    fn headings_create_semantic_roles() {
        let tree = html_to_tree("<h1>Title</h1><h2>Sub</h2>").unwrap();
        let mut found_h1 = false;
        let mut found_h2 = false;
        for node in tree.iter_nodes() {
            if node.semantic_role == Some(SemanticRole::Heading { level: 1 }) {
                found_h1 = true;
            }
            if node.semantic_role == Some(SemanticRole::Heading { level: 2 }) {
                found_h2 = true;
            }
        }
        assert!(found_h1, "should have H1");
        assert!(found_h2, "should have H2");
    }

    #[test]
    fn inline_elements_styled() {
        let tree = html_to_tree("<p><strong>bold</strong> and <em>italic</em></p>").unwrap();
        let mut found_bold = false;
        let mut found_italic = false;
        for node in tree.iter_nodes() {
            if node.style.typography.font_weight == 700 {
                found_bold = true;
            }
            if node.style.typography.font_style == FontStyle::Italic {
                found_italic = true;
            }
        }
        assert!(found_bold, "should have bold");
        assert!(found_italic, "should have italic");
    }

    #[test]
    fn style_block_applied() {
        let html = r##"
 <style>p { color: #ff0000; font-size: 14pt; }</style>
 <p>Red text</p>
 "##;
        let tree = html_to_tree(html).unwrap();
        let mut found = false;
        for node in tree.iter_nodes() {
            if node.semantic_role == Some(SemanticRole::Paragraph) {
                found = true;
                assert!(
                    (node.style.typography.font_size.get() - 14.0).abs() < 0.01,
                    "font size should be 14pt"
                );
            }
        }
        assert!(found, "should find paragraph");
    }

    #[test]
    fn inline_style_overrides_stylesheet() {
        let html = r##"
 <style>p { font-size: 10pt; }</style>
 <p style="font-size: 20pt">Big text</p>
 "##;
        let tree = html_to_tree(html).unwrap();
        let mut found = false;
        for node in tree.iter_nodes() {
            if node.semantic_role == Some(SemanticRole::Paragraph) {
                found = true;
                assert!(
                    (node.style.typography.font_size.get() - 20.0).abs() < 0.01,
                    "inline style should override stylesheet"
                );
            }
        }
        assert!(found, "should find paragraph");
    }

    #[test]
    fn extra_css_applied() {
        let html = "<p>Styled</p>";
        let css = "p { font-size: 18pt; }";
        let tree = html_to_tree_with_css(html, css).unwrap();
        let mut found = false;
        for node in tree.iter_nodes() {
            if node.semantic_role == Some(SemanticRole::Paragraph) {
                found = true;
                assert!((node.style.typography.font_size.get() - 18.0).abs() < 0.01);
            }
        }
        assert!(found, "should find paragraph");
    }

    #[test]
    fn empty_body_returns_error() {
        assert!(matches!(
            html_to_tree("<html><body></body></html>"),
            Err(HtmlError::EmptyDocument)
        ));
    }

    #[test]
    fn br_creates_newline_text() {
        let tree = html_to_tree("<p>Line 1<br>Line 2</p>").unwrap();
        let mut found_newline = false;
        for node in tree.iter_nodes() {
            if let ContentVariant::Text(ref t) = node.content {
                if t.text.contains('\n') {
                    found_newline = true;
                }
            }
        }
        assert!(found_newline, "should have newline from <br>");
    }

    #[test]
    fn link_creates_link_node() {
        let tree = html_to_tree(r#"<a href="https://example.com">Click</a>"#).unwrap();
        let mut found_link = false;
        for node in tree.iter_nodes() {
            if let ContentVariant::Link(ref l) = node.content {
                if let LinkTarget::External(ref url) = l.target {
                    if url == "https://example.com" {
                        found_link = true;
                    }
                }
            }
        }
        assert!(found_link, "should have external link");
    }

    #[test]
    fn important_overrides_higher_specificity() {
        let html = r##"
 <style>
 #specific { font-size: 30pt; }
 p { font-size: 14pt !important; }
 </style>
 <p id="specific">Text</p>
 "##;
        let tree = html_to_tree(html).unwrap();
        let mut found = false;
        for node in tree.iter_nodes() {
            if node.semantic_role == Some(SemanticRole::Paragraph) {
                found = true;
                assert!(
                    (node.style.typography.font_size.get() - 14.0).abs() < 0.01,
                    "!important should override #id specificity, got {}",
                    node.style.typography.font_size.get()
                );
            }
        }
        assert!(found, "should find paragraph");
    }

    #[test]
    fn important_overrides_inline_style() {
        let html = r##"
 <style>p { color: #ff0000 !important; }</style>
 <p style="color: #0000ff">Text</p>
 "##;
        let tree = html_to_tree(html).unwrap();
        let mut found = false;
        for node in tree.iter_nodes() {
            if node.semantic_role == Some(SemanticRole::Paragraph) {
                found = true;
                match node.style.typography.color {
                    oxipdf_ir::color::Color::Srgb { r, b, .. } => {
                        assert!(
                            r > 0.9 && b < 0.1,
                            "!important red should override inline blue"
                        );
                    }
                    _ => panic!("expected Srgb color"),
                }
            }
        }
        assert!(found, "should find paragraph");
    }

    #[test]
    fn link_stylesheet_loaded() {
        // A per-process directory keeps concurrent test runs from clobbering
        // each other's fixture.
        let dir = std::env::temp_dir().join(format!("oxipdf_html_test_{}", std::process::id()));
        std::fs::create_dir_all(&dir).expect("temp test directory should be creatable");
        let css_path = dir.join("test_style.css");
        std::fs::write(&css_path, "p { font-size: 22pt; }").unwrap();

        let html = r#"
 <link rel="stylesheet" href="test_style.css">
 <p>Styled from file</p>
 "#;
        let options = ConvertOptions {
            base_dir: Some(dir.clone()),
            ..Default::default()
        };
        let result = html_to_tree_with_options(html, &options);
        // Clean up before asserting so a failing assertion leaves nothing
        // behind; the conversion has already finished at this point.
        let _ = std::fs::remove_dir_all(&dir);

        let tree = result.unwrap();
        let mut found = false;
        for node in tree.iter_nodes() {
            if node.semantic_role == Some(SemanticRole::Paragraph) {
                found = true;
                assert!(
                    (node.style.typography.font_size.get() - 22.0).abs() < 0.01,
                    "should apply CSS from linked file, got {}",
                    node.style.typography.font_size.get()
                );
            }
        }
        assert!(found, "should find paragraph");
    }

    #[test]
    fn link_stylesheet_missing_file_skipped() {
        let html = r#"
 <link rel="stylesheet" href="nonexistent.css">
 <p>Still works</p>
 "#;
        let options = ConvertOptions {
            base_dir: Some(std::env::temp_dir()),
            ..Default::default()
        };
        let tree = html_to_tree_with_options(html, &options).unwrap();
        assert!(tree.node_count() >= 3);
    }

    #[test]
    fn link_stylesheet_no_base_dir_skipped() {
        let html = r#"
 <link rel="stylesheet" href="style.css">
 <p>No base dir</p>
 "#;
        let tree = html_to_tree(html).unwrap();
        assert!(tree.node_count() >= 3);
    }

    #[test]
    fn link_stylesheet_http_skipped() {
        let html = r#"
 <link rel="stylesheet" href="https://example.com/style.css">
 <p>No network</p>
 "#;
        let options = ConvertOptions {
            base_dir: Some(std::env::temp_dir()),
            ..Default::default()
        };
        let tree = html_to_tree_with_options(html, &options).unwrap();
        assert!(tree.node_count() >= 3);
    }

    #[test]
    fn inline_important_beats_stylesheet_important() {
        let html = r##"
 <style>p { font-size: 10pt !important; }</style>
 <p style="font-size: 20pt !important">Text</p>
 "##;
        let tree = html_to_tree(html).unwrap();
        let mut found = false;
        for node in tree.iter_nodes() {
            if node.semantic_role == Some(SemanticRole::Paragraph) {
                found = true;
                assert!(
                    (node.style.typography.font_size.get() - 20.0).abs() < 0.01,
                    "inline !important should beat stylesheet !important, got {}",
                    node.style.typography.font_size.get()
                );
            }
        }
        assert!(found, "should find paragraph");
    }
}