Skip to main content

oxipdf_html/convert/
core.rs

1//! Main HTML → StyledTree conversion logic.
2//!
3//! Walks the parsed DOM tree, resolves CSS styles from `<style>` blocks
4//! and inline `style=""` attributes, and builds a `StyledTree`.
5
6use std::path::PathBuf;
7
8use scraper::{Html, Node, Selector};
9
10use oxipdf_ir::node::{ContentVariant, ImageContent, LinkContent, LinkTarget};
11use oxipdf_ir::semantic::SemanticRole;
12use oxipdf_ir::style::{Display, ResolvedStyle};
13use oxipdf_ir::tree::StyledTreeBuilder;
14use oxipdf_ir::units::Pt;
15use oxipdf_ir::{IrVersion, TextContent};
16
17use crate::css::{self, apply_declarations, parse_declarations};
18use crate::elements::{self, heading_font_size};
19use crate::error::HtmlError;
20
21use super::cascade::{
22    apply_important_stylesheet_rules, apply_matching_rules, apply_normal_stylesheet_rules,
23};
24use super::stylesheets::{collect_link_stylesheets, collect_style_rules};
25
/// Knobs controlling how an HTML document is turned into a `StyledTree`.
///
/// The `Default` value carries no extra CSS and no base directory.
#[derive(Clone, Debug, Default)]
pub struct ConvertOptions {
    /// Extra stylesheet text; its rules are appended after those collected
    /// from the document's `<style>` blocks.
    pub extra_css: String,
    /// Directory against which relative `<link rel="stylesheet" href="...">`
    /// paths are resolved. With `None`, `<link>` elements that use relative
    /// paths are skipped.
    pub base_dir: Option<PathBuf>,
}
35
36/// Convert an HTML string to a `StyledTree`.
37///
38/// Parses the HTML, extracts `<style>` blocks, resolves the CSS cascade,
39/// and maps HTML elements to oxipdf IR nodes.
40pub fn html_to_tree(html: &str) -> Result<oxipdf_ir::tree::StyledTree, HtmlError> {
41    html_to_tree_with_options(html, &ConvertOptions::default())
42}
43
44/// Convert an HTML string to a `StyledTree` with additional CSS.
45///
46/// The `extra_css` is applied after any `<style>` blocks in the HTML,
47/// with the same specificity rules.
48pub fn html_to_tree_with_css(
49    html: &str,
50    extra_css: &str,
51) -> Result<oxipdf_ir::tree::StyledTree, HtmlError> {
52    html_to_tree_with_options(
53        html,
54        &ConvertOptions {
55            extra_css: extra_css.to_string(),
56            ..Default::default()
57        },
58    )
59}
60
61/// Convert an HTML string to a `StyledTree` with full options.
62///
63/// Supports `<style>` blocks, `<link rel="stylesheet">` (when `base_dir` set),
64/// extra CSS, `!important` cascade, and all element types.
65pub fn html_to_tree_with_options(
66    html: &str,
67    options: &ConvertOptions,
68) -> Result<oxipdf_ir::tree::StyledTree, HtmlError> {
69    let document = Html::parse_document(html);
70
71    // Collect CSS rules: <link> stylesheets + <style> blocks + extra CSS.
72    let mut rules = collect_link_stylesheets(&document, options.base_dir.as_deref());
73    rules.extend(collect_style_rules(&document));
74    if !options.extra_css.is_empty() {
75        rules.extend(css::parse_stylesheet(&options.extra_css));
76    }
77
78    let mut builder = StyledTreeBuilder::new(IrVersion::new(1, 0));
79
80    // Find the <body> element, or fall back to the document root.
81    let body_sel = Selector::parse("body").expect("'body' is a valid CSS selector");
82    let body_node = document
83        .select(&body_sel)
84        .next()
85        .map(|el| el.id())
86        .unwrap_or(document.root_element().id());
87
88    // Create root container.
89    let mut root_style = ResolvedStyle::default();
90    root_style.layout.display = Display::Block;
91    let root_id = builder.add_node(
92        ContentVariant::Container,
93        root_style,
94        Some(SemanticRole::Document),
95        None,
96    );
97
98    // Walk body children.
99    let body_ref = document
100        .tree
101        .get(body_node)
102        .ok_or(HtmlError::EmptyDocument)?;
103    convert_children(&document, body_ref, root_id, &rules, &mut builder)?;
104
105    if builder.len() < 2 {
106        return Err(HtmlError::EmptyDocument);
107    }
108
109    Ok(builder.build()?)
110}
111
112/// Convert child nodes of a DOM node to StyledTree nodes.
113fn convert_children(
114    document: &Html,
115    parent_node: ego_tree::NodeRef<'_, Node>,
116    parent_id: oxipdf_ir::node::NodeId,
117    rules: &[crate::css::CssRule],
118    builder: &mut StyledTreeBuilder,
119) -> Result<(), HtmlError> {
120    for child in parent_node.children() {
121        match child.value() {
122            Node::Text(text) => {
123                let t = text.text.to_string();
124                if !t.trim().is_empty() {
125                    let mut style = ResolvedStyle::default();
126                    style.layout.display = Display::Inline;
127                    builder.add_child(
128                        parent_id,
129                        ContentVariant::Text(TextContent::new(&t)),
130                        style,
131                        None,
132                        None,
133                    );
134                }
135            }
136            Node::Element(el) => {
137                convert_element(document, child, el, parent_id, rules, builder)?;
138            }
139            _ => {} // Comments, processing instructions — skip.
140        }
141    }
142    Ok(())
143}
144
145/// Convert a single HTML element to a StyledTree node.
146fn convert_element(
147    document: &Html,
148    node_ref: ego_tree::NodeRef<'_, Node>,
149    el: &scraper::node::Element,
150    parent_id: oxipdf_ir::node::NodeId,
151    rules: &[crate::css::CssRule],
152    builder: &mut StyledTreeBuilder,
153) -> Result<(), HtmlError> {
154    let tag = el.name().to_lowercase();
155
156    // Skip non-renderable elements.
157    if matches!(
158        tag.as_str(),
159        "script" | "style" | "meta" | "link" | "head" | "title"
160    ) {
161        return Ok(());
162    }
163
164    // Skip table sub-elements when encountered outside a <table> context
165    // (they are handled by convert_table when encountered inside <table>).
166    if matches!(
167        tag.as_str(),
168        "thead" | "tbody" | "tfoot" | "tr" | "td" | "th" | "caption" | "colgroup" | "col"
169    ) {
170        return Ok(());
171    }
172
173    // Handle <br> as newline text.
174    if tag == "br" {
175        let mut style = ResolvedStyle::default();
176        style.layout.display = Display::Inline;
177        builder.add_child(
178            parent_id,
179            ContentVariant::Text(TextContent::new("\n")),
180            style,
181            None,
182            None,
183        );
184        return Ok(());
185    }
186
187    let info = elements::element_info(&tag);
188    let element_id = el.attr("id").map(|s| s.to_string());
189
190    // Build resolved style: base defaults → CSS rules → element overrides → inline style.
191    let mut style = ResolvedStyle::default();
192    style.layout.display = info.default_display;
193
194    // Apply heading font size.
195    if let Some(SemanticRole::Heading { level }) = info.role {
196        style.typography.font_size = Pt::new(heading_font_size(level));
197    }
198
199    // Apply the CSS cascade.
200    //
201    // When the element has an inline `style=""` attribute we must interleave
202    // the two phases of the stylesheet cascade with the inline declarations:
203    //   1. Normal stylesheet declarations
204    //   2. Element-implied overrides (bold for <strong>, italic for <em>, etc.)
205    //   3. Normal inline declarations
206    //   4. !important stylesheet declarations
207    //   5. !important inline declarations  (beats everything)
208    //
209    // When there is no inline style the two stylesheet phases are applied
210    // together via apply_matching_rules, followed by element overrides.
211    let inline_css = el.attr("style");
212
213    if inline_css.is_some() {
214        apply_normal_stylesheet_rules(document, node_ref.id(), &mut style, rules);
215    } else {
216        apply_matching_rules(document, node_ref.id(), &mut style, rules);
217    }
218
219    // Element-implied styles run after normal stylesheet rules.
220    info.style_overrides.apply(&mut style);
221
222    // Apply monospace font hint.
223    if info.style_overrides.is_monospace && style.typography.font_families.is_empty() {
224        style.typography.font_families = vec!["monospace".to_string()];
225    }
226
227    if let Some(inline_css) = inline_css {
228        let decls = parse_declarations(inline_css);
229
230        // Normal inline declarations.
231        let normal: Vec<_> = decls.iter().filter(|d| !d.important).cloned().collect();
232        if !normal.is_empty() {
233            apply_declarations(&mut style, &normal);
234        }
235
236        // Stylesheet !important declarations (override normal inline).
237        apply_important_stylesheet_rules(document, node_ref.id(), &mut style, rules);
238
239        // Inline !important declarations (beats everything).
240        let important: Vec<_> = decls.iter().filter(|d| d.important).cloned().collect();
241        if !important.is_empty() {
242            apply_declarations(&mut style, &important);
243        }
244    }
245
246    // Handle special elements.
247    match tag.as_str() {
248        "table" => {
249            return super::table::convert_table(
250                document, node_ref, parent_id, style, rules, element_id, builder,
251            );
252        }
253        "img" => {
254            return convert_img(el, parent_id, style, info.role, element_id, builder);
255        }
256        "a" => {
257            return convert_link(
258                document, node_ref, el, parent_id, style, rules, element_id, builder,
259            );
260        }
261        "hr" => {
262            style.visual.border_top = oxipdf_ir::style::visual::BorderSide {
263                width: Pt::new(1.0),
264                style: oxipdf_ir::style::visual::BorderStyle::Solid,
265                color: oxipdf_ir::color::Color::rgb(0.8, 0.8, 0.8),
266            };
267            style.layout.margin_top = oxipdf_ir::Dimension::Length(Pt::new(6.0));
268            style.layout.margin_bottom = oxipdf_ir::Dimension::Length(Pt::new(6.0));
269            builder.add_child(
270                parent_id,
271                ContentVariant::Container,
272                style,
273                None,
274                element_id,
275            );
276            return Ok(());
277        }
278        _ => {}
279    }
280
281    // Create the node and recurse into children.
282    let node_id = builder.add_child(parent_id, info.content, style, info.role, element_id);
283    convert_children(document, node_ref, node_id, rules, builder)?;
284
285    Ok(())
286}
287
288/// Convert an `<img>` element to an Image node.
289fn convert_img(
290    el: &scraper::node::Element,
291    parent_id: oxipdf_ir::node::NodeId,
292    style: ResolvedStyle,
293    role: Option<SemanticRole>,
294    element_id: Option<String>,
295    builder: &mut StyledTreeBuilder,
296) -> Result<(), HtmlError> {
297    let src = el.attr("src").unwrap_or_default();
298    let alt = el.attr("alt").map(|s| s.to_string());
299    let width = el
300        .attr("width")
301        .and_then(|w| w.parse::<f64>().ok())
302        .unwrap_or(100.0);
303    let height = el
304        .attr("height")
305        .and_then(|h| h.parse::<f64>().ok())
306        .unwrap_or(100.0);
307
308    // Only support data: URIs for now (no network I/O).
309    if let Some((data, format)) = super::uri::parse_data_uri(src) {
310        let mut img = ImageContent::with_dimensions(
311            data,
312            format,
313            Pt::new(width * 0.75),
314            Pt::new(height * 0.75),
315        );
316        if let Some(alt_text) = alt {
317            img = img.with_alt_text(alt_text);
318        }
319        builder.add_child(
320            parent_id,
321            ContentVariant::Image(img),
322            style,
323            role.or(Some(SemanticRole::Figure)),
324            element_id,
325        );
326    }
327    // Non-data URIs silently skipped (no network I/O in the engine).
328
329    Ok(())
330}
331
332/// Convert an `<a>` element to a Link node wrapping its children.
333#[allow(clippy::too_many_arguments)]
334fn convert_link(
335    document: &Html,
336    node_ref: ego_tree::NodeRef<'_, Node>,
337    el: &scraper::node::Element,
338    parent_id: oxipdf_ir::node::NodeId,
339    mut style: ResolvedStyle,
340    rules: &[crate::css::CssRule],
341    element_id: Option<String>,
342    builder: &mut StyledTreeBuilder,
343) -> Result<(), HtmlError> {
344    let href = el.attr("href").unwrap_or_default().to_string();
345    let target = if let Some(fragment) = href.strip_prefix('#') {
346        LinkTarget::Internal(fragment.to_string())
347    } else {
348        LinkTarget::External(href)
349    };
350
351    // Default link styling: blue + underline.
352    if style.typography.color == oxipdf_ir::color::Color::BLACK {
353        style.typography.color = oxipdf_ir::color::Color::rgb(0.0, 0.0, 0.8);
354    }
355    if style.typography.text_decoration == oxipdf_ir::style::typography::TextDecoration::None {
356        style.typography.text_decoration = oxipdf_ir::style::typography::TextDecoration::Underline;
357    }
358    style.layout.display = Display::Inline;
359
360    let link_id = builder.add_child(
361        parent_id,
362        ContentVariant::Link(LinkContent { target }),
363        style,
364        None,
365        element_id,
366    );
367
368    convert_children(document, node_ref, link_id, rules, builder)?;
369    Ok(())
370}
371
#[cfg(test)]
mod tests {
    use super::*;
    use oxipdf_ir::node::LinkTarget;
    use oxipdf_ir::style::typography::FontStyle;

    /// Tolerance used when comparing font sizes expressed in points.
    const SIZE_EPSILON: f64 = 0.01;

    /// Returns the font size of the first paragraph node in `tree`.
    ///
    /// Panics when the tree contains no paragraph, so a test can never pass
    /// just because the node it meant to inspect was not produced.
    fn paragraph_font_size(tree: &oxipdf_ir::tree::StyledTree) -> f64 {
        for node in tree.iter_nodes() {
            if node.semantic_role == Some(SemanticRole::Paragraph) {
                return node.style.typography.font_size.get();
            }
        }
        panic!("expected the tree to contain a paragraph node");
    }

    /// Asserts that the first paragraph's font size equals `expected` points.
    fn assert_paragraph_font_size(
        tree: &oxipdf_ir::tree::StyledTree,
        expected: f64,
        context: &str,
    ) {
        let actual = paragraph_font_size(tree);
        assert!(
            (actual - expected).abs() < SIZE_EPSILON,
            "{}: expected {}pt, got {}pt",
            context,
            expected,
            actual
        );
    }

    #[test]
    fn simple_paragraph() {
        let tree = html_to_tree("<p>Hello world</p>").unwrap();
        assert!(tree.node_count() >= 3); // root + p + text
    }

    #[test]
    fn headings_create_semantic_roles() {
        let tree = html_to_tree("<h1>Title</h1><h2>Sub</h2>").unwrap();
        let mut found_h1 = false;
        let mut found_h2 = false;
        for node in tree.iter_nodes() {
            if node.semantic_role == Some(SemanticRole::Heading { level: 1 }) {
                found_h1 = true;
            }
            if node.semantic_role == Some(SemanticRole::Heading { level: 2 }) {
                found_h2 = true;
            }
        }
        assert!(found_h1, "should have H1");
        assert!(found_h2, "should have H2");
    }

    #[test]
    fn inline_elements_styled() {
        let tree = html_to_tree("<p><strong>bold</strong> and <em>italic</em></p>").unwrap();
        let mut found_bold = false;
        let mut found_italic = false;
        for node in tree.iter_nodes() {
            if node.style.typography.font_weight == 700 {
                found_bold = true;
            }
            if node.style.typography.font_style == FontStyle::Italic {
                found_italic = true;
            }
        }
        assert!(found_bold, "should have bold");
        assert!(found_italic, "should have italic");
    }

    #[test]
    fn style_block_applied() {
        let html = r##"
            <style>p { color: #ff0000; font-size: 14pt; }</style>
            <p>Red text</p>
        "##;
        let tree = html_to_tree(html).unwrap();
        assert_paragraph_font_size(&tree, 14.0, "font size should be 14pt");
    }

    #[test]
    fn inline_style_overrides_stylesheet() {
        let html = r##"
            <style>p { font-size: 10pt; }</style>
            <p style="font-size: 20pt">Big text</p>
        "##;
        let tree = html_to_tree(html).unwrap();
        assert_paragraph_font_size(&tree, 20.0, "inline style should override stylesheet");
    }

    #[test]
    fn extra_css_applied() {
        let html = "<p>Styled</p>";
        let css = "p { font-size: 18pt; }";
        let tree = html_to_tree_with_css(html, css).unwrap();
        assert_paragraph_font_size(&tree, 18.0, "extra CSS should be applied");
    }

    #[test]
    fn empty_body_returns_error() {
        assert!(matches!(
            html_to_tree("<html><body></body></html>"),
            Err(HtmlError::EmptyDocument)
        ));
    }

    #[test]
    fn br_creates_newline_text() {
        let tree = html_to_tree("<p>Line 1<br>Line 2</p>").unwrap();
        let mut found_newline = false;
        for node in tree.iter_nodes() {
            if let ContentVariant::Text(ref t) = node.content {
                if t.text.contains('\n') {
                    found_newline = true;
                }
            }
        }
        assert!(found_newline, "should have newline from <br>");
    }

    #[test]
    fn link_creates_link_node() {
        let tree = html_to_tree(r#"<a href="https://example.com">Click</a>"#).unwrap();
        let mut found_link = false;
        for node in tree.iter_nodes() {
            if let ContentVariant::Link(ref l) = node.content {
                if let LinkTarget::External(ref url) = l.target {
                    if url == "https://example.com" {
                        found_link = true;
                    }
                }
            }
        }
        assert!(found_link, "should have external link");
    }

    // -----------------------------------------------------------------------
    // !important tests
    // -----------------------------------------------------------------------

    #[test]
    fn important_overrides_higher_specificity() {
        let html = r##"
            <style>
                #specific { font-size: 30pt; }
                p { font-size: 14pt !important; }
            </style>
            <p id="specific">Text</p>
        "##;
        let tree = html_to_tree(html).unwrap();
        assert_paragraph_font_size(&tree, 14.0, "!important should override #id specificity");
    }

    #[test]
    fn important_overrides_inline_style() {
        let html = r##"
            <style>p { color: #ff0000 !important; }</style>
            <p style="color: #0000ff">Text</p>
        "##;
        let tree = html_to_tree(html).unwrap();
        let mut found = false;
        for node in tree.iter_nodes() {
            if node.semantic_role == Some(SemanticRole::Paragraph) {
                found = true;
                match node.style.typography.color {
                    oxipdf_ir::color::Color::Srgb { r, b, .. } => {
                        assert!(
                            r > 0.9 && b < 0.1,
                            "!important red should override inline blue"
                        );
                    }
                    _ => panic!("expected Srgb color"),
                }
            }
        }
        assert!(found, "should find paragraph");
    }

    #[test]
    fn link_stylesheet_loaded() {
        // Per-process directory so concurrent test runs cannot delete each
        // other's fixture.
        let dir = std::env::temp_dir().join(format!("oxipdf_html_test_{}", std::process::id()));
        std::fs::create_dir_all(&dir).unwrap();
        std::fs::write(dir.join("test_style.css"), "p { font-size: 22pt; }").unwrap();

        let html = r#"
            <link rel="stylesheet" href="test_style.css">
            <p>Styled from file</p>
        "#;
        let options = ConvertOptions {
            base_dir: Some(dir.clone()),
            ..Default::default()
        };
        let result = html_to_tree_with_options(html, &options);

        // Clean up before asserting so a failing assertion leaves no residue.
        let _ = std::fs::remove_dir_all(&dir);

        let tree = result.unwrap();
        assert_paragraph_font_size(&tree, 22.0, "should apply CSS from linked file");
    }

    #[test]
    fn link_stylesheet_missing_file_skipped() {
        let html = r#"
            <link rel="stylesheet" href="nonexistent.css">
            <p>Still works</p>
        "#;
        let options = ConvertOptions {
            base_dir: Some(std::env::temp_dir()),
            ..Default::default()
        };
        // Should not error — missing CSS files are silently skipped.
        let tree = html_to_tree_with_options(html, &options).unwrap();
        assert!(tree.node_count() >= 3);
    }

    #[test]
    fn link_stylesheet_no_base_dir_skipped() {
        let html = r#"
            <link rel="stylesheet" href="style.css">
            <p>No base dir</p>
        "#;
        // No base_dir → relative links skipped.
        let tree = html_to_tree(html).unwrap();
        assert!(tree.node_count() >= 3);
    }

    #[test]
    fn link_stylesheet_http_skipped() {
        let html = r#"
            <link rel="stylesheet" href="https://example.com/style.css">
            <p>No network</p>
        "#;
        let options = ConvertOptions {
            base_dir: Some(std::env::temp_dir()),
            ..Default::default()
        };
        // HTTP URLs silently skipped — no network I/O.
        let tree = html_to_tree_with_options(html, &options).unwrap();
        assert!(tree.node_count() >= 3);
    }

    #[test]
    fn inline_important_beats_stylesheet_important() {
        let html = r##"
            <style>p { font-size: 10pt !important; }</style>
            <p style="font-size: 20pt !important">Text</p>
        "##;
        let tree = html_to_tree(html).unwrap();
        assert_paragraph_font_size(
            &tree,
            20.0,
            "inline !important should beat stylesheet !important",
        );
    }
}