// oxipdf-html 0.1.0
//
// HTML+CSS → StyledTree adapter for the oxipdf PDF engine
// Documentation
//! HTML element → ContentVariant + SemanticRole mapping.

use oxipdf_ir::node::ContentVariant;
use oxipdf_ir::semantic::SemanticRole;
use oxipdf_ir::style::typography::{FontStyle, TextDecoration, WhiteSpace};
use oxipdf_ir::style::{Display, ResolvedStyle};

/// Information derived from an HTML element tag name.
///
/// Produced by [`element_info`]. Every mapping currently in this file uses
/// `ContentVariant::Container`; tags differ only in role, implied style
/// overrides, and default display.
pub(crate) struct ElementInfo {
    /// The IR content variant for this element.
    pub content: ContentVariant,
    /// Optional semantic role (`None` for purely presentational or unknown tags).
    pub role: Option<SemanticRole>,
    /// Base style overrides implied by the element (e.g., `<strong>` → bold).
    pub style_overrides: StyleOverrides,
    /// The element's default `display` value: `Display::Inline` for the inline
    /// tags (`span`, `strong`, `em`, `u`, `s`, `code`, …), `Display::Block` for
    /// everything else, including unknown tags.
    pub default_display: Display,
}

/// Style properties implied by the HTML element itself (user-agent defaults).
///
/// Each `Option` field is `None` when the element implies nothing for that
/// property; [`StyleOverrides::apply`] only writes the fields that are `Some`.
/// The derived `Default` yields "no overrides" (all `None`, `is_monospace` false).
#[derive(Default)]
pub(crate) struct StyleOverrides {
    /// Font weight implied by the tag (e.g. 700 for `<strong>` / `<b>`).
    pub font_weight: Option<u16>,
    /// Font style implied by the tag (italic for `<em>` / `<i>`).
    pub font_style: Option<FontStyle>,
    /// Text decoration implied by the tag (underline for `<u>`, line-through
    /// for `<s>` / `<del>` / `<strike>`).
    pub text_decoration: Option<TextDecoration>,
    /// White-space handling implied by the tag (`WhiteSpace::Pre` for `<pre>`).
    pub white_space: Option<WhiteSpace>,
    /// Display mode forced by the tag (`Display::Inline` for the inline tags).
    pub display: Option<Display>,
    /// Set for `<pre>` and `<code>`.
    /// NOTE(review): `apply` does not read this flag; presumably the caller
    /// picks the monospace font family based on it — confirm.
    pub is_monospace: bool,
}

impl StyleOverrides {
    /// Write every override that is set into `style`, leaving all other
    /// properties of `style` as they were.
    ///
    /// Typography overrides land in `style.typography`; the display override
    /// lands in `style.layout.display`.
    pub fn apply(&self, style: &mut ResolvedStyle) {
        // Destructure so that every field is accounted for explicitly.
        // `is_monospace` is not consumed by this method.
        let Self {
            font_weight,
            font_style,
            text_decoration,
            white_space,
            display,
            is_monospace: _,
        } = self;

        if let Some(weight) = *font_weight {
            style.typography.font_weight = weight;
        }
        if let Some(slant) = *font_style {
            style.typography.font_style = slant;
        }
        if let Some(decoration) = *text_decoration {
            style.typography.text_decoration = decoration;
        }
        if let Some(spacing) = *white_space {
            style.typography.white_space = spacing;
        }
        if let Some(mode) = *display {
            style.layout.display = mode;
        }
    }
}

/// Map an HTML tag name to its IR representation.
pub(crate) fn element_info(tag: &str) -> ElementInfo {
    match tag {
        // Block containers
        "div" | "article" | "main" | "header" | "footer" | "aside" => ElementInfo {
            content: ContentVariant::Container,
            role: None,
            style_overrides: StyleOverrides::default(),
            default_display: Display::Block,
        },
        "section" => ElementInfo {
            content: ContentVariant::Container,
            role: Some(SemanticRole::Section),
            style_overrides: StyleOverrides::default(),
            default_display: Display::Block,
        },
        "nav" => ElementInfo {
            content: ContentVariant::Container,
            role: Some(SemanticRole::Navigation),
            style_overrides: StyleOverrides::default(),
            default_display: Display::Block,
        },

        // Paragraphs
        "p" => ElementInfo {
            content: ContentVariant::Container,
            role: Some(SemanticRole::Paragraph),
            style_overrides: StyleOverrides::default(),
            default_display: Display::Block,
        },

        // Headings
        "h1" => heading_info(1),
        "h2" => heading_info(2),
        "h3" => heading_info(3),
        "h4" => heading_info(4),
        "h5" => heading_info(5),
        "h6" => heading_info(6),

        // Lists
        "ul" | "ol" => ElementInfo {
            content: ContentVariant::Container,
            role: Some(SemanticRole::List),
            style_overrides: StyleOverrides::default(),
            default_display: Display::Block,
        },
        "li" => ElementInfo {
            content: ContentVariant::Container,
            role: Some(SemanticRole::ListItem),
            style_overrides: StyleOverrides::default(),
            default_display: Display::Block,
        },

        // Block quote
        "blockquote" => ElementInfo {
            content: ContentVariant::Container,
            role: Some(SemanticRole::BlockQuote),
            style_overrides: StyleOverrides::default(),
            default_display: Display::Block,
        },

        // Code blocks
        "pre" => ElementInfo {
            content: ContentVariant::Container,
            role: Some(SemanticRole::CodeBlock),
            style_overrides: StyleOverrides {
                white_space: Some(WhiteSpace::Pre),
                is_monospace: true,
                ..Default::default()
            },
            default_display: Display::Block,
        },

        // Figure / caption
        "figure" => ElementInfo {
            content: ContentVariant::Container,
            role: Some(SemanticRole::Figure),
            style_overrides: StyleOverrides::default(),
            default_display: Display::Block,
        },
        "figcaption" => ElementInfo {
            content: ContentVariant::Container,
            role: Some(SemanticRole::Caption),
            style_overrides: StyleOverrides::default(),
            default_display: Display::Block,
        },

        // Inline elements
        "span" => ElementInfo {
            content: ContentVariant::Container,
            role: None,
            style_overrides: StyleOverrides {
                display: Some(Display::Inline),
                ..Default::default()
            },
            default_display: Display::Inline,
        },
        "strong" | "b" => ElementInfo {
            content: ContentVariant::Container,
            role: None,
            style_overrides: StyleOverrides {
                font_weight: Some(700),
                display: Some(Display::Inline),
                ..Default::default()
            },
            default_display: Display::Inline,
        },
        "em" | "i" => ElementInfo {
            content: ContentVariant::Container,
            role: None,
            style_overrides: StyleOverrides {
                font_style: Some(FontStyle::Italic),
                display: Some(Display::Inline),
                ..Default::default()
            },
            default_display: Display::Inline,
        },
        "u" => ElementInfo {
            content: ContentVariant::Container,
            role: None,
            style_overrides: StyleOverrides {
                text_decoration: Some(TextDecoration::Underline),
                display: Some(Display::Inline),
                ..Default::default()
            },
            default_display: Display::Inline,
        },
        "s" | "del" | "strike" => ElementInfo {
            content: ContentVariant::Container,
            role: None,
            style_overrides: StyleOverrides {
                text_decoration: Some(TextDecoration::LineThrough),
                display: Some(Display::Inline),
                ..Default::default()
            },
            default_display: Display::Inline,
        },
        "code" => ElementInfo {
            content: ContentVariant::Container,
            role: None,
            style_overrides: StyleOverrides {
                display: Some(Display::Inline),
                is_monospace: true,
                ..Default::default()
            },
            default_display: Display::Inline,
        },

        // Horizontal rule
        "hr" => ElementInfo {
            content: ContentVariant::Container,
            role: None,
            style_overrides: StyleOverrides::default(),
            default_display: Display::Block,
        },

        // Fallback: treat unknown elements as block containers.
        _ => ElementInfo {
            content: ContentVariant::Container,
            role: None,
            style_overrides: StyleOverrides::default(),
            default_display: Display::Block,
        },
    }
}

/// Build the [`ElementInfo`] for a heading of the given `level` (`<h1>`–`<h6>`).
///
/// The result is a block-level container with role
/// `SemanticRole::Heading { level }`. Levels 1–4 imply font weight 700 and
/// levels 5–6 imply 600. Any level outside 1–6, including 0, falls back to
/// weight 400 rather than underflowing the table index.
fn heading_info(level: u8) -> ElementInfo {
    // Font weights for h1..h6, indexed by `level - 1`.
    const WEIGHTS: [u16; 6] = [700, 700, 700, 700, 600, 600];
    const FALLBACK_WEIGHT: u16 = 400;

    // `checked_sub` keeps level 0 from underflowing, which would panic in
    // builds with overflow checks enabled.
    let font_weight = usize::from(level)
        .checked_sub(1)
        .and_then(|idx| WEIGHTS.get(idx))
        .copied()
        .unwrap_or(FALLBACK_WEIGHT);

    ElementInfo {
        content: ContentVariant::Container,
        role: Some(SemanticRole::Heading { level }),
        style_overrides: StyleOverrides {
            font_weight: Some(font_weight),
            ..Default::default()
        },
        default_display: Display::Block,
    }
}

/// Font size in points for a heading of the given level.
///
/// `h1` through `h5` use the table below; `h6` and any level outside 1–6
/// (including 0) resolve to the smallest size, 11 pt.
pub(crate) fn heading_font_size(level: u8) -> f64 {
    // Sizes for h1..h5, indexed by `level - 1`.
    const SIZES_PT: [f64; 5] = [28.0, 22.0, 17.0, 14.0, 12.0];
    // Shared by h6 and every out-of-range level.
    const SMALLEST_PT: f64 = 11.0;

    usize::from(level)
        .checked_sub(1)
        .and_then(|idx| SIZES_PT.get(idx))
        .copied()
        .unwrap_or(SMALLEST_PT)
}