use std::path::PathBuf;
use scraper::{Html, Node, Selector};
use oxipdf_ir::node::{ContentVariant, ImageContent, LinkContent, LinkTarget};
use oxipdf_ir::semantic::SemanticRole;
use oxipdf_ir::style::{Display, ResolvedStyle};
use oxipdf_ir::tree::StyledTreeBuilder;
use oxipdf_ir::units::Pt;
use oxipdf_ir::{IrVersion, TextContent};
use crate::css::{self, apply_declarations, parse_declarations};
use crate::elements::{self, heading_font_size};
use crate::error::HtmlError;
use super::cascade::{
apply_important_stylesheet_rules, apply_matching_rules, apply_normal_stylesheet_rules,
};
use super::stylesheets::{collect_link_stylesheets, collect_style_rules};
/// Options controlling how an HTML document is converted into a styled tree.
///
/// The `Default` value has no extra CSS and no base directory.
#[derive(Debug, Clone, Default)]
pub struct ConvertOptions {
/// Additional stylesheet text. When non-empty it is parsed with
/// `css::parse_stylesheet` and its rules are appended after the rules
/// collected from `<link>` stylesheets and `<style>` blocks.
pub extra_css: String,
/// Directory handed to `collect_link_stylesheets` together with the document.
/// NOTE(review): presumably used to resolve relative `<link href>` paths on
/// disk — confirm against `stylesheets::collect_link_stylesheets`.
pub base_dir: Option<PathBuf>,
}
/// Converts an HTML string into a styled tree using the default
/// [`ConvertOptions`] (no extra CSS, no base directory).
///
/// # Errors
///
/// Returns exactly what [`html_to_tree_with_options`] returns for the same
/// input and default options.
pub fn html_to_tree(html: &str) -> Result<oxipdf_ir::tree::StyledTree, HtmlError> {
    let defaults = ConvertOptions::default();
    html_to_tree_with_options(html, &defaults)
}
pub fn html_to_tree_with_css(
html: &str,
extra_css: &str,
) -> Result<oxipdf_ir::tree::StyledTree, HtmlError> {
html_to_tree_with_options(
html,
&ConvertOptions {
extra_css: extra_css.to_string(),
..Default::default()
},
)
}
/// Parses `html` and converts the contents of its `<body>` into a styled tree.
///
/// Style rules are gathered in this order: rules from `<link>` stylesheets
/// (via `collect_link_stylesheets`, given `options.base_dir`), rules from
/// `<style>` blocks, and finally rules parsed from `options.extra_css` when
/// that string is non-empty.
///
/// The resulting tree has a block-level container root with the
/// `SemanticRole::Document` role; the children of `<body>` (or of the
/// document's root element when no `<body>` is selected) are converted
/// beneath it.
///
/// # Errors
///
/// * [`HtmlError::EmptyDocument`] when the chosen start node cannot be looked
///   up in the parsed tree, or when conversion added nothing besides the root
///   node.
/// * Any error produced while converting children or by the builder's
///   `build()` call is propagated.
pub fn html_to_tree_with_options(
    html: &str,
    options: &ConvertOptions,
) -> Result<oxipdf_ir::tree::StyledTree, HtmlError> {
    let document = Html::parse_document(html);

    // Collect the cascade inputs: linked files, then <style>, then caller CSS.
    let mut rules = collect_link_stylesheets(&document, options.base_dir.as_deref());
    rules.extend(collect_style_rules(&document));
    if !options.extra_css.is_empty() {
        rules.extend(css::parse_stylesheet(&options.extra_css));
    }

    // Start from <body>; fall back to the root element if none is selected.
    let body_sel = Selector::parse("body").expect("'body' is a valid CSS selector");
    let start_id = match document.select(&body_sel).next() {
        Some(body) => body.id(),
        None => document.root_element().id(),
    };

    let mut builder = StyledTreeBuilder::new(IrVersion::new(1, 0));
    let mut root_style = ResolvedStyle::default();
    root_style.layout.display = Display::Block;
    let root_id = builder.add_node(
        ContentVariant::Container,
        root_style,
        Some(SemanticRole::Document),
        None,
    );

    let start_ref = match document.tree.get(start_id) {
        Some(node) => node,
        None => return Err(HtmlError::EmptyDocument),
    };
    convert_children(&document, start_ref, root_id, &rules, &mut builder)?;

    // Only the root exists: nothing convertible was found in the document.
    if builder.len() < 2 {
        return Err(HtmlError::EmptyDocument);
    }

    let tree = builder.build()?;
    Ok(tree)
}
/// Converts every child of `parent_node` and attaches the results beneath
/// `parent_id` in `builder`.
///
/// * Text nodes become inline text children, unless they consist only of
///   whitespace, in which case they are dropped. The text itself is stored
///   untrimmed.
/// * Element nodes are delegated to `convert_element`.
/// * Every other node kind is ignored.
///
/// # Errors
///
/// Propagates the first error returned by `convert_element`.
fn convert_children(
    document: &Html,
    parent_node: ego_tree::NodeRef<'_, Node>,
    parent_id: oxipdf_ir::node::NodeId,
    rules: &[crate::css::CssRule],
    builder: &mut StyledTreeBuilder,
) -> Result<(), HtmlError> {
    for child in parent_node.children() {
        match child.value() {
            Node::Element(el) => {
                convert_element(document, child, el, parent_id, rules, builder)?;
            }
            Node::Text(text) => {
                let content = text.text.to_string();
                // Whitespace-only runs carry no visible content; skip them.
                if content.trim().is_empty() {
                    continue;
                }
                let mut text_style = ResolvedStyle::default();
                text_style.layout.display = Display::Inline;
                builder.add_child(
                    parent_id,
                    ContentVariant::Text(TextContent::new(&content)),
                    text_style,
                    None,
                    None,
                );
            }
            _ => {}
        }
    }
    Ok(())
}
fn convert_element(
document: &Html,
node_ref: ego_tree::NodeRef<'_, Node>,
el: &scraper::node::Element,
parent_id: oxipdf_ir::node::NodeId,
rules: &[crate::css::CssRule],
builder: &mut StyledTreeBuilder,
) -> Result<(), HtmlError> {
let tag = el.name().to_lowercase();
if matches!(
tag.as_str(),
"script" | "style" | "meta" | "link" | "head" | "title"
) {
return Ok(());
}
if matches!(
tag.as_str(),
"thead" | "tbody" | "tfoot" | "tr" | "td" | "th" | "caption" | "colgroup" | "col"
) {
return Ok(());
}
if tag == "br" {
let mut style = ResolvedStyle::default();
style.layout.display = Display::Inline;
builder.add_child(
parent_id,
ContentVariant::Text(TextContent::new("\n")),
style,
None,
None,
);
return Ok(());
}
let info = elements::element_info(&tag);
let element_id = el.attr("id").map(|s| s.to_string());
let mut style = ResolvedStyle::default();
style.layout.display = info.default_display;
if let Some(SemanticRole::Heading { level }) = info.role {
style.typography.font_size = Pt::new(heading_font_size(level));
}
let inline_css = el.attr("style");
if inline_css.is_some() {
apply_normal_stylesheet_rules(document, node_ref.id(), &mut style, rules);
} else {
apply_matching_rules(document, node_ref.id(), &mut style, rules);
}
info.style_overrides.apply(&mut style);
if info.style_overrides.is_monospace && style.typography.font_families.is_empty() {
style.typography.font_families = vec!["monospace".to_string()];
}
if let Some(inline_css) = inline_css {
let decls = parse_declarations(inline_css);
let normal: Vec<_> = decls.iter().filter(|d| !d.important).cloned().collect();
if !normal.is_empty() {
apply_declarations(&mut style, &normal);
}
apply_important_stylesheet_rules(document, node_ref.id(), &mut style, rules);
let important: Vec<_> = decls.iter().filter(|d| d.important).cloned().collect();
if !important.is_empty() {
apply_declarations(&mut style, &important);
}
}
match tag.as_str() {
"table" => {
return super::table::convert_table(
document, node_ref, parent_id, style, rules, element_id, builder,
);
}
"img" => {
return convert_img(el, parent_id, style, info.role, element_id, builder);
}
"a" => {
return convert_link(
document, node_ref, el, parent_id, style, rules, element_id, builder,
);
}
"hr" => {
style.visual.border_top = oxipdf_ir::style::visual::BorderSide {
width: Pt::new(1.0),
style: oxipdf_ir::style::visual::BorderStyle::Solid,
color: oxipdf_ir::color::Color::rgb(0.8, 0.8, 0.8),
};
style.layout.margin_top = oxipdf_ir::Dimension::Length(Pt::new(6.0));
style.layout.margin_bottom = oxipdf_ir::Dimension::Length(Pt::new(6.0));
builder.add_child(
parent_id,
ContentVariant::Container,
style,
None,
element_id,
);
return Ok(());
}
_ => {}
}
let node_id = builder.add_child(parent_id, info.content, style, info.role, element_id);
convert_children(document, node_ref, node_id, rules, builder)?;
Ok(())
}
/// Converts an `<img>` element into an image node beneath `parent_id`.
///
/// Only images whose `src` is accepted by `super::uri::parse_data_uri` are
/// emitted; any other `src` (including a missing one) produces no node and is
/// not treated as an error.
///
/// The `width` and `height` attributes are read as plain numbers and scaled
/// by 0.75 before being stored as points (NOTE(review): this looks like the
/// CSS px → pt conversion at 96 dpi — confirm). An attribute that is missing,
/// unparsable, non-finite (`"NaN"`, `"inf"`), or negative falls back to
/// `100.0`, so malformed markup can never inject a NaN/infinite/negative
/// dimension into the tree. Surrounding whitespace in the attribute value is
/// ignored.
///
/// The node's semantic role is `role` when given, otherwise
/// `SemanticRole::Figure`. The `alt` attribute, when present, is attached as
/// alt text.
///
/// # Errors
///
/// Currently always returns `Ok(())`.
fn convert_img(
    el: &scraper::node::Element,
    parent_id: oxipdf_ir::node::NodeId,
    style: ResolvedStyle,
    role: Option<SemanticRole>,
    element_id: Option<String>,
    builder: &mut StyledTreeBuilder,
) -> Result<(), HtmlError> {
    // Fallback used for a missing or invalid `width`/`height` attribute.
    const DEFAULT_DIMENSION: f64 = 100.0;
    // Scale factor applied to attribute values before storing them as `Pt`.
    const ATTR_TO_PT: f64 = 0.75;

    // Parses a dimension attribute, rejecting values that `f64::from_str`
    // accepts but that make no sense as a length (NaN, ±inf, negatives).
    fn dimension_attr(raw: Option<&str>) -> f64 {
        raw.map(str::trim)
            .and_then(|value| value.parse::<f64>().ok())
            .filter(|value| value.is_finite() && *value >= 0.0)
            .unwrap_or(DEFAULT_DIMENSION)
    }

    let src = el.attr("src").unwrap_or_default();
    let width = dimension_attr(el.attr("width"));
    let height = dimension_attr(el.attr("height"));

    // Non-data-URI sources are skipped on purpose: nothing is fetched here.
    if let Some((data, format)) = super::uri::parse_data_uri(src) {
        let mut img = ImageContent::with_dimensions(
            data,
            format,
            Pt::new(width * ATTR_TO_PT),
            Pt::new(height * ATTR_TO_PT),
        );
        if let Some(alt_text) = el.attr("alt") {
            img = img.with_alt_text(alt_text.to_string());
        }
        builder.add_child(
            parent_id,
            ContentVariant::Image(img),
            style,
            role.or(Some(SemanticRole::Figure)),
            element_id,
        );
    }
    Ok(())
}
/// Converts an `<a>` element into a link node and converts its children
/// beneath that node.
///
/// The `href` attribute (empty string when absent) decides the target: a
/// value starting with `#` becomes `LinkTarget::Internal` holding the text
/// after the `#`; anything else becomes `LinkTarget::External` holding the
/// full value.
///
/// Styling adjustments applied on top of the resolved `style`:
///
/// * a color equal to `Color::BLACK` is replaced by `rgb(0.0, 0.0, 0.8)`
///   (NOTE(review): this cannot tell an explicit `color: black` from an
///   unset color — confirm that is acceptable);
/// * a text decoration of `None` is replaced by `Underline`;
/// * `display` is always forced to `Inline`.
///
/// # Errors
///
/// Propagates errors from `convert_children`.
// The converter needs the full conversion context plus the resolved style,
// which pushes it past clippy's argument-count threshold.
#[allow(clippy::too_many_arguments)]
fn convert_link(
    document: &Html,
    node_ref: ego_tree::NodeRef<'_, Node>,
    el: &scraper::node::Element,
    parent_id: oxipdf_ir::node::NodeId,
    mut style: ResolvedStyle,
    rules: &[crate::css::CssRule],
    element_id: Option<String>,
    builder: &mut StyledTreeBuilder,
) -> Result<(), HtmlError> {
    use oxipdf_ir::color::Color;
    use oxipdf_ir::style::typography::TextDecoration;

    let href = el.attr("href").unwrap_or_default();
    let target = match href.strip_prefix('#') {
        Some(fragment) => LinkTarget::Internal(fragment.to_owned()),
        None => LinkTarget::External(href.to_owned()),
    };

    // Give links the conventional blue/underlined look unless the cascade
    // already picked something else.
    if style.typography.color == Color::BLACK {
        style.typography.color = Color::rgb(0.0, 0.0, 0.8);
    }
    if style.typography.text_decoration == TextDecoration::None {
        style.typography.text_decoration = TextDecoration::Underline;
    }
    style.layout.display = Display::Inline;

    let link_id = builder.add_child(
        parent_id,
        ContentVariant::Link(LinkContent { target }),
        style,
        None,
        element_id,
    );
    convert_children(document, node_ref, link_id, rules, builder)
}
/// Unit tests for the HTML → styled-tree conversion entry points.
///
/// Tests that inspect paragraph styling also assert that a paragraph node was
/// actually found, so they cannot pass vacuously when no paragraph is emitted.
#[cfg(test)]
mod tests {
    use super::*;
    use oxipdf_ir::node::LinkTarget;
    use oxipdf_ir::style::typography::FontStyle;

    #[test]
    fn simple_paragraph() {
        let tree = html_to_tree("<p>Hello world</p>").unwrap();
        assert!(tree.node_count() >= 3);
    }

    #[test]
    fn headings_create_semantic_roles() {
        let tree = html_to_tree("<h1>Title</h1><h2>Sub</h2>").unwrap();
        let mut found_h1 = false;
        let mut found_h2 = false;
        for node in tree.iter_nodes() {
            if node.semantic_role == Some(SemanticRole::Heading { level: 1 }) {
                found_h1 = true;
            }
            if node.semantic_role == Some(SemanticRole::Heading { level: 2 }) {
                found_h2 = true;
            }
        }
        assert!(found_h1, "should have H1");
        assert!(found_h2, "should have H2");
    }

    #[test]
    fn inline_elements_styled() {
        let tree = html_to_tree("<p><strong>bold</strong> and <em>italic</em></p>").unwrap();
        let mut found_bold = false;
        let mut found_italic = false;
        for node in tree.iter_nodes() {
            if node.style.typography.font_weight == 700 {
                found_bold = true;
            }
            if node.style.typography.font_style == FontStyle::Italic {
                found_italic = true;
            }
        }
        assert!(found_bold, "should have bold");
        assert!(found_italic, "should have italic");
    }

    #[test]
    fn style_block_applied() {
        let html = r##"
<style>p { color: #ff0000; font-size: 14pt; }</style>
<p>Red text</p>
"##;
        let tree = html_to_tree(html).unwrap();
        let mut found = false;
        for node in tree.iter_nodes() {
            if node.semantic_role == Some(SemanticRole::Paragraph) {
                found = true;
                assert!(
                    (node.style.typography.font_size.get() - 14.0).abs() < 0.01,
                    "font size should be 14pt"
                );
            }
        }
        assert!(found, "should find paragraph");
    }

    #[test]
    fn inline_style_overrides_stylesheet() {
        let html = r##"
<style>p { font-size: 10pt; }</style>
<p style="font-size: 20pt">Big text</p>
"##;
        let tree = html_to_tree(html).unwrap();
        let mut found = false;
        for node in tree.iter_nodes() {
            if node.semantic_role == Some(SemanticRole::Paragraph) {
                found = true;
                assert!(
                    (node.style.typography.font_size.get() - 20.0).abs() < 0.01,
                    "inline style should override stylesheet"
                );
            }
        }
        assert!(found, "should find paragraph");
    }

    #[test]
    fn extra_css_applied() {
        let html = "<p>Styled</p>";
        let css = "p { font-size: 18pt; }";
        let tree = html_to_tree_with_css(html, css).unwrap();
        let mut found = false;
        for node in tree.iter_nodes() {
            if node.semantic_role == Some(SemanticRole::Paragraph) {
                found = true;
                assert!((node.style.typography.font_size.get() - 18.0).abs() < 0.01);
            }
        }
        assert!(found, "should find paragraph");
    }

    #[test]
    fn empty_body_returns_error() {
        assert!(matches!(
            html_to_tree("<html><body></body></html>"),
            Err(HtmlError::EmptyDocument)
        ));
    }

    #[test]
    fn br_creates_newline_text() {
        let tree = html_to_tree("<p>Line 1<br>Line 2</p>").unwrap();
        let mut found_newline = false;
        for node in tree.iter_nodes() {
            if let ContentVariant::Text(ref t) = node.content {
                if t.text.contains('\n') {
                    found_newline = true;
                }
            }
        }
        assert!(found_newline, "should have newline from <br>");
    }

    #[test]
    fn link_creates_link_node() {
        let tree = html_to_tree(r#"<a href="https://example.com">Click</a>"#).unwrap();
        let mut found_link = false;
        for node in tree.iter_nodes() {
            if let ContentVariant::Link(ref l) = node.content {
                if let LinkTarget::External(ref url) = l.target {
                    if url == "https://example.com" {
                        found_link = true;
                    }
                }
            }
        }
        assert!(found_link, "should have external link");
    }

    #[test]
    fn important_overrides_higher_specificity() {
        let html = r##"
<style>
#specific { font-size: 30pt; }
p { font-size: 14pt !important; }
</style>
<p id="specific">Text</p>
"##;
        let tree = html_to_tree(html).unwrap();
        let mut found = false;
        for node in tree.iter_nodes() {
            if node.semantic_role == Some(SemanticRole::Paragraph) {
                found = true;
                assert!(
                    (node.style.typography.font_size.get() - 14.0).abs() < 0.01,
                    "!important should override #id specificity, got {}",
                    node.style.typography.font_size.get()
                );
            }
        }
        assert!(found, "should find paragraph");
    }

    #[test]
    fn important_overrides_inline_style() {
        let html = r##"
<style>p { color: #ff0000 !important; }</style>
<p style="color: #0000ff">Text</p>
"##;
        let tree = html_to_tree(html).unwrap();
        let mut found = false;
        for node in tree.iter_nodes() {
            if node.semantic_role == Some(SemanticRole::Paragraph) {
                found = true;
                match node.style.typography.color {
                    oxipdf_ir::color::Color::Srgb { r, b, .. } => {
                        assert!(
                            r > 0.9 && b < 0.1,
                            "!important red should override inline blue"
                        );
                    }
                    _ => panic!("expected Srgb color"),
                }
            }
        }
        assert!(found, "should find paragraph");
    }

    #[test]
    fn link_stylesheet_loaded() {
        // Per-process directory name so concurrent test runs on the same
        // machine cannot trample each other's fixture.
        let dir = std::env::temp_dir().join(format!("oxipdf_html_test_{}", std::process::id()));
        std::fs::create_dir_all(&dir).unwrap();
        let css_path = dir.join("test_style.css");
        std::fs::write(&css_path, "p { font-size: 22pt; }").unwrap();
        let html = r#"
<link rel="stylesheet" href="test_style.css">
<p>Styled from file</p>
"#;
        let options = ConvertOptions {
            base_dir: Some(dir.clone()),
            ..Default::default()
        };
        // Convert first, then clean up before asserting, so a failing
        // assertion does not leave the fixture directory behind.
        let result = html_to_tree_with_options(html, &options);
        let _ = std::fs::remove_dir_all(&dir);
        let tree = result.unwrap();

        let mut found = false;
        for node in tree.iter_nodes() {
            if node.semantic_role == Some(SemanticRole::Paragraph) {
                found = true;
                assert!(
                    (node.style.typography.font_size.get() - 22.0).abs() < 0.01,
                    "should apply CSS from linked file, got {}",
                    node.style.typography.font_size.get()
                );
            }
        }
        assert!(found, "should find paragraph");
    }

    #[test]
    fn link_stylesheet_missing_file_skipped() {
        let html = r#"
<link rel="stylesheet" href="nonexistent.css">
<p>Still works</p>
"#;
        let options = ConvertOptions {
            base_dir: Some(std::env::temp_dir()),
            ..Default::default()
        };
        let tree = html_to_tree_with_options(html, &options).unwrap();
        assert!(tree.node_count() >= 3);
    }

    #[test]
    fn link_stylesheet_no_base_dir_skipped() {
        let html = r#"
<link rel="stylesheet" href="style.css">
<p>No base dir</p>
"#;
        let tree = html_to_tree(html).unwrap();
        assert!(tree.node_count() >= 3);
    }

    #[test]
    fn link_stylesheet_http_skipped() {
        let html = r#"
<link rel="stylesheet" href="https://example.com/style.css">
<p>No network</p>
"#;
        let options = ConvertOptions {
            base_dir: Some(std::env::temp_dir()),
            ..Default::default()
        };
        let tree = html_to_tree_with_options(html, &options).unwrap();
        assert!(tree.node_count() >= 3);
    }

    #[test]
    fn inline_important_beats_stylesheet_important() {
        let html = r##"
<style>p { font-size: 10pt !important; }</style>
<p style="font-size: 20pt !important">Text</p>
"##;
        let tree = html_to_tree(html).unwrap();
        let mut found = false;
        for node in tree.iter_nodes() {
            if node.semantic_role == Some(SemanticRole::Paragraph) {
                found = true;
                assert!(
                    (node.style.typography.font_size.get() - 20.0).abs() < 0.01,
                    "inline !important should beat stylesheet !important, got {}",
                    node.style.typography.font_size.get()
                );
            }
        }
        assert!(found, "should find paragraph");
    }
}