use scraper::{ElementRef, Html};
use crate::{Error, Result};
use super::{FlowDocument, FlowOptions, Margins};
/// Options accepted by `render_html_to_pdf`.
///
/// Every field except `font_bytes` is copied verbatim into the corresponding
/// `FlowOptions` field; `font_bytes` is handed to `FlowDocument::new`.
pub struct HtmlRenderOptions {
/// Raw bytes of the font file used for rendering. `render_html_to_pdf`
/// rejects an empty vector with `Error::InvalidInput` and asks for a
/// TTF/OTF font. The `Default` value is empty, so callers must set this.
pub font_bytes: Vec<u8>,
/// Page dimensions, default `(595.0, 842.0)`.
/// NOTE(review): presumably (width, height) in PDF points, i.e. A4 portrait
/// -- confirm against `FlowOptions::page_size`.
pub page_size: (f32, f32),
/// Page margins, default `Margins::a4_standard()`.
pub margins: Margins,
/// Body text font size, default `11.0`.
/// NOTE(review): unit is presumably points -- confirm against `FlowOptions`.
pub body_font_size: f32,
/// Line height factor, default `1.4`.
/// NOTE(review): presumably a multiplier applied to the font size -- confirm.
pub line_height_factor: f32,
/// Page limit forwarded to the flow layout, default `2000`.
/// NOTE(review): presumably an upper bound on generated pages; what happens
/// when it is exceeded is decided by `FlowDocument` -- confirm.
pub max_pages: u32,
}
impl Default for HtmlRenderOptions {
    /// Builds the baseline option set: 595 x 842 page, `Margins::a4_standard()`,
    /// 11.0 body size, 1.4 line height factor and a 2000 page limit.
    ///
    /// `font_bytes` starts out empty, so the result is not directly usable by
    /// `render_html_to_pdf` until a font has been supplied.
    fn default() -> Self {
        const DEFAULT_PAGE_SIZE: (f32, f32) = (595.0, 842.0);
        const DEFAULT_BODY_FONT_SIZE: f32 = 11.0;
        const DEFAULT_LINE_HEIGHT_FACTOR: f32 = 1.4;
        const DEFAULT_MAX_PAGES: u32 = 2000;

        Self {
            font_bytes: Vec::new(),
            page_size: DEFAULT_PAGE_SIZE,
            margins: Margins::a4_standard(),
            body_font_size: DEFAULT_BODY_FONT_SIZE,
            line_height_factor: DEFAULT_LINE_HEIGHT_FACTOR,
            max_pages: DEFAULT_MAX_PAGES,
        }
    }
}
pub fn render_html_to_pdf(html: &str, options: HtmlRenderOptions) -> Result<Vec<u8>> {
if options.font_bytes.is_empty() {
return Err(Error::InvalidInput(
"HtmlRenderOptions.font_bytes must be set to a valid TTF/OTF font".into(),
));
}
let flow_opts = FlowOptions {
page_size: options.page_size,
margins: options.margins,
body_font_size: options.body_font_size,
line_height_factor: options.line_height_factor,
max_pages: options.max_pages,
..FlowOptions::default()
};
let mut flow = FlowDocument::new(options.font_bytes, flow_opts)?;
let document = Html::parse_document(html);
walk_iterative(document.root_element(), &mut flow)?;
flow.render()
}
/// A unit of pending work on the traversal stack.
enum WalkStep<'a> {
    /// Process this element (and schedule its children if it is a container).
    Visit(ElementRef<'a>),
    /// Emit a page break. Scheduled *beneath* a container's children so that
    /// it is popped only after the whole subtree has been processed.
    PageBreak,
}

/// Walks the element tree rooted at `root` in document order without
/// recursion, appending recognised content to `flow`.
///
/// An explicit stack is used instead of recursion, so traversal depth does
/// not consume call-stack frames.
///
/// # Errors
///
/// Propagates the first error returned while pushing content into `flow`.
fn walk_iterative<'a>(root: ElementRef<'a>, flow: &mut FlowDocument) -> Result<()> {
    let mut stack: Vec<WalkStep<'a>> = vec![WalkStep::Visit(root)];
    while let Some(step) = stack.pop() {
        match step {
            WalkStep::Visit(elem) => {
                process_one(elem, flow, &mut stack)?;
            }
            WalkStep::PageBreak => {
                flow.push_page_break()?;
            }
        }
    }
    Ok(())
}

/// Handles a single element popped from the traversal stack.
///
/// * `h1`..`h6`, `p`, `table`, `ul`, `ol` are converted to flow content as a
///   whole (their subtree is not descended into by the walker).
/// * `head`, `script`, `style`, `meta`, `link`, `title`, `noscript` are ignored.
/// * Any other element is treated as a container: its element children are
///   pushed onto `stack` so they are visited next, in document order.
///
/// If the element requests a page break (see `has_page_break`), the break is
/// emitted *after* the element's content. For containers this means a
/// `WalkStep::PageBreak` marker is pushed below the children, so the break
/// fires once the entire subtree has been emitted rather than before it.
///
/// # Errors
///
/// Propagates any error returned by `flow`.
fn process_one<'a>(
    elem: ElementRef<'a>,
    flow: &mut FlowDocument,
    stack: &mut Vec<WalkStep<'a>>,
) -> Result<()> {
    let tag = elem.value().name();
    let break_after = has_page_break(elem);
    match tag {
        "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
            // The tag is "h" followed by a single digit; fall back to level 1
            // if parsing ever fails.
            let level: u8 = tag[1..].parse().unwrap_or(1);
            let text = collect_text(elem);
            if !text.trim().is_empty() {
                flow.push_heading(text.trim(), level)?;
            }
        }
        "p" => {
            let text = collect_text(elem);
            if !text.trim().is_empty() {
                flow.push_paragraph(text.trim())?;
            }
        }
        "table" => {
            process_table(elem, flow)?;
        }
        "ul" => {
            process_list(elem, flow, false)?;
        }
        "ol" => {
            process_list(elem, flow, true)?;
        }
        "head" | "script" | "style" | "meta" | "link" | "title" | "noscript" => {}
        _ => {
            // Container: the stack is LIFO, so the break marker goes in first
            // (popped last) and the children go in reverse (first child on top).
            if break_after {
                stack.push(WalkStep::PageBreak);
            }
            let children: Vec<ElementRef<'a>> = elem
                .children()
                .filter_map(ElementRef::wrap)
                .collect();
            stack.extend(children.into_iter().rev().map(WalkStep::Visit));
            return Ok(());
        }
    }
    // Leaf-like elements have already been fully emitted above, so the break
    // can be pushed immediately.
    if break_after {
        flow.push_page_break()?;
    }
    Ok(())
}
/// Concatenates every text fragment yielded by `elem.text()` into one owned
/// string, in iteration order and without inserting separators.
fn collect_text(elem: ElementRef<'_>) -> String {
    let mut buffer = String::new();
    for fragment in elem.text() {
        buffer.push_str(fragment);
    }
    buffer
}
/// Reports whether `elem` requests a page break after itself.
///
/// A break is requested when either:
/// * the inline `style` attribute contains `page-break-after: always` or its
///   modern equivalent `break-after: page`, or
/// * the `class` attribute contains the exact class name `page-break`.
///
/// The style check ignores ASCII case and all whitespace, so spellings such
/// as `page-break-after :always` or `PAGE-BREAK-AFTER:  ALWAYS` are accepted
/// in addition to the two exact spellings `page-break-after: always` and
/// `page-break-after:always`.
fn has_page_break(elem: ElementRef<'_>) -> bool {
    let attrs = elem.value();

    // Normalise the inline style once: drop whitespace and fold ASCII case so
    // a plain substring search tolerates formatting differences.
    let style: String = attrs
        .attr("style")
        .unwrap_or("")
        .chars()
        .filter(|c| !c.is_whitespace())
        .map(|c| c.to_ascii_lowercase())
        .collect();
    if style.contains("page-break-after:always") || style.contains("break-after:page") {
        return true;
    }

    // Class names are compared exactly, token by token.
    attrs
        .attr("class")
        .unwrap_or("")
        .split_whitespace()
        .any(|name| name == "page-break")
}
/// Collects the `tr` elements of `table` in document order.
///
/// Rows are taken from the table's direct children and from the direct
/// children of any `tbody`, `thead` or `tfoot` section. Other children are
/// ignored, and rows nested any deeper are not picked up.
fn table_rows(table: ElementRef<'_>) -> Vec<ElementRef<'_>> {
    fn is_row(candidate: &ElementRef<'_>) -> bool {
        candidate.value().name() == "tr"
    }

    let mut found = Vec::new();
    for node in table.children().filter_map(ElementRef::wrap) {
        let name = node.value().name();
        if name == "tr" {
            found.push(node);
        } else if matches!(name, "tbody" | "thead" | "tfoot") {
            found.extend(node.children().filter_map(ElementRef::wrap).filter(is_row));
        }
    }
    found
}
/// Converts `table` into a two-column key/value table on `flow`.
///
/// For every row returned by `table_rows`, the trimmed text of the first
/// `th`/`td` cell becomes the key and that of the second cell the value (an
/// empty string when the row has only one cell). Rows without cells are
/// skipped and any cells beyond the second are ignored. Nothing is pushed
/// when no row yields a pair.
///
/// # Errors
///
/// Propagates the error from `FlowDocument::push_key_value_table`.
fn process_table(table: ElementRef<'_>, flow: &mut FlowDocument) -> Result<()> {
    let mut pairs: Vec<(String, String)> = Vec::new();

    for row in table_rows(table) {
        let mut cell_texts = row
            .children()
            .filter_map(ElementRef::wrap)
            .filter(|cell| matches!(cell.value().name(), "th" | "td"))
            .map(|cell| collect_text(cell).trim().to_owned());

        // Only the first two cells matter; a missing second cell is blank.
        if let Some(key) = cell_texts.next() {
            let value = cell_texts.next().unwrap_or_default();
            pairs.push((key, value));
        }
    }

    if pairs.is_empty() {
        return Ok(());
    }

    let borrowed: Vec<(&str, &str)> = pairs
        .iter()
        .map(|(key, value)| (key.as_str(), value.as_str()))
        .collect();
    flow.push_key_value_table(&borrowed)
}
/// Pushes the direct `li` children of `list` onto `flow` as a single list.
///
/// Each item is the trimmed text of the `li` element; items that are empty
/// after trimming are dropped. Nothing is pushed when no items remain.
/// `ordered` is forwarded unchanged to `FlowDocument::push_list`.
///
/// # Errors
///
/// Propagates the error from `FlowDocument::push_list`.
fn process_list(list: ElementRef<'_>, flow: &mut FlowDocument, ordered: bool) -> Result<()> {
    let mut entries: Vec<String> = Vec::new();

    for child in list.children().filter_map(ElementRef::wrap) {
        if child.value().name() != "li" {
            continue;
        }
        let raw = collect_text(child);
        let text = raw.trim();
        if !text.is_empty() {
            entries.push(text.to_owned());
        }
    }

    if entries.is_empty() {
        return Ok(());
    }

    let borrowed: Vec<&str> = entries.iter().map(String::as_str).collect();
    flow.push_list(&borrowed, ordered)
}