harumi 0.6.0

Overlay searchable CJK text on PDFs, extract text, merge/split pages — pure Rust, zero C dependencies
Documentation
//! HTML-to-PDF renderer backed by [`FlowDocument`].
//!
//! Enabled by the `html` feature flag (implies `flow`).
//!
//! # Supported HTML elements
//!
//! | Element | Mapping |
//! |---------|---------|
//! | `<h1>`–`<h6>` | Heading at the corresponding level |
//! | `<p>` | Body paragraph |
//! | `<table><tr><th/td>` | Two-column key/value table |
//! | `<ul><li>` | Bulleted list |
//! | `<ol><li>` | Numbered list |
//! | `<br>` | (ignored; use paragraph breaks instead) |
//! | `style="page-break-after: always"` / `class="page-break"` | Page break |
//! | `<div>`, `<section>`, `<article>`, … | Block container; children are processed |
//! | `<strong>`, `<em>`, … | Text content extracted; styling ignored in v1 |
//! | `<head>`, `<script>`, `<style>`, … | Skipped entirely |

use scraper::{ElementRef, Html};

use crate::{Error, Result};

use super::{FlowDocument, FlowOptions, Margins};

/// Options for [`render_html_to_pdf`].
///
/// Every field is public, so the usual way to build a value is struct-update
/// syntax on top of [`Default`]:
/// `HtmlRenderOptions { font_bytes, ..Default::default() }`.
///
/// Note that the `Default` value leaves `font_bytes` empty, and
/// [`render_html_to_pdf`] rejects an empty font buffer — a font must always be
/// supplied by the caller.
pub struct HtmlRenderOptions {
    /// Raw TTF/OTF font bytes (required). CJK fonts such as NotoSansCJK are supported.
    ///
    /// Must be non-empty; the renderer returns `Error::InvalidInput` otherwise.
    pub font_bytes: Vec<u8>,
    /// Page width and height in PDF points, as `(width, height)`.
    /// Default: A4 (595 × 842).
    pub page_size: (f32, f32),
    /// Page margins. Default: [`Margins::a4_standard`] (20 mm on all sides).
    pub margins: Margins,
    /// Body text font size in PDF points. Default: 11.0.
    pub body_font_size: f32,
    /// Line height multiplier relative to font size. Default: 1.4.
    pub line_height_factor: f32,
    /// Maximum number of pages that may be generated.
    ///
    /// Prevents DoS from very large HTML inputs. Default: 2000.
    /// The value is handed to the flow layer unchanged via `FlowOptions::max_pages`.
    pub max_pages: u32,
}

impl Default for HtmlRenderOptions {
    /// Builds the baseline configuration: A4 page, standard A4 margins,
    /// 11 pt body text, 1.4 line height and a 2000-page cap.
    ///
    /// `font_bytes` starts out empty and has to be filled in by the caller
    /// before the options are usable for rendering.
    fn default() -> Self {
        // A4 expressed in PDF points (width, height).
        let a4_points: (f32, f32) = (595.0, 842.0);

        Self {
            font_bytes: Vec::new(),
            page_size: a4_points,
            margins: Margins::a4_standard(),
            body_font_size: 11.0,
            line_height_factor: 1.4,
            max_pages: 2000,
        }
    }
}

/// Renders an HTML string to PDF bytes.
///
/// The HTML is parsed and mapped to [`FlowDocument`] block elements.
/// Only a document-oriented subset of HTML is supported; see the module docs
/// for the complete element mapping.
///
/// `options.font_bytes` must be non-empty; all other fields have sensible defaults.
///
/// # Errors
/// Returns [`Error::InvalidInput`] if `font_bytes` is empty or `max_pages` is exceeded.
/// Other errors propagate from font embedding or PDF writing.
pub fn render_html_to_pdf(html: &str, options: HtmlRenderOptions) -> Result<Vec<u8>> {
    if options.font_bytes.is_empty() {
        return Err(Error::InvalidInput(
            "HtmlRenderOptions.font_bytes must be set to a valid TTF/OTF font".into(),
        ));
    }

    let flow_opts = FlowOptions {
        page_size: options.page_size,
        margins: options.margins,
        body_font_size: options.body_font_size,
        line_height_factor: options.line_height_factor,
        max_pages: options.max_pages,
        ..FlowOptions::default()
    };

    let mut flow = FlowDocument::new(options.font_bytes, flow_opts)?;

    let document = Html::parse_document(html);
    // Walk the tree iteratively to avoid stack overflows from deeply nested HTML.
    walk_iterative(document.root_element(), &mut flow)?;

    flow.render()
}

// ── Iterative tree walker ─────────────────────────────────────────────────────

/// One unit of pending work on the traversal stack used by `walk_iterative`.
enum WalkStep<'a> {
    /// Dispatch this element through `process_one`.
    Visit(ElementRef<'a>),
    /// Emit a page break. Queued *beneath* a container's children so that it
    /// fires only after the container's whole subtree has been emitted.
    PageBreak,
}

/// Iterative depth-first traversal of the element tree.
///
/// Using an explicit stack instead of recursion prevents stack overflows when
/// processing deeply nested HTML (e.g. `<div><div><div>…</div></div></div>`).
///
/// # Errors
/// Propagates the first error returned while pushing content into `flow`.
fn walk_iterative<'a>(root: ElementRef<'a>, flow: &mut FlowDocument) -> Result<()> {
    let mut stack: Vec<WalkStep<'a>> = vec![WalkStep::Visit(root)];

    while let Some(step) = stack.pop() {
        match step {
            WalkStep::Visit(elem) => process_one(elem, flow, &mut stack)?,
            WalkStep::PageBreak => {
                flow.push_page_break()?;
            }
        }
    }

    Ok(())
}

/// Process a single element.
///
/// * Headings, paragraphs, tables and lists are emitted into `flow` right away
///   as a unit; their children are not visited separately.
/// * Non-content elements (`head`, `script`, `style`, …) are skipped together
///   with their subtree.
/// * Any other element is treated as a block container: its element children
///   are queued on `stack` so that the first child is processed next.
///
/// A page break requested by the element (see `has_page_break`) is emitted
/// *after* the element's content. For leaf-like elements that means right here;
/// for containers a `WalkStep::PageBreak` marker is queued underneath the
/// children, because the children have not been emitted yet at this point.
/// (Emitting the break immediately would place it before the container's
/// content.)
///
/// # Errors
/// Propagates any error returned by the `flow` push methods.
fn process_one<'a>(
    elem: ElementRef<'a>,
    flow: &mut FlowDocument,
    stack: &mut Vec<WalkStep<'a>>,
) -> Result<()> {
    let tag = elem.value().name();

    match tag {
        "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
            // The tag is "h" followed by a single digit; fall back to level 1
            // should the suffix ever fail to parse.
            let level: u8 = tag[1..].parse().unwrap_or(1);
            let text = collect_text(elem);
            let text = text.trim();
            if !text.is_empty() {
                flow.push_heading(text, level)?;
            }
        }

        "p" => {
            let text = collect_text(elem);
            let text = text.trim();
            if !text.is_empty() {
                flow.push_paragraph(text)?;
            }
        }

        "table" => {
            process_table(elem, flow)?;
            // Do NOT queue children — table is handled as a unit.
        }

        "ul" => {
            process_list(elem, flow, false)?;
            // Do NOT queue children — list is handled as a unit.
        }

        "ol" => {
            process_list(elem, flow, true)?;
            // Do NOT queue children — list is handled as a unit.
        }

        // Non-content elements — skip entirely (don't queue children either).
        "head" | "script" | "style" | "meta" | "link" | "title" | "noscript" => {}

        // Block containers and everything else: queue children for processing.
        _ => {
            // The marker goes on first so it ends up below every child and is
            // popped only after the whole subtree — but before any sibling of
            // `elem`, which sits even lower on the stack.
            if has_page_break(elem) {
                stack.push(WalkStep::PageBreak);
            }

            // Append children in document order, then flip that tail so the
            // first child is on top of the stack.
            let first_child = stack.len();
            stack.extend(
                elem.children()
                    .filter_map(ElementRef::wrap)
                    .map(WalkStep::Visit),
            );
            stack[first_child..].reverse();

            return Ok(());
        }
    }

    // Content for this element has already been emitted above, so the break
    // can follow immediately.
    if has_page_break(elem) {
        flow.push_page_break()?;
    }

    Ok(())
}

// ── Helpers ───────────────────────────────────────────────────────────────────

/// Concatenates every text fragment yielded by `elem.text()` into one
/// `String`, in iteration order and without inserting any separator.
///
/// No trimming or whitespace normalisation happens here; callers trim the
/// result themselves.
fn collect_text(elem: ElementRef<'_>) -> String {
    let mut joined = String::new();
    for fragment in elem.text() {
        joined.push_str(fragment);
    }
    joined
}

/// Reports whether `elem` requests a page break after itself.
///
/// Two markers are recognised:
/// * an inline `style` attribute containing the declaration
///   `page-break-after: always`, and
/// * a `class` attribute containing the exact token `page-break`
///   (case-sensitive, whitespace-separated).
///
/// A missing attribute is treated like an empty one.
fn has_page_break(elem: ElementRef<'_>) -> bool {
    let element = elem.value();
    let inline_style = element.attr("style").unwrap_or("");
    let class_list = element.attr("class").unwrap_or("");

    style_forces_page_break(inline_style)
        || class_list.split_whitespace().any(|token| token == "page-break")
}

/// Returns `true` when an inline style string declares `page-break-after: always`.
///
/// The string is split into `;`-separated declarations and each one is compared
/// ASCII-case-insensitively with surrounding whitespace ignored, so spellings
/// such as `page-break-after:always`, `page-break-after : always` or
/// `PAGE-BREAK-AFTER: ALWAYS` all match. Anything from a `!` onwards in the
/// value (e.g. `!important`) is disregarded.
fn style_forces_page_break(style: &str) -> bool {
    style
        .split(';')
        .filter_map(|declaration| declaration.split_once(':'))
        .any(|(property, value)| {
            // Drop a trailing `!important` (or similar) before comparing.
            let keyword = value.split('!').next().unwrap_or("").trim();
            property.trim().eq_ignore_ascii_case("page-break-after")
                && keyword.eq_ignore_ascii_case("always")
        })
}

/// Gathers the `<tr>` rows that belong to `table` itself.
///
/// A row qualifies when it is either a direct element child of `table` or a
/// direct element child of one of the table's `<tbody>` / `<thead>` / `<tfoot>`
/// children. Nothing deeper is inspected, so rows of nested `<table>` elements
/// are never returned. Rows come back in document order.
fn table_rows(table: ElementRef<'_>) -> Vec<ElementRef<'_>> {
    let mut found = Vec::new();

    for section in table.children().filter_map(ElementRef::wrap) {
        let name = section.value().name();

        if name == "tr" {
            // Row attached directly to the table.
            found.push(section);
        } else if matches!(name, "tbody" | "thead" | "tfoot") {
            // Exactly one level of section wrapping is unwrapped.
            found.extend(
                section
                    .children()
                    .filter_map(ElementRef::wrap)
                    .filter(|row| row.value().name() == "tr"),
            );
        }
        // Any other child (including a nested <table>) contributes no rows.
    }

    found
}

/// Emits `table` into `flow` as a two-column key/value table.
///
/// For each row found by `table_rows`, the trimmed text of the first direct
/// `<th>`/`<td>` child becomes the key and that of the second becomes the
/// value; further cells are ignored. A row with a single cell gets an empty
/// value, and a row without any cell is dropped. Nothing is pushed when no row
/// remains.
///
/// # Errors
/// Propagates the error from `FlowDocument::push_key_value_table`.
fn process_table(table: ElementRef<'_>, flow: &mut FlowDocument) -> Result<()> {
    let mut pairs: Vec<(String, String)> = Vec::new();

    for row in table_rows(table) {
        // Only direct <th>/<td> children of this row count as cells.
        let mut cell_texts = row
            .children()
            .filter_map(ElementRef::wrap)
            .filter(|cell| matches!(cell.value().name(), "th" | "td"))
            .map(|cell| collect_text(cell).trim().to_owned());

        if let Some(key) = cell_texts.next() {
            // A missing second cell yields an empty value column.
            let value = cell_texts.next().unwrap_or_default();
            pairs.push((key, value));
        }
    }

    if pairs.is_empty() {
        return Ok(());
    }

    let borrowed: Vec<(&str, &str)> = pairs
        .iter()
        .map(|(key, value)| (key.as_str(), value.as_str()))
        .collect();
    flow.push_key_value_table(&borrowed)
}

/// Emits `list` into `flow` as a bulleted (`ordered == false`) or numbered
/// (`ordered == true`) list.
///
/// Only direct `<li>` children become items; each item is the trimmed result
/// of `collect_text` for that `<li>`. Items that are empty after trimming are
/// dropped, and nothing is pushed when no item remains.
///
/// # Errors
/// Propagates the error from `FlowDocument::push_list`.
fn process_list(list: ElementRef<'_>, flow: &mut FlowDocument, ordered: bool) -> Result<()> {
    let mut entries: Vec<String> = Vec::new();

    for node in list.children() {
        // Skip non-element nodes and any element that is not a direct <li>.
        if let Some(item) = ElementRef::wrap(node) {
            if item.value().name() != "li" {
                continue;
            }

            let raw = collect_text(item);
            let text = raw.trim();
            if !text.is_empty() {
                entries.push(text.to_owned());
            }
        }
    }

    if entries.is_empty() {
        return Ok(());
    }

    let borrowed: Vec<&str> = entries.iter().map(|entry| entry.as_str()).collect();
    flow.push_list(&borrowed, ordered)
}