dongler-core 0.3.5

Rust-native PDF and document extraction core for Markdown, LaTeX, and JSON output.
Documentation
use crate::error::Result;
use crate::ir::{Block, Document, FigureBlock, TableBlock, TextBlock};

/// Converts a [`Document`] into a textual output format.
///
/// This file provides Markdown, JSON and LaTeX implementations.
pub trait Renderer {
    /// Renders `document` and returns the complete output as one string.
    ///
    /// # Errors
    ///
    /// Implementations return `Err` when producing the output fails, for
    /// example when serializing the document to JSON fails.
    fn render(&self, document: &Document) -> Result<String>;
}

/// Stateless [`Renderer`] that emits the document's text, table and figure
/// blocks as Markdown.
#[derive(Debug, Default, Clone, Copy)]
pub struct MarkdownRenderer;

impl Renderer for MarkdownRenderer {
    /// Renders every block of every page in document order and separates the
    /// rendered blocks with a single blank line.
    ///
    /// This implementation never returns `Err` itself.
    fn render(&self, document: &Document) -> Result<String> {
        let markdown = document
            .pages
            .iter()
            .flat_map(|page| page.blocks.iter())
            .map(|block| match block {
                Block::Text(text) => render_markdown_text(text),
                Block::Table(table) => render_markdown_table(table),
                Block::Figure(figure) => render_markdown_figure(figure),
            })
            .collect::<Vec<_>>()
            .join("\n\n");

        Ok(markdown)
    }
}

/// Stateless [`Renderer`] that emits the whole document as pretty-printed JSON.
#[derive(Debug, Default, Clone, Copy)]
pub struct JsonRenderer;

impl Renderer for JsonRenderer {
    /// Serializes `document` with `serde_json::to_string_pretty`.
    ///
    /// # Errors
    ///
    /// Propagates the serialization error, converted into the crate's error
    /// type by the `?` operator.
    fn render(&self, document: &Document) -> Result<String> {
        let json = serde_json::to_string_pretty(document)?;
        Ok(json)
    }
}

/// Stateless [`Renderer`] that emits the document as a standalone LaTeX
/// `article`.
#[derive(Debug, Default, Clone, Copy)]
pub struct LatexRenderer;

impl Renderer for LatexRenderer {
    /// Wraps the rendered blocks of every page in an `article` document.
    ///
    /// Each block is followed by a blank line so that consecutive blocks form
    /// separate LaTeX paragraphs. This implementation never returns `Err`
    /// itself.
    fn render(&self, document: &Document) -> Result<String> {
        let mut latex = String::from("\\documentclass{article}\n\\begin{document}\n\n");

        let blocks = document.pages.iter().flat_map(|page| page.blocks.iter());
        for block in blocks {
            let fragment = match block {
                Block::Text(text) => render_latex_text(text),
                Block::Table(table) => render_latex_table(table),
                Block::Figure(figure) => render_latex_figure(figure),
            };
            latex.push_str(&fragment);
            latex.push_str("\n\n");
        }

        latex.push_str("\\end{document}\n");
        Ok(latex)
    }
}

/// Renders a text block as Markdown.
///
/// * Blocks whose kind is `heading_1` to `heading_6` become ATX headings.
/// * Blocks whose kind is `list` become one `- ` bullet per non-blank line.
/// * Every other block is sanitized and wrapped in emphasis markers when all
///   of its non-blank spans are bold and/or italic.
fn render_markdown_text(text: &TextBlock) -> String {
    if let Some(level) = heading_level(&text.kind) {
        // An ATX heading ends at the first line break, so heading text that
        // wraps over several lines is folded onto one line; otherwise the
        // continuation lines would be rendered as ordinary body text.
        let title = sanitize_markdown_text(&text.text)
            .split_whitespace()
            .collect::<Vec<_>>()
            .join(" ");
        return format!("{} {}", "#".repeat(level), title);
    }
    if text.kind == "list" {
        return text
            .text
            .lines()
            .filter(|line| !line.trim().is_empty())
            .map(|line| format!("- {}", sanitize_markdown_text(line.trim())))
            .collect::<Vec<_>>()
            .join("\n");
    }
    let body = sanitize_markdown_text(&text.text);
    let (bold, italic) = block_emphasis(text);
    emphasize_markdown(&body, bold, italic)
}

/// Reports as `(bold, italic)` whether all non-blank spans of `block` are
/// bold and whether all of them are italic.
///
/// Spans whose text is empty or whitespace-only are ignored. A block without
/// any non-blank span yields `(false, false)`.
fn block_emphasis(block: &TextBlock) -> (bool, bool) {
    let mut visible_spans = block
        .lines
        .iter()
        .flat_map(|line| line.spans.iter())
        .filter(|span| !span.text.trim().is_empty())
        .peekable();

    if visible_spans.peek().is_none() {
        return (false, false);
    }

    visible_spans.fold((true, true), |(all_bold, all_italic), span| {
        (all_bold & span.bold, all_italic & span.italic)
    })
}

/// Wraps `text` in Markdown emphasis markers selected by the flags: `***` for
/// bold and italic, `**` for bold only, `*` for italic only.
///
/// The text is returned unchanged when neither flag is set or when it is
/// empty.
fn emphasize_markdown(text: &str, bold: bool, italic: bool) -> String {
    // Bold contributes two asterisks and italic one, giving 0 to 3 in total.
    let asterisks = 2 * usize::from(bold) + usize::from(italic);
    if asterisks == 0 || text.is_empty() {
        return text.to_owned();
    }

    let marker = "*".repeat(asterisks);
    format!("{marker}{text}{marker}")
}

/// Wraps already-escaped `text` in `\textbf{…}` and/or `\textit{…}` according
/// to the `bold` and `italic` flags.
///
/// The text is returned unchanged when neither flag is set. Empty text is
/// returned as an empty string so that no empty `\textbf{}` / `\textit{}`
/// command is emitted, matching the behaviour of `emphasize_markdown`.
fn emphasize_latex(text: &str, bold: bool, italic: bool) -> String {
    if text.is_empty() {
        return String::new();
    }
    match (bold, italic) {
        (true, true) => format!("\\textbf{{\\textit{{{text}}}}}"),
        (true, false) => format!("\\textbf{{{text}}}"),
        (false, true) => format!("\\textit{{{text}}}"),
        (false, false) => text.to_owned(),
    }
}

/// Renders a table as a Markdown pipe table.
///
/// The column count is the larger of the header count and the longest row.
/// Shorter rows and the header are padded with empty cells. A table without
/// any column renders as an empty string. The header line is always emitted,
/// even when the table has no headers.
fn render_markdown_table(table: &TableBlock) -> String {
    let widest_row = table.rows.iter().map(Vec::len).max().unwrap_or_default();
    let column_count = table.headers.len().max(widest_row);
    if column_count == 0 {
        return String::new();
    }

    let header_line = markdown_row(&normalize_row(&table.headers, column_count));
    let separator_line = markdown_row(&vec!["---".to_owned(); column_count]);

    let mut lines = Vec::with_capacity(table.rows.len() + 2);
    lines.push(header_line);
    lines.push(separator_line);
    for row in &table.rows {
        lines.push(markdown_row(&normalize_row(row, column_count)));
    }
    lines.join("\n")
}

/// Renders a figure as a Markdown image, optionally followed by its caption.
///
/// The alt text is the first available of: explicit alt text, caption, image
/// reference, or the literal `image`. Square brackets are removed from it so
/// they cannot terminate the `![…]` part early. The link target is the image
/// reference, or `#image` when there is none.
///
/// The caption is appended as its own paragraph unless it is empty or merely
/// repeats the alt text.
fn render_markdown_figure(figure: &FigureBlock) -> String {
    let alt_text = figure
        .alt_text
        .as_deref()
        .or(figure.caption.as_deref())
        .or(figure.image_ref.as_deref())
        .unwrap_or("image");
    // Sanitize once and reuse the result for the duplicate check below, so the
    // caption is compared like-for-like. Comparing the sanitized caption with
    // the raw alt text printed the caption twice whenever sanitizing changed
    // the text (for example by collapsing repeated spaces).
    let alt_text = sanitize_markdown_text(alt_text);
    let image_ref = figure.image_ref.as_deref().unwrap_or("#image");
    let image = format!("![{}]({})", alt_text.replace(['[', ']'], ""), image_ref);
    if let Some(caption) = &figure.caption {
        let caption = sanitize_markdown_text(caption);
        if !caption.is_empty() && caption != alt_text {
            return format!("{image}\n\n{caption}");
        }
    }
    image
}

/// Formats `cells` as one line of a Markdown pipe table (`| a | b |`).
///
/// Each cell is sanitized, folded onto a single line, and has its `|`
/// characters escaped so they are not read as column separators.
fn markdown_row(cells: &[String]) -> String {
    format!(
        "| {} |",
        cells
            .iter()
            .map(|cell| {
                // A pipe-table row must stay on one line. Sanitizing keeps the
                // line breaks of multi-line cells, which would split the row,
                // so the remaining whitespace is collapsed to single spaces.
                sanitize_markdown_text(cell)
                    .split_whitespace()
                    .collect::<Vec<_>>()
                    .join(" ")
                    .replace('|', "\\|")
            })
            .collect::<Vec<_>>()
            .join(" | ")
    )
}

/// Cleans text line by line for Markdown output.
///
/// Within each line, non-printing control characters are removed and runs of
/// whitespace are collapsed to single spaces, which also trims the line. Line
/// breaks between lines are preserved as `\n`.
fn sanitize_markdown_text(text: &str) -> String {
    let mut cleaned_lines = Vec::new();

    for raw_line in text.lines() {
        let printable: String = raw_line
            .chars()
            .filter(|&character| !is_non_printing_control(character))
            .collect();
        let words: Vec<&str> = printable.split_whitespace().collect();
        cleaned_lines.push(words.join(" "));
    }

    cleaned_lines.join("\n")
}

/// Returns `true` for control characters other than line feed, carriage
/// return and tab.
fn is_non_printing_control(character: char) -> bool {
    match character {
        '\n' | '\r' | '\t' => false,
        other => other.is_control(),
    }
}

/// Returns a copy of `row` with exactly `width` cells: extra cells are
/// dropped and missing cells are filled with empty strings.
fn normalize_row(row: &[String], width: usize) -> Vec<String> {
    let mut cells = Vec::with_capacity(width);
    cells.extend(row.iter().take(width).cloned());
    while cells.len() < width {
        cells.push(String::new());
    }
    cells
}

/// Renders a text block as LaTeX.
///
/// * Heading kinds map to `\section`, `\subsection` and `\subsubsection` for
///   levels 1 to 3, and to `\paragraph` for deeper levels.
/// * Blocks whose kind is `list` become an `itemize` environment with one
///   `\item` per non-blank line. A list without any non-blank line falls
///   through to the paragraph case.
/// * Every other block is escaped and wrapped in `\textbf` / `\textit` when
///   all of its non-blank spans are bold and/or italic.
fn render_latex_text(text: &TextBlock) -> String {
    if let Some(level) = heading_level(&text.kind) {
        let command = match level {
            1 => "section",
            2 => "subsection",
            3 => "subsubsection",
            _ => "paragraph",
        };
        return format!("\\{command}{{{}}}", escape_latex(&text.text));
    }
    if text.kind == "list" {
        let items = text
            .text
            .lines()
            .filter(|line| !line.trim().is_empty())
            .map(|line| {
                let item = escape_latex(line.trim());
                // `\item` takes an optional `[label]` argument, so an item
                // that starts with `[` (such as "[1] Reference") would lose
                // its bullet and brackets. An empty group in front of it stops
                // LaTeX from reading the bracket as that argument.
                if item.trim_start().starts_with('[') {
                    format!("\\item {{}}{item}")
                } else {
                    format!("\\item {item}")
                }
            })
            .collect::<Vec<_>>();
        if !items.is_empty() {
            return format!("\\begin{{itemize}}\n{}\n\\end{{itemize}}", items.join("\n"));
        }
    }
    let body = escape_latex(&text.text);
    let (bold, italic) = block_emphasis(text);
    emphasize_latex(&body, bold, italic)
}

/// Renders a table as a LaTeX `tabular` environment with left-aligned columns.
///
/// The column count is the larger of the header count and the longest row;
/// shorter rows are padded with empty cells. When headers exist they form the
/// first row, followed by `\hline`. A table without any column renders as an
/// empty string.
fn render_latex_table(table: &TableBlock) -> String {
    let widest_row = table.rows.iter().map(Vec::len).max().unwrap_or_default();
    let columns = table.headers.len().max(widest_row);
    if columns == 0 {
        return String::new();
    }

    let mut latex = String::from("\\begin{tabular}{");
    latex.push_str(&"l".repeat(columns));
    latex.push_str("}\n");

    if !table.headers.is_empty() {
        let header_cells = normalize_row(&table.headers, columns);
        latex.push_str(&latex_row(&header_cells));
        latex.push_str("\\hline\n");
    }

    let body: String = table
        .rows
        .iter()
        .map(|row| latex_row(&normalize_row(row, columns)))
        .collect();
    latex.push_str(&body);

    latex.push_str("\\end{tabular}");
    latex
}

/// Renders a figure as the textual placeholder `[Image: <label>]`.
///
/// The label is the first available of: caption, alt text, image reference,
/// or the literal `image`. It is LaTeX-escaped before being inserted.
fn render_latex_figure(figure: &FigureBlock) -> String {
    let label = if let Some(caption) = figure.caption.as_deref() {
        caption
    } else if let Some(alt_text) = figure.alt_text.as_deref() {
        alt_text
    } else if let Some(image_ref) = figure.image_ref.as_deref() {
        image_ref
    } else {
        "image"
    };

    let escaped = escape_latex(label);
    format!("[Image: {}]", escaped)
}

/// Extracts the heading level from a block kind of the form `heading_<n>`.
///
/// Returns `Some(n)` only when `n` parses as an integer between 1 and 6
/// inclusive; any other kind yields `None`.
fn heading_level(kind: &str) -> Option<usize> {
    let suffix = kind.strip_prefix("heading_")?;
    match suffix.parse::<usize>() {
        Ok(level @ 1..=6) => Some(level),
        _ => None,
    }
}

/// Formats `cells` as one `tabular` row: the escaped cells joined by ` & `
/// and terminated by `\\` and a line break.
fn latex_row(cells: &[String]) -> String {
    let mut row = cells
        .iter()
        .map(|cell| escape_latex(cell))
        .collect::<Vec<_>>()
        .join(" & ");

    // The `\\` that ends the previous row accepts an optional `*` and an
    // optional `[length]`, and LaTeX skips whitespace while looking for them.
    // A row whose first cell starts with `*` or `[` (such as "[1]") would be
    // swallowed or rejected, so an empty group shields it.
    if row.trim_start().starts_with(|first: char| first == '[' || first == '*') {
        row.insert_str(0, "{}");
    }

    row.push_str(" \\\\\n");
    row
}

/// Escapes `text` for inclusion in LaTeX source.
///
/// * LaTeX special characters are replaced by their escaped forms.
/// * Line feeds are kept.
/// * Other whitespace control characters become a space.
/// * Remaining control characters are dropped.
/// * Non-ASCII characters are replaced by an ASCII fallback.
fn escape_latex(text: &str) -> String {
    let mut out = String::with_capacity(text.len());

    for symbol in text.chars() {
        let replacement: &str = match symbol {
            '\\' => "\\textbackslash{}",
            '&' => "\\&",
            '%' => "\\%",
            '$' => "\\$",
            '#' => "\\#",
            '_' => "\\_",
            '{' => "\\{",
            '}' => "\\}",
            '~' => "\\textasciitilde{}",
            '^' => "\\textasciicircum{}",
            '\n' => "\n",
            control if control.is_control() => {
                if control.is_whitespace() {
                    " "
                } else {
                    ""
                }
            }
            wide if !wide.is_ascii() => latex_unicode_ascii_fallback(wide),
            plain => {
                // Ordinary ASCII characters pass through untouched.
                out.push(plain);
                continue;
            }
        };
        out.push_str(replacement);
    }

    out
}

/// Maps a non-ASCII character to an ASCII replacement for LaTeX output.
///
/// Common typographic punctuation and a few mathematical symbols have a
/// readable replacement; every other character becomes `?`.
///
/// All characters are written as `\u{…}` escapes so the table does not depend
/// on how the source file is encoded or copied.
// NOTE(review): several arms previously held empty character literals, which
// do not compile. The code points below are a reconstruction that covers the
// usual typographic variants for each replacement; confirm against the
// intended mapping.
fn latex_unicode_ascii_fallback(character: char) -> &'static str {
    match character {
        // No-break space.
        '\u{00a0}' => " ",
        // Hyphen, non-breaking hyphen, figure dash, en dash, minus sign.
        '\u{2010}' | '\u{2011}' | '\u{2012}' | '\u{2013}' | '\u{2212}' => "-",
        // Em dash, using the LaTeX ligature for it.
        '\u{2014}' => "---",
        // Single quotation marks (left, right, low-9) and prime.
        '\u{2018}' | '\u{2019}' | '\u{201a}' | '\u{2032}' => "'",
        // Double quotation marks (left, right, low-9) and double prime.
        '\u{201c}' | '\u{201d}' | '\u{201e}' | '\u{2033}' => "\"",
        // Bullet.
        '\u{2022}' => "*",
        // Horizontal ellipsis.
        '\u{2026}' => "...",
        // Multiplication and division signs.
        '\u{00d7}' => "x",
        '\u{00f7}' => "/",
        // Less-than-or-equal, greater-than-or-equal, not-equal.
        '\u{2264}' => "<=",
        '\u{2265}' => ">=",
        '\u{2260}' => "!=",
        // Plus-minus sign.
        '\u{00b1}' => "+/-",
        _ => "?",
    }
}