Skip to main content

dongler_core/
render.rs

1use crate::error::Result;
2use crate::ir::{Block, BlockKind, Document, FigureBlock, TableBlock, TextBlock};
3
4pub trait Renderer {
5    fn render(&self, document: &Document) -> Result<String>;
6}
7
8#[derive(Debug, Default, Clone, Copy)]
9pub struct MarkdownRenderer;
10
11impl Renderer for MarkdownRenderer {
12    fn render(&self, document: &Document) -> Result<String> {
13        let mut rendered_blocks = Vec::new();
14
15        for page in &document.pages {
16            for block in &page.blocks {
17                match block {
18                    Block::Text(text) => {
19                        // Page furniture (running headers/footers) is kept in the
20                        // IR but excluded from default Markdown, per PRD §4.G.
21                        if BlockKind::parse(&text.kind).is_page_furniture() {
22                            continue;
23                        }
24                        rendered_blocks.push(render_markdown_text(text));
25                    }
26                    Block::Table(table) => rendered_blocks.push(render_markdown_table(table)),
27                    Block::Figure(figure) => {
28                        rendered_blocks.push(render_markdown_figure(figure));
29                    }
30                }
31            }
32        }
33
34        Ok(rendered_blocks.join("\n\n"))
35    }
36}
37
38#[derive(Debug, Default, Clone, Copy)]
39pub struct JsonRenderer;
40
41impl Renderer for JsonRenderer {
42    fn render(&self, document: &Document) -> Result<String> {
43        Ok(serde_json::to_string_pretty(document)?)
44    }
45}
46
47#[derive(Debug, Default, Clone, Copy)]
48pub struct LatexRenderer;
49
50impl Renderer for LatexRenderer {
51    fn render(&self, document: &Document) -> Result<String> {
52        let mut output = String::from(
53            "\\documentclass{article}\n\\usepackage{longtable}\n\\begin{document}\n\n",
54        );
55
56        for page in &document.pages {
57            for block in &page.blocks {
58                match block {
59                    Block::Text(text) => {
60                        output.push_str(&render_latex_text(text));
61                        output.push_str("\n\n");
62                    }
63                    Block::Table(table) => {
64                        output.push_str(&render_latex_table(table));
65                        output.push_str("\n\n");
66                    }
67                    Block::Figure(figure) => {
68                        output.push_str(&render_latex_figure(figure));
69                        output.push_str("\n\n");
70                    }
71                }
72            }
73        }
74
75        output.push_str("\\end{document}\n");
76        Ok(output)
77    }
78}
79
80fn render_markdown_text(text: &TextBlock) -> String {
81    if let Some(level) = heading_level(&text.kind) {
82        return format!(
83            "{} {}",
84            "#".repeat(level),
85            sanitize_markdown_text(&text.text)
86        );
87    }
88    if text.kind == "list" {
89        return text
90            .text
91            .lines()
92            .filter(|line| !line.trim().is_empty())
93            .map(|line| format!("- {}", sanitize_markdown_text(line.trim())))
94            .collect::<Vec<_>>()
95            .join("\n");
96    }
97    let body = sanitize_markdown_text(&text.text);
98    let (bold, italic) = block_emphasis(text);
99    emphasize_markdown(&body, bold, italic)
100}
101
102/// Whether every non-blank span of a block is bold and/or italic, so the whole
103/// block can be wrapped in emphasis markers without losing the cleaned text.
104fn block_emphasis(block: &TextBlock) -> (bool, bool) {
105    let mut any = false;
106    let mut bold = true;
107    let mut italic = true;
108    for span in block.lines.iter().flat_map(|line| line.spans.iter()) {
109        if span.text.trim().is_empty() {
110            continue;
111        }
112        any = true;
113        bold &= span.bold;
114        italic &= span.italic;
115    }
116    if any {
117        (bold, italic)
118    } else {
119        (false, false)
120    }
121}
122
/// Wraps `text` in Markdown emphasis markers: `*` for italic, `**` for bold,
/// `***` for both. Empty text and un-emphasized text are returned unchanged.
fn emphasize_markdown(text: &str, bold: bool, italic: bool) -> String {
    if text.is_empty() || !(bold || italic) {
        return text.to_owned();
    }
    // Italic contributes one asterisk, bold contributes two.
    let asterisks = usize::from(italic) + 2 * usize::from(bold);
    let marker = "*".repeat(asterisks);
    format!("{marker}{text}{marker}")
}
135
/// Wraps already-escaped `text` in `\textbf` and/or `\textit`.
///
/// Empty text is returned as an empty string rather than an empty command
/// such as `\textbf{}`, matching how `emphasize_markdown` treats empty input.
fn emphasize_latex(text: &str, bold: bool, italic: bool) -> String {
    if text.is_empty() {
        return String::new();
    }
    match (bold, italic) {
        (true, true) => format!("\\textbf{{\\textit{{{text}}}}}"),
        (true, false) => format!("\\textbf{{{text}}}"),
        (false, true) => format!("\\textit{{{text}}}"),
        (false, false) => text.to_owned(),
    }
}
144
145fn render_markdown_table(table: &TableBlock) -> String {
146    // PRD default: embed HTML tables so row/col spans survive. Prefer a
147    // pre-rendered html string, then synthesize HTML from spanning cells; fall
148    // back to a GFM pipe table for simple (span-free) tables.
149    if let Some(html) = &table.html {
150        let html = html.trim();
151        if !html.is_empty() {
152            return html.to_owned();
153        }
154    }
155    if table.cells.iter().any(|c| c.col_span > 1 || c.row_span > 1) {
156        if let Some(html) = render_html_table_from_cells(table) {
157            return html;
158        }
159    }
160
161    let width = table
162        .headers
163        .len()
164        .max(table.rows.iter().map(Vec::len).max().unwrap_or_default());
165
166    if width == 0 {
167        return String::new();
168    }
169
170    let headers = normalize_row(&table.headers, width);
171    let separators = vec!["---".to_owned(); width];
172    let rows = table
173        .rows
174        .iter()
175        .map(|row| normalize_row(row, width))
176        .collect::<Vec<_>>();
177
178    let mut lines = Vec::with_capacity(rows.len() + 2);
179    lines.push(markdown_row(&headers));
180    lines.push(markdown_row(&separators));
181    lines.extend(rows.iter().map(|row| markdown_row(row)));
182    lines.join("\n")
183}
184
185/// Build an HTML `<table>` from explicit cells, preserving `colspan`/`rowspan`.
186/// Spanned-over grid positions are omitted from `cells`, so each cell is emitted
187/// once with its span attributes. Returns `None` if there are no cells.
188fn render_html_table_from_cells(table: &TableBlock) -> Option<String> {
189    html_table_from_cells(&table.cells, table.caption.as_deref())
190}
191
192/// Build an HTML `<table>` from explicit cells (with optional caption), preserving
193/// `colspan`/`rowspan`. Public so pipeline stages that produce cells directly
194/// (e.g. the SLANet table path) render identically to the Markdown renderer.
195/// Returns `None` if `cells` is empty.
196pub fn html_table_from_cells(cells: &[crate::ir::TableCell], caption: Option<&str>) -> Option<String> {
197    if cells.is_empty() {
198        return None;
199    }
200    let max_row = cells.iter().map(|c| c.row).max()?;
201    let mut rows: Vec<Vec<&crate::ir::TableCell>> = vec![Vec::new(); max_row + 1];
202    for cell in cells {
203        if cell.row < rows.len() {
204            rows[cell.row].push(cell);
205        }
206    }
207    for row in &mut rows {
208        row.sort_by_key(|c| c.column);
209    }
210
211    let mut html = String::from("<table>\n");
212    if let Some(caption) = caption {
213        let caption = caption.trim();
214        if !caption.is_empty() {
215            html.push_str(&format!("<caption>{}</caption>\n", html_escape(caption)));
216        }
217    }
218    for row in &rows {
219        html.push_str("<tr>");
220        for cell in row {
221            let tag = if cell.is_header { "th" } else { "td" };
222            let mut attrs = String::new();
223            if cell.col_span > 1 {
224                attrs.push_str(&format!(" colspan=\"{}\"", cell.col_span));
225            }
226            if cell.row_span > 1 {
227                attrs.push_str(&format!(" rowspan=\"{}\"", cell.row_span));
228            }
229            html.push_str(&format!(
230                "<{tag}{attrs}>{}</{tag}>",
231                html_escape(cell.text.trim())
232            ));
233        }
234        html.push_str("</tr>\n");
235    }
236    html.push_str("</table>");
237    Some(html)
238}
239
/// Escapes `&`, `<`, `>` and `"` as HTML entities; every other character is
/// copied through unchanged.
fn html_escape(text: &str) -> String {
    text.chars()
        .fold(String::with_capacity(text.len()), |mut escaped, ch| {
            match ch {
                '&' => escaped.push_str("&amp;"),
                '<' => escaped.push_str("&lt;"),
                '>' => escaped.push_str("&gt;"),
                '"' => escaped.push_str("&quot;"),
                other => escaped.push(other),
            }
            escaped
        })
}
253
254fn render_markdown_figure(figure: &FigureBlock) -> String {
255    let alt_text = figure
256        .alt_text
257        .as_deref()
258        .or(figure.caption.as_deref())
259        .or(figure.image_ref.as_deref())
260        .unwrap_or("image");
261    let image_ref = figure.image_ref.as_deref().unwrap_or("#image");
262    let image = format!(
263        "![{}]({})",
264        sanitize_markdown_text(alt_text).replace(['[', ']'], ""),
265        image_ref
266    );
267    if let Some(caption) = &figure.caption {
268        let caption = sanitize_markdown_text(caption);
269        if !caption.is_empty() && caption != alt_text {
270            return format!("{image}\n\n{caption}");
271        }
272    }
273    image
274}
275
276fn markdown_row(cells: &[String]) -> String {
277    format!(
278        "| {} |",
279        cells
280            .iter()
281            .map(|cell| sanitize_markdown_text(cell).replace('|', "\\|"))
282            .collect::<Vec<_>>()
283            .join(" | ")
284    )
285}
286
287fn sanitize_markdown_text(text: &str) -> String {
288    text.lines()
289        .map(|line| {
290            line.chars()
291                .filter(|character| !is_non_printing_control(*character))
292                .collect::<String>()
293                .split_whitespace()
294                .collect::<Vec<_>>()
295                .join(" ")
296        })
297        .collect::<Vec<_>>()
298        .join("\n")
299}
300
/// True for control characters other than newline, carriage return and tab.
fn is_non_printing_control(character: char) -> bool {
    match character {
        '\n' | '\r' | '\t' => false,
        other => other.is_control(),
    }
}
304
/// Returns a copy of `row` that is exactly `width` cells long: short rows are
/// padded with empty strings and longer rows are cut off.
fn normalize_row(row: &[String], width: usize) -> Vec<String> {
    row.iter()
        .cloned()
        .chain(std::iter::repeat_with(String::new))
        .take(width)
        .collect()
}
310
311fn render_latex_text(text: &TextBlock) -> String {
312    if let Some(level) = heading_level(&text.kind) {
313        let command = match level {
314            1 => "section",
315            2 => "subsection",
316            3 => "subsubsection",
317            _ => "paragraph",
318        };
319        return format!("\\{command}{{{}}}", escape_latex(&text.text));
320    }
321    if text.kind == "list" {
322        let items = text
323            .text
324            .lines()
325            .filter(|line| !line.trim().is_empty())
326            .map(|line| format!("\\item {}", escape_latex(line.trim())))
327            .collect::<Vec<_>>();
328        if !items.is_empty() {
329            return format!("\\begin{{itemize}}\n{}\n\\end{{itemize}}", items.join("\n"));
330        }
331    }
332    let body = escape_latex(&text.text);
333    let (bold, italic) = block_emphasis(text);
334    emphasize_latex(&body, bold, italic)
335}
336
337fn render_latex_table(table: &TableBlock) -> String {
338    let width = table
339        .headers
340        .len()
341        .max(table.rows.iter().map(Vec::len).max().unwrap_or_default());
342
343    if width == 0 {
344        return String::new();
345    }
346
347    let spec = latex_column_spec(table, width);
348    // A long statement (e.g. a full cash-flow) overruns a single `tabular` page;
349    // `longtable` breaks across pages so the whole table is actually readable.
350    let environment = if table.rows.len() > 24 {
351        "longtable"
352    } else {
353        "tabular"
354    };
355
356    let mut output = format!("\\begin{{{environment}}}{{{spec}}}\n");
357    if !table.headers.is_empty() {
358        output.push_str(&latex_row(&normalize_row(&table.headers, width)));
359        output.push_str("\\hline\n");
360    }
361
362    for row in &table.rows {
363        output.push_str(&latex_row(&normalize_row(row, width)));
364    }
365
366    output.push_str(&format!("\\end{{{environment}}}"));
367    output
368}
369
370/// LaTeX column spec: a column is right-aligned (`r`) when its body cells are
371/// mostly figures (so a financial statement's number columns line up the way the
372/// source does); everything else is left-aligned (`l`).
373fn latex_column_spec(table: &TableBlock, width: usize) -> String {
374    (0..width)
375        .map(|column| {
376            let (mut total, mut numeric) = (0usize, 0usize);
377            for row in &table.rows {
378                if let Some(cell) = row.get(column) {
379                    let cell = cell.trim();
380                    if cell.is_empty() {
381                        continue;
382                    }
383                    total += 1;
384                    if cell_is_numeric(cell) {
385                        numeric += 1;
386                    }
387                }
388            }
389            if total > 0 && numeric * 2 >= total {
390                'r'
391            } else {
392                'l'
393            }
394        })
395        .collect()
396}
397
/// Whether a cell reads as a figure: it contains at least one ASCII digit and
/// nothing besides digits and the usual decoration — `$`, parentheses, commas,
/// a decimal point, `%`, `+`, `-`, spaces, and en/em dashes.
fn cell_is_numeric(text: &str) -> bool {
    const DECORATION: [char; 11] = [
        '$', '(', ')', ',', '.', '%', '-', '+', ' ', '\u{2014}', '\u{2013}',
    ];

    let only_figure_characters = text
        .chars()
        .all(|character| character.is_ascii_digit() || DECORATION.contains(&character));
    only_figure_characters && text.chars().any(|character| character.is_ascii_digit())
}
411
412fn render_latex_figure(figure: &FigureBlock) -> String {
413    let label = figure
414        .caption
415        .as_deref()
416        .or(figure.alt_text.as_deref())
417        .or(figure.image_ref.as_deref())
418        .unwrap_or("image");
419    format!("[Image: {}]", escape_latex(label))
420}
421
/// Extracts the level from a `heading_N` kind string. Returns `None` unless
/// the kind has that prefix and `N` parses to a number from 1 to 6.
fn heading_level(kind: &str) -> Option<usize> {
    let suffix = kind.strip_prefix("heading_")?;
    match suffix.parse::<usize>() {
        Ok(level @ 1..=6) => Some(level),
        _ => None,
    }
}
426
427fn latex_row(cells: &[String]) -> String {
428    format!(
429        "{} \\\\\n",
430        cells
431            .iter()
432            .map(|cell| escape_latex(cell))
433            .collect::<Vec<_>>()
434            .join(" & ")
435    )
436}
437
438fn escape_latex(text: &str) -> String {
439    let mut escaped = String::with_capacity(text.len());
440
441    for character in text.chars() {
442        match character {
443            '\\' => escaped.push_str("\\textbackslash{}"),
444            '&' => escaped.push_str("\\&"),
445            '%' => escaped.push_str("\\%"),
446            '$' => escaped.push_str("\\$"),
447            '#' => escaped.push_str("\\#"),
448            '_' => escaped.push_str("\\_"),
449            '{' => escaped.push_str("\\{"),
450            '}' => escaped.push_str("\\}"),
451            '~' => escaped.push_str("\\textasciitilde{}"),
452            '^' => escaped.push_str("\\textasciicircum{}"),
453            '\n' => escaped.push('\n'),
454            character if character.is_control() && character.is_whitespace() => escaped.push(' '),
455            character if character.is_control() => {}
456            character if !character.is_ascii() => {
457                escaped.push_str(latex_unicode_ascii_fallback(character));
458            }
459            _ => escaped.push(character),
460        }
461    }
462
463    escaped
464}
465
/// Maps a non-ASCII character to an ASCII approximation for LaTeX output.
/// Characters without an entry in the table become `?`.
fn latex_unicode_ascii_fallback(character: char) -> &'static str {
    const FALLBACKS: &[(char, &str)] = &[
        ('\u{00a0}', " "),
        ('–', "-"),
        ('−', "-"),
        ('—', "---"),
        ('‘', "'"),
        ('’', "'"),
        ('‚', "'"),
        ('“', "\""),
        ('”', "\""),
        ('„', "\""),
        ('•', "*"),
        ('…', "..."),
        ('×', "x"),
        ('÷', "/"),
        ('≤', "<="),
        ('≥', ">="),
        ('≠', "!="),
        ('±', "+/-"),
    ];

    FALLBACKS
        .iter()
        .find(|(source, _)| *source == character)
        .map_or("?", |(_, ascii)| *ascii)
}
484
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ir::{Metadata, Page, TableCell};

    /// Builds a table cell at `(row, column)`; cells in row 0 are headers.
    fn cell(row: usize, column: usize, text: &str, col_span: usize, row_span: usize) -> TableCell {
        let is_header = row == 0;
        TableCell {
            row,
            column,
            text: text.to_owned(),
            bbox: None,
            is_header,
            col_span,
            row_span,
        }
    }

    /// Wraps `blocks` in a single-page document with placeholder metadata.
    fn doc_with(blocks: Vec<Block>) -> Document {
        let metadata = Metadata {
            format: "pdf".to_owned(),
            engine: "test".to_owned(),
            source: None,
            title: None,
            character_count: 0,
            word_count: 0,
            block_count: blocks.len(),
            file_size_bytes: None,
            pdf_version: None,
            encrypted: false,
        };
        let page = Page {
            number: 1,
            blocks,
            ..Default::default()
        };
        Document {
            schema_version: crate::ir::SCHEMA_VERSION.to_owned(),
            metadata,
            pages: vec![page],
            assets: Vec::new(),
            warnings: Vec::new(),
        }
    }

    /// Builds a text block of the given kind.
    fn text_block(kind: &str, text: &str) -> Block {
        Block::Text(TextBlock {
            text: text.to_owned(),
            kind: kind.to_owned(),
            ..Default::default()
        })
    }

    #[test]
    fn prerendered_html_table_is_emitted_verbatim() {
        let html = "<table><tr><td>X</td></tr></table>";
        let table = TableBlock {
            html: Some(html.to_owned()),
            ..Default::default()
        };
        assert_eq!(render_markdown_table(&table), html);
    }

    #[test]
    fn spanning_cells_render_as_html_with_span_attrs() {
        let cells = vec![
            cell(0, 0, "Header", 2, 1),
            cell(1, 0, "a", 1, 1),
            cell(1, 1, "b", 1, 1),
        ];
        let table = TableBlock {
            cells,
            ..Default::default()
        };
        let out = render_markdown_table(&table);
        assert!(out.starts_with("<table>"), "got: {out}");
        assert!(out.contains("colspan=\"2\""), "got: {out}");
        assert!(out.contains("<th colspan=\"2\">Header</th>"), "got: {out}");
        assert!(out.contains("<td>a</td>"), "got: {out}");
    }

    #[test]
    fn simple_table_without_spans_stays_pipe_markdown() {
        let table = TableBlock {
            headers: vec!["a".to_owned(), "b".to_owned()],
            rows: vec![vec!["1".to_owned(), "2".to_owned()]],
            ..Default::default()
        };
        let out = render_markdown_table(&table);
        assert!(out.contains("| a | b |"), "got: {out}");
        assert!(!out.contains("<table>"), "got: {out}");
    }

    #[test]
    fn html_escape_escapes_markup() {
        let escaped = html_escape("a < b & \"c\"");
        assert_eq!(escaped, "a &lt; b &amp; &quot;c&quot;");
    }

    #[test]
    fn page_furniture_excluded_from_markdown() {
        let blocks = vec![
            text_block("page_header", "RUNNING HEADER"),
            text_block("paragraph", "Body paragraph."),
        ];
        let md = MarkdownRenderer.render(&doc_with(blocks)).unwrap();
        assert!(md.contains("Body paragraph."));
        assert!(!md.contains("RUNNING HEADER"), "furniture leaked: {md}");
    }

    #[test]
    fn heading_kind_renders_with_hashes() {
        let blocks = vec![text_block("heading_2", "Title")];
        let md = MarkdownRenderer.render(&doc_with(blocks)).unwrap();
        assert_eq!(md.trim(), "## Title");
    }
}