1use crate::error::Result;
2use crate::ir::{Block, BlockKind, Document, FigureBlock, TableBlock, TextBlock};
3
4pub trait Renderer {
5 fn render(&self, document: &Document) -> Result<String>;
6}
7
8#[derive(Debug, Default, Clone, Copy)]
9pub struct MarkdownRenderer;
10
11impl Renderer for MarkdownRenderer {
12 fn render(&self, document: &Document) -> Result<String> {
13 let mut rendered_blocks = Vec::new();
14
15 for page in &document.pages {
16 for block in &page.blocks {
17 match block {
18 Block::Text(text) => {
19 if BlockKind::parse(&text.kind).is_page_furniture() {
22 continue;
23 }
24 rendered_blocks.push(render_markdown_text(text));
25 }
26 Block::Table(table) => rendered_blocks.push(render_markdown_table(table)),
27 Block::Figure(figure) => {
28 rendered_blocks.push(render_markdown_figure(figure));
29 }
30 }
31 }
32 }
33
34 Ok(rendered_blocks.join("\n\n"))
35 }
36}
37
38#[derive(Debug, Default, Clone, Copy)]
39pub struct JsonRenderer;
40
41impl Renderer for JsonRenderer {
42 fn render(&self, document: &Document) -> Result<String> {
43 Ok(serde_json::to_string_pretty(document)?)
44 }
45}
46
47#[derive(Debug, Default, Clone, Copy)]
48pub struct LatexRenderer;
49
50impl Renderer for LatexRenderer {
51 fn render(&self, document: &Document) -> Result<String> {
52 let mut output = String::from(
53 "\\documentclass{article}\n\\usepackage{longtable}\n\\begin{document}\n\n",
54 );
55
56 for page in &document.pages {
57 for block in &page.blocks {
58 match block {
59 Block::Text(text) => {
60 output.push_str(&render_latex_text(text));
61 output.push_str("\n\n");
62 }
63 Block::Table(table) => {
64 output.push_str(&render_latex_table(table));
65 output.push_str("\n\n");
66 }
67 Block::Figure(figure) => {
68 output.push_str(&render_latex_figure(figure));
69 output.push_str("\n\n");
70 }
71 }
72 }
73 }
74
75 output.push_str("\\end{document}\n");
76 Ok(output)
77 }
78}
79
80fn render_markdown_text(text: &TextBlock) -> String {
81 if let Some(level) = heading_level(&text.kind) {
82 return format!(
83 "{} {}",
84 "#".repeat(level),
85 sanitize_markdown_text(&text.text)
86 );
87 }
88 if text.kind == "list" {
89 return text
90 .text
91 .lines()
92 .filter(|line| !line.trim().is_empty())
93 .map(|line| format!("- {}", sanitize_markdown_text(line.trim())))
94 .collect::<Vec<_>>()
95 .join("\n");
96 }
97 let body = sanitize_markdown_text(&text.text);
98 let (bold, italic) = block_emphasis(text);
99 emphasize_markdown(&body, bold, italic)
100}
101
102fn block_emphasis(block: &TextBlock) -> (bool, bool) {
105 let mut any = false;
106 let mut bold = true;
107 let mut italic = true;
108 for span in block.lines.iter().flat_map(|line| line.spans.iter()) {
109 if span.text.trim().is_empty() {
110 continue;
111 }
112 any = true;
113 bold &= span.bold;
114 italic &= span.italic;
115 }
116 if any {
117 (bold, italic)
118 } else {
119 (false, false)
120 }
121}
122
/// Wraps `text` in Markdown emphasis markers: `*` for italic, `**` for bold,
/// `***` for both. Without either flag the text is returned unchanged.
///
/// Leading and trailing whitespace stays outside the markers. CommonMark only
/// treats a delimiter run as emphasis when it directly touches the text, so
/// `**Foo\n**` would print literal asterisks. Empty or whitespace-only text is
/// returned unchanged.
fn emphasize_markdown(text: &str, bold: bool, italic: bool) -> String {
    let marker = match (bold, italic) {
        (true, true) => "***",
        (true, false) => "**",
        (false, true) => "*",
        (false, false) => return text.to_owned(),
    };

    let core = text.trim();
    if core.is_empty() {
        return text.to_owned();
    }

    // `trim_*` only cuts at character boundaries, so these byte offsets are
    // always valid slice indices.
    let leading = &text[..text.len() - text.trim_start().len()];
    let trailing = &text[text.trim_end().len()..];
    format!("{leading}{marker}{core}{marker}{trailing}")
}
135
/// Wraps already-escaped LaTeX `text` in `\textbf{…}` and/or `\textit{…}`.
///
/// Empty or whitespace-only text is returned unchanged so that no empty
/// `\textbf{}` / `\textit{}` group is emitted. Without either flag the text
/// is returned unchanged as well.
fn emphasize_latex(text: &str, bold: bool, italic: bool) -> String {
    if text.trim().is_empty() {
        return text.to_owned();
    }
    match (bold, italic) {
        (true, true) => format!("\\textbf{{\\textit{{{text}}}}}"),
        (true, false) => format!("\\textbf{{{text}}}"),
        (false, true) => format!("\\textit{{{text}}}"),
        (false, false) => text.to_owned(),
    }
}
144
145fn render_markdown_table(table: &TableBlock) -> String {
146 if let Some(html) = &table.html {
150 let html = html.trim();
151 if !html.is_empty() {
152 return html.to_owned();
153 }
154 }
155 if table.cells.iter().any(|c| c.col_span > 1 || c.row_span > 1) {
156 if let Some(html) = render_html_table_from_cells(table) {
157 return html;
158 }
159 }
160
161 let width = table
162 .headers
163 .len()
164 .max(table.rows.iter().map(Vec::len).max().unwrap_or_default());
165
166 if width == 0 {
167 return String::new();
168 }
169
170 let headers = normalize_row(&table.headers, width);
171 let separators = vec!["---".to_owned(); width];
172 let rows = table
173 .rows
174 .iter()
175 .map(|row| normalize_row(row, width))
176 .collect::<Vec<_>>();
177
178 let mut lines = Vec::with_capacity(rows.len() + 2);
179 lines.push(markdown_row(&headers));
180 lines.push(markdown_row(&separators));
181 lines.extend(rows.iter().map(|row| markdown_row(row)));
182 lines.join("\n")
183}
184
185fn render_html_table_from_cells(table: &TableBlock) -> Option<String> {
189 html_table_from_cells(&table.cells, table.caption.as_deref())
190}
191
192pub fn html_table_from_cells(cells: &[crate::ir::TableCell], caption: Option<&str>) -> Option<String> {
197 if cells.is_empty() {
198 return None;
199 }
200 let max_row = cells.iter().map(|c| c.row).max()?;
201 let mut rows: Vec<Vec<&crate::ir::TableCell>> = vec![Vec::new(); max_row + 1];
202 for cell in cells {
203 if cell.row < rows.len() {
204 rows[cell.row].push(cell);
205 }
206 }
207 for row in &mut rows {
208 row.sort_by_key(|c| c.column);
209 }
210
211 let mut html = String::from("<table>\n");
212 if let Some(caption) = caption {
213 let caption = caption.trim();
214 if !caption.is_empty() {
215 html.push_str(&format!("<caption>{}</caption>\n", html_escape(caption)));
216 }
217 }
218 for row in &rows {
219 html.push_str("<tr>");
220 for cell in row {
221 let tag = if cell.is_header { "th" } else { "td" };
222 let mut attrs = String::new();
223 if cell.col_span > 1 {
224 attrs.push_str(&format!(" colspan=\"{}\"", cell.col_span));
225 }
226 if cell.row_span > 1 {
227 attrs.push_str(&format!(" rowspan=\"{}\"", cell.row_span));
228 }
229 html.push_str(&format!(
230 "<{tag}{attrs}>{}</{tag}>",
231 html_escape(cell.text.trim())
232 ));
233 }
234 html.push_str("</tr>\n");
235 }
236 html.push_str("</table>");
237 Some(html)
238}
239
/// Escapes the characters that are significant in HTML text and in
/// double-quoted attribute values: `&`, `<`, `>` and `"`.
///
/// All other characters are copied through unchanged.
fn html_escape(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => out.push_str("&amp;"),
            '<' => out.push_str("&lt;"),
            '>' => out.push_str("&gt;"),
            '"' => out.push_str("&quot;"),
            _ => out.push(ch),
        }
    }
    out
}
253
254fn render_markdown_figure(figure: &FigureBlock) -> String {
255 let alt_text = figure
256 .alt_text
257 .as_deref()
258 .or(figure.caption.as_deref())
259 .or(figure.image_ref.as_deref())
260 .unwrap_or("image");
261 let image_ref = figure.image_ref.as_deref().unwrap_or("#image");
262 let image = format!(
263 "",
264 sanitize_markdown_text(alt_text).replace(['[', ']'], ""),
265 image_ref
266 );
267 if let Some(caption) = &figure.caption {
268 let caption = sanitize_markdown_text(caption);
269 if !caption.is_empty() && caption != alt_text {
270 return format!("{image}\n\n{caption}");
271 }
272 }
273 image
274}
275
276fn markdown_row(cells: &[String]) -> String {
277 format!(
278 "| {} |",
279 cells
280 .iter()
281 .map(|cell| sanitize_markdown_text(cell).replace('|', "\\|"))
282 .collect::<Vec<_>>()
283 .join(" | ")
284 )
285}
286
287fn sanitize_markdown_text(text: &str) -> String {
288 text.lines()
289 .map(|line| {
290 line.chars()
291 .filter(|character| !is_non_printing_control(*character))
292 .collect::<String>()
293 .split_whitespace()
294 .collect::<Vec<_>>()
295 .join(" ")
296 })
297 .collect::<Vec<_>>()
298 .join("\n")
299}
300
/// Returns `true` for control characters other than newline, carriage return
/// and tab.
fn is_non_printing_control(character: char) -> bool {
    match character {
        '\n' | '\r' | '\t' => false,
        other => other.is_control(),
    }
}
304
/// Returns a copy of `row` that is exactly `width` cells long: short rows are
/// padded with empty strings and longer rows are cut off.
fn normalize_row(row: &[String], width: usize) -> Vec<String> {
    row.iter()
        .cloned()
        .chain(std::iter::repeat_with(String::new))
        .take(width)
        .collect()
}
310
311fn render_latex_text(text: &TextBlock) -> String {
312 if let Some(level) = heading_level(&text.kind) {
313 let command = match level {
314 1 => "section",
315 2 => "subsection",
316 3 => "subsubsection",
317 _ => "paragraph",
318 };
319 return format!("\\{command}{{{}}}", escape_latex(&text.text));
320 }
321 if text.kind == "list" {
322 let items = text
323 .text
324 .lines()
325 .filter(|line| !line.trim().is_empty())
326 .map(|line| format!("\\item {}", escape_latex(line.trim())))
327 .collect::<Vec<_>>();
328 if !items.is_empty() {
329 return format!("\\begin{{itemize}}\n{}\n\\end{{itemize}}", items.join("\n"));
330 }
331 }
332 let body = escape_latex(&text.text);
333 let (bold, italic) = block_emphasis(text);
334 emphasize_latex(&body, bold, italic)
335}
336
337fn render_latex_table(table: &TableBlock) -> String {
338 let width = table
339 .headers
340 .len()
341 .max(table.rows.iter().map(Vec::len).max().unwrap_or_default());
342
343 if width == 0 {
344 return String::new();
345 }
346
347 let spec = latex_column_spec(table, width);
348 let environment = if table.rows.len() > 24 {
351 "longtable"
352 } else {
353 "tabular"
354 };
355
356 let mut output = format!("\\begin{{{environment}}}{{{spec}}}\n");
357 if !table.headers.is_empty() {
358 output.push_str(&latex_row(&normalize_row(&table.headers, width)));
359 output.push_str("\\hline\n");
360 }
361
362 for row in &table.rows {
363 output.push_str(&latex_row(&normalize_row(row, width)));
364 }
365
366 output.push_str(&format!("\\end{{{environment}}}"));
367 output
368}
369
370fn latex_column_spec(table: &TableBlock, width: usize) -> String {
374 (0..width)
375 .map(|column| {
376 let (mut total, mut numeric) = (0usize, 0usize);
377 for row in &table.rows {
378 if let Some(cell) = row.get(column) {
379 let cell = cell.trim();
380 if cell.is_empty() {
381 continue;
382 }
383 total += 1;
384 if cell_is_numeric(cell) {
385 numeric += 1;
386 }
387 }
388 }
389 if total > 0 && numeric * 2 >= total {
390 'r'
391 } else {
392 'l'
393 }
394 })
395 .collect()
396}
397
/// Returns `true` when `text` looks like a number: it holds at least one
/// ASCII digit and otherwise only `$ ( ) , . % - +`, spaces, or an em or en
/// dash.
fn cell_is_numeric(text: &str) -> bool {
    let is_numeric_punctuation = |character: char| {
        matches!(
            character,
            '$' | '(' | ')' | ',' | '.' | '%' | '-' | '+' | ' ' | '\u{2014}' | '\u{2013}'
        )
    };

    let only_numeric_characters = text
        .chars()
        .all(|character| character.is_ascii_digit() || is_numeric_punctuation(character));
    only_numeric_characters && text.chars().any(|character| character.is_ascii_digit())
}
411
412fn render_latex_figure(figure: &FigureBlock) -> String {
413 let label = figure
414 .caption
415 .as_deref()
416 .or(figure.alt_text.as_deref())
417 .or(figure.image_ref.as_deref())
418 .unwrap_or("image");
419 format!("[Image: {}]", escape_latex(label))
420}
421
/// Extracts the heading level from a kind such as `heading_2`.
///
/// Returns `None` unless the kind is `heading_` followed by a number from 1
/// to 6.
fn heading_level(kind: &str) -> Option<usize> {
    let digits = kind.strip_prefix("heading_")?;
    match digits.parse::<usize>() {
        Ok(level @ 1..=6) => Some(level),
        _ => None,
    }
}
426
427fn latex_row(cells: &[String]) -> String {
428 format!(
429 "{} \\\\\n",
430 cells
431 .iter()
432 .map(|cell| escape_latex(cell))
433 .collect::<Vec<_>>()
434 .join(" & ")
435 )
436}
437
438fn escape_latex(text: &str) -> String {
439 let mut escaped = String::with_capacity(text.len());
440
441 for character in text.chars() {
442 match character {
443 '\\' => escaped.push_str("\\textbackslash{}"),
444 '&' => escaped.push_str("\\&"),
445 '%' => escaped.push_str("\\%"),
446 '$' => escaped.push_str("\\$"),
447 '#' => escaped.push_str("\\#"),
448 '_' => escaped.push_str("\\_"),
449 '{' => escaped.push_str("\\{"),
450 '}' => escaped.push_str("\\}"),
451 '~' => escaped.push_str("\\textasciitilde{}"),
452 '^' => escaped.push_str("\\textasciicircum{}"),
453 '\n' => escaped.push('\n'),
454 character if character.is_control() && character.is_whitespace() => escaped.push(' '),
455 character if character.is_control() => {}
456 character if !character.is_ascii() => {
457 escaped.push_str(latex_unicode_ascii_fallback(character));
458 }
459 _ => escaped.push(character),
460 }
461 }
462
463 escaped
464}
465
/// Maps a non-ASCII character to an ASCII approximation for LaTeX output.
///
/// Covers common typographic punctuation, a few math symbols, the Latin
/// ff/fi/fl ligatures, Unicode space characters and zero-width characters.
/// Anything without a mapping becomes `"?"`.
fn latex_unicode_ascii_fallback(character: char) -> &'static str {
    match character {
        // Zero-width space and byte-order mark carry no visible content.
        '\u{200b}' | '\u{feff}' => "",
        // Hyphen, non-breaking hyphen, en dash, minus sign.
        '\u{2010}' | '\u{2011}' | '\u{2013}' | '\u{2212}' => "-",
        // Em dash.
        '\u{2014}' => "---",
        // Single quotation marks, including the low-9 form.
        '\u{2018}' | '\u{2019}' | '\u{201a}' => "'",
        // Double quotation marks, including the low-9 form.
        '\u{201c}' | '\u{201d}' | '\u{201e}' => "\"",
        // Bullet.
        '\u{2022}' => "*",
        // Horizontal ellipsis.
        '\u{2026}' => "...",
        // Multiplication and division signs.
        '\u{00d7}' => "x",
        '\u{00f7}' => "/",
        // Comparison operators and plus-minus.
        '\u{2264}' => "<=",
        '\u{2265}' => ">=",
        '\u{2260}' => "!=",
        '\u{00b1}' => "+/-",
        // Latin ligatures.
        '\u{fb00}' => "ff",
        '\u{fb01}' => "fi",
        '\u{fb02}' => "fl",
        '\u{fb03}' => "ffi",
        '\u{fb04}' => "ffl",
        // No-break space, thin space and every other Unicode space.
        other if other.is_whitespace() => " ",
        _ => "?",
    }
}
484
/// Unit tests for the Markdown renderer and its table/HTML helpers.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ir::{Metadata, Page, TableCell};

    /// Builds a table cell; cells in row 0 are marked as header cells.
    fn cell(row: usize, column: usize, text: &str, col_span: usize, row_span: usize) -> TableCell {
        TableCell {
            row,
            column,
            text: text.to_owned(),
            bbox: None,
            is_header: row == 0,
            col_span,
            row_span,
        }
    }

    /// Wraps `blocks` in a single-page document with placeholder metadata.
    fn doc_with(blocks: Vec<Block>) -> Document {
        Document {
            schema_version: crate::ir::SCHEMA_VERSION.to_owned(),
            metadata: Metadata {
                format: "pdf".to_owned(),
                engine: "test".to_owned(),
                source: None,
                title: None,
                character_count: 0,
                word_count: 0,
                block_count: blocks.len(),
                file_size_bytes: None,
                pdf_version: None,
                encrypted: false,
            },
            pages: vec![Page {
                number: 1,
                blocks,
                ..Default::default()
            }],
            assets: Vec::new(),
            warnings: Vec::new(),
        }
    }

    #[test]
    fn prerendered_html_table_is_emitted_verbatim() {
        let table = TableBlock {
            html: Some("<table><tr><td>X</td></tr></table>".to_owned()),
            ..Default::default()
        };
        assert_eq!(
            render_markdown_table(&table),
            "<table><tr><td>X</td></tr></table>"
        );
    }

    #[test]
    fn spanning_cells_render_as_html_with_span_attrs() {
        let table = TableBlock {
            cells: vec![
                cell(0, 0, "Header", 2, 1),
                cell(1, 0, "a", 1, 1),
                cell(1, 1, "b", 1, 1),
            ],
            ..Default::default()
        };
        let out = render_markdown_table(&table);
        assert!(out.starts_with("<table>"), "got: {out}");
        assert!(out.contains("colspan=\"2\""), "got: {out}");
        assert!(out.contains("<th colspan=\"2\">Header</th>"), "got: {out}");
        assert!(out.contains("<td>a</td>"), "got: {out}");
    }

    #[test]
    fn simple_table_without_spans_stays_pipe_markdown() {
        let table = TableBlock {
            headers: vec!["a".to_owned(), "b".to_owned()],
            rows: vec![vec!["1".to_owned(), "2".to_owned()]],
            ..Default::default()
        };
        let out = render_markdown_table(&table);
        assert!(out.contains("| a | b |"), "got: {out}");
        assert!(!out.contains("<table>"), "got: {out}");
    }

    #[test]
    fn html_escape_escapes_markup() {
        assert_eq!(
            html_escape("a < b & \"c\""),
            "a &lt; b &amp; &quot;c&quot;"
        );
    }

    #[test]
    fn page_furniture_excluded_from_markdown() {
        let blocks = vec![
            Block::Text(TextBlock {
                text: "RUNNING HEADER".to_owned(),
                kind: "page_header".to_owned(),
                ..Default::default()
            }),
            Block::Text(TextBlock {
                text: "Body paragraph.".to_owned(),
                kind: "paragraph".to_owned(),
                ..Default::default()
            }),
        ];
        let md = MarkdownRenderer.render(&doc_with(blocks)).unwrap();
        assert!(md.contains("Body paragraph."));
        assert!(!md.contains("RUNNING HEADER"), "furniture leaked: {md}");
    }

    #[test]
    fn heading_kind_renders_with_hashes() {
        let blocks = vec![Block::Text(TextBlock {
            text: "Title".to_owned(),
            kind: "heading_2".to_owned(),
            ..Default::default()
        })];
        let md = MarkdownRenderer.render(&doc_with(blocks)).unwrap();
        assert_eq!(md.trim(), "## Title");
    }
}