// papyrus_core/renderer/mod.rs

use crate::ast::{Document, Node, Span};

// ── Private helpers ──────────────────────────────────────────────────────────

/// Backslash-escape every character that CommonMark may treat as structural
/// when it appears in inline content.
///
/// Escaped characters (backslash escapes are defined in CommonMark spec §2.4):
/// - ASCII punctuation that can form emphasis, code spans, links, headings,
///   list items, tables, etc.: `\`, `` ` ``, `*`, `_`, `{`, `}`, `[`, `]`,
///   `(`, `)`, `#`, `+`, `-`, `.`, `!`, `|`
/// - `<`, `>` — so the text cannot form an autolink or raw HTML
/// - `&` — so the text cannot form an HTML entity reference (§2.5)
///
/// Every other character (whitespace and non-ASCII text included) is copied
/// through unchanged.
fn escape_text(input: &str) -> String {
    // `input.len()` is only a lower bound: each escaped character adds one
    // extra byte for the backslash.
    let mut out = String::with_capacity(input.len());
    for ch in input.chars() {
        let needs_escape = matches!(
            ch,
            '\\' | '`'
                | '*'
                | '_'
                | '{'
                | '}'
                | '['
                | ']'
                | '('
                | ')'
                | '#'
                | '+'
                | '-'
                | '.'
                | '!'
                | '|'
                | '<'
                | '>'
                | '&'
        );
        if needs_escape {
            out.push('\\');
        }
        out.push(ch);
    }
    out
}

// ── Crate-internal rendering ─────────────────────────────────────────────────

28/// Render a single span to its CommonMark inline representation.
29///
30/// Plain spans preserve whitespace verbatim (inter-word spaces from the PDF
31/// extractor are intentional). Formatted spans trim surrounding whitespace
32/// before applying markers so `"  **bold**  "` — which CommonMark parsers
33/// reject as a valid emphasis run per §6.2 — is never emitted.
34///
35/// When a formatted span's content is whitespace-only (e.g., `" "` in bold
36/// from a spacing glyph in the content stream), the span collapses to a
37/// single space `" "` rather than empty string. This preserves inter-word
38/// boundaries that would otherwise fuse adjacent words:
39/// `"Click"` + `" "(bold)` + `"here"(bold)` → `"Click **here**"` not
40/// `"Click**here**"`.
41pub(crate) fn render_span(span: &Span) -> String {
42    if !span.bold && !span.italic {
43        // Plain spans: preserve text verbatim for spacing; return empty only
44        // for truly empty source text.
45        if span.text.is_empty() {
46            return String::new();
47        }
48        return escape_text(&span.text);
49    }
50
51    // Formatted spans: trim to avoid whitespace-wrapped markers.
52    let core = span.text.trim();
53
54    if core.is_empty() {
55        // Source had only whitespace — preserve one space as a word separator
56        // so adjacent plain spans are not fused by the filter in render_spans.
57        return if span.text.is_empty() {
58            String::new()
59        } else {
60            " ".to_string()
61        };
62    }
63
64    let marker = match (span.bold, span.italic) {
65        (true, true) => "***",
66        (true, false) => "**",
67        (false, true) => "*",
68        (false, false) => unreachable!("plain spans return early above"),
69    };
70
71    format!("{marker}{}{marker}", escape_text(core))
72}
73
74fn render_spans(spans: &[Span]) -> String {
75    spans
76        .iter()
77        .map(render_span)
78        .filter(|s| !s.is_empty())
79        .collect::<String>()
80}
81
/// Strip trailing whitespace from every line of `input`.
///
/// Lines are split with [`str::lines`], so both `\n` and `\r\n` terminators
/// are recognised and a single final line terminator is dropped. The trimmed
/// lines are re-joined with `\n`. Leading whitespace is left untouched, and a
/// whitespace-only line becomes an empty line (it is not removed).
fn trim_trailing_ws_per_line(input: &str) -> String {
    let trimmed: Vec<&str> = input.lines().map(str::trim_end).collect();
    trimmed.join("\n")
}

90/// Render a single AST node to its CommonMark block representation.
91///
92/// Empty headings and paragraphs (all spans resolve to whitespace) produce
93/// an empty string rather than `"### \n\n"` or `"\n\n"`, so the document
94/// normalisation step in `render_document` can strip them cleanly.
95pub(crate) fn render_node(node: &Node) -> String {
96    match node {
97        Node::Heading { level, spans } => {
98            let text = trim_trailing_ws_per_line(&render_spans(spans));
99            if text.is_empty() {
100                return String::new();
101            }
102            let hashes = "#".repeat((*level).clamp(1, 6) as usize);
103            format!("{hashes} {text}\n\n")
104        }
105        Node::Paragraph { spans } => {
106            let text = trim_trailing_ws_per_line(&render_spans(spans));
107            if text.is_empty() {
108                return String::new();
109            }
110            format!("{text}\n\n")
111        }
112        Node::RawText(text) => {
113            // RawText is a best-effort fallback from unresolved fonts; content
114            // is passed through without escaping (it may already be plain text
115            // that should not be double-escaped).
116            let cleaned = trim_trailing_ws_per_line(text);
117            if cleaned.is_empty() {
118                return String::new();
119            }
120            format!("{cleaned}\n\n")
121        }
122    }
123}

// ── Public API ───────────────────────────────────────────────────────────────

127pub fn render_document(document: &Document) -> String {
128    let body = document
129        .nodes
130        .iter()
131        .map(render_node)
132        .collect::<String>()
133        .trim_start_matches('\n')
134        .trim_end_matches('\n')
135        .to_string();
136
137    if body.is_empty() {
138        String::new()
139    } else {
140        format!("{body}\n")
141    }
142}

// ── Tests ────────────────────────────────────────────────────────────────────

/// Unit tests for the renderer, grouped by the function under test.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ast::DocumentMetadata;

    /// Build a span with the given text and style flags; the font fields are
    /// fixed because the renderer does not read them.
    fn span(text: &str, bold: bool, italic: bool) -> Span {
        Span {
            text: text.to_string(),
            bold,
            italic,
            font_size: 12.0,
            font_name: None,
        }
    }

    // ── escape_text ──────────────────────────────────────────────────────────

    #[test]
    fn escape_text_escapes_all_commonmark_special_chars() {
        // Original set
        let raw = r"\`*_{}[]()#+-.!|";
        let escaped = escape_text(raw);
        assert_eq!(escaped, r"\\\`\*\_\{\}\[\]\(\)\#\+\-\.\!\|");
    }

    #[test]
    fn escape_text_escapes_html_structural_chars() {
        // <, >, & must be escaped to prevent autolinks, raw HTML, and entity refs
        assert_eq!(escape_text("<"), r"\<");
        assert_eq!(escape_text(">"), r"\>");
        assert_eq!(escape_text("&"), r"\&");
        assert_eq!(escape_text("A < B & C > D"), r"A \< B \& C \> D");
    }

    #[test]
    fn escape_text_leaves_safe_text_unchanged() {
        assert_eq!(escape_text("Papyrus Renderer 123"), "Papyrus Renderer 123");
    }

    // ── render_span ──────────────────────────────────────────────────────────

    #[test]
    fn render_span_supports_plain_bold_italic_and_bold_italic() {
        assert_eq!(render_span(&span("plain", false, false)), "plain");
        assert_eq!(render_span(&span("bold", true, false)), "**bold**");
        assert_eq!(render_span(&span("italic", false, true)), "*italic*");
        assert_eq!(render_span(&span("both", true, true)), "***both***");
    }

    #[test]
    fn render_span_plain_preserves_whitespace_for_inter_word_spacing() {
        // Space-only plain spans are intentional inter-word separators.
        assert_eq!(render_span(&span(" ", false, false)), " ");
        assert_eq!(render_span(&span("  hello  ", false, false)), "  hello  ");
    }

    #[test]
    fn render_span_drops_empty_formatted_output_but_preserves_spacing() {
        // Truly empty source → empty output
        assert_eq!(render_span(&span("", true, false)), "");
        assert_eq!(render_span(&span("", true, true)), "");
        // Whitespace-only source → single space (word boundary preservation)
        assert_eq!(render_span(&span(" ", true, false)), " ");
        assert_eq!(render_span(&span("   ", true, true)), " ");
        assert_eq!(render_span(&span("\t", false, true)), " ");
    }

    #[test]
    fn render_span_trims_surrounding_whitespace_before_applying_markers() {
        assert_eq!(
            render_span(&span("  bold me  ", true, false)),
            "**bold me**"
        );
        assert_eq!(render_span(&span("\tbold\t", false, true)), "*bold*");
    }

    #[test]
    fn render_span_escapes_inner_text_without_escaping_markers() {
        assert_eq!(render_span(&span("A*B", true, false)), "**A\\*B**");
    }

    #[test]
    fn render_span_escapes_html_chars_in_plain_and_formatted() {
        assert_eq!(render_span(&span("a < b", false, false)), r"a \< b");
        assert_eq!(render_span(&span("a > b", false, false)), r"a \> b");
        assert_eq!(render_span(&span("a & b", false, false)), r"a \& b");
        assert_eq!(render_span(&span("x < y", true, false)), r"**x \< y**");
    }

    #[test]
    fn render_spans_preserves_inter_word_space_from_formatted_whitespace_span() {
        // This is the critical inter-word fusing regression test.
        // PDF extractors emit inter-word spaces as separate spans that inherit
        // the current font's bold/italic state.
        let spans = vec![
            span("Click", false, false),
            span(" ", true, false), // bold space from PDF — must become " " not ""
            span("here", true, false),
        ];
        // The bold space collapses to " ", which render_spans keeps:
        // "Click" + " " + "**here**" = "Click **here**"
        let result = render_spans(&spans);
        assert_eq!(result, "Click **here**");
    }

    // ── render_node ──────────────────────────────────────────────────────────

    #[test]
    fn render_node_heading_uses_hash_prefix_and_blank_line() {
        let node = Node::Heading {
            level: 3,
            spans: vec![span("Heading", false, false)],
        };
        assert_eq!(render_node(&node), "### Heading\n\n");
    }

    #[test]
    fn render_node_heading_level_clamping() {
        // Level 0 → clamp to 1 → "#"
        let h0 = Node::Heading {
            level: 0,
            spans: vec![span("X", false, false)],
        };
        assert_eq!(render_node(&h0), "# X\n\n");
        // Level 7 → clamp to 6 → "######"
        let h7 = Node::Heading {
            level: 7,
            spans: vec![span("X", false, false)],
        };
        assert_eq!(render_node(&h7), "###### X\n\n");
    }

    #[test]
    fn render_node_empty_heading_produces_empty_string() {
        // All-whitespace spans collapse; heading should not emit "### \n\n"
        let node = Node::Heading {
            level: 3,
            spans: vec![span("", true, false), span("   ", false, true)],
        };
        assert_eq!(render_node(&node), "");
    }

    #[test]
    fn render_node_paragraph_joins_spans_without_extra_spaces() {
        let node = Node::Paragraph {
            spans: vec![
                span("Hello", false, false),
                span(" ", false, false),
                span("world", true, false),
            ],
        };
        assert_eq!(render_node(&node), "Hello **world**\n\n");
    }

    #[test]
    fn render_node_empty_paragraph_produces_empty_string() {
        let node = Node::Paragraph {
            spans: vec![span("", true, false)],
        };
        assert_eq!(render_node(&node), "");
    }

    #[test]
    fn render_node_raw_text_passthrough_appends_blank_line() {
        assert_eq!(render_node(&Node::RawText("raw".to_string())), "raw\n\n");
    }

    #[test]
    fn render_node_empty_raw_text_produces_empty_string() {
        assert_eq!(render_node(&Node::RawText(String::new())), "");
        assert_eq!(render_node(&Node::RawText("   ".to_string())), "");
    }

    // ── render_document ──────────────────────────────────────────────────────

    #[test]
    fn render_document_has_single_trailing_newline_for_non_empty_docs() {
        let doc = Document {
            metadata: DocumentMetadata {
                title: None,
                author: None,
                page_count: 1,
            },
            nodes: vec![
                Node::Heading {
                    level: 1,
                    spans: vec![span("Title", false, false)],
                },
                Node::Paragraph {
                    spans: vec![span("Body", false, false)],
                },
            ],
        };

        let markdown = render_document(&doc);
        assert_eq!(markdown, "# Title\n\nBody\n");
        assert!(markdown.ends_with('\n'));
        assert!(!markdown.ends_with("\n\n"));
    }

    #[test]
    fn render_document_empty_doc_is_empty_string() {
        let doc = Document {
            metadata: DocumentMetadata {
                title: None,
                author: None,
                page_count: 0,
            },
            nodes: vec![],
        };
        assert_eq!(render_document(&doc), "");
    }

    #[test]
    fn render_document_skips_empty_nodes_cleanly() {
        // An all-whitespace heading sandwiched between real content must not
        // produce extra blank lines in the output.
        let doc = Document {
            metadata: DocumentMetadata {
                title: None,
                author: None,
                page_count: 1,
            },
            nodes: vec![
                Node::Paragraph {
                    spans: vec![span("Before", false, false)],
                },
                Node::Heading {
                    level: 2,
                    spans: vec![span("   ", false, false)],
                },
                Node::Paragraph {
                    spans: vec![span("After", false, false)],
                },
            ],
        };
        let markdown = render_document(&doc);
        assert_eq!(markdown, "Before\n\nAfter\n");
    }
}
386}