Skip to main content

justpdf_core/text/
format.rs

//! Output formats for extracted text: plain text, HTML, JSON, Markdown.
2
3use super::PageText;
4
/// Selects how extracted page text is rendered by [`format_page`] and
/// [`format_pages`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum OutputFormat {
    /// The page's plain text, with no markup.
    PlainText,
    /// HTML: one `<div class="page">` per page, one `<p>` per text block.
    Html,
    /// Pretty-printed JSON describing blocks, lines and words.
    Json,
    /// Markdown: block texts separated by blank lines.
    Markdown,
}
13
14/// Format a single page's text in the specified format.
15pub fn format_page(page: &PageText, format: OutputFormat) -> String {
16    match format {
17        OutputFormat::PlainText => format_plain(page),
18        OutputFormat::Html => format_html(page),
19        OutputFormat::Json => format_json(page),
20        OutputFormat::Markdown => format_markdown(page),
21    }
22}
23
24/// Format multiple pages.
25pub fn format_pages(pages: &[PageText], format: OutputFormat) -> String {
26    match format {
27        OutputFormat::PlainText => {
28            pages
29                .iter()
30                .map(format_plain)
31                .collect::<Vec<_>>()
32                .join("\n\n")
33        }
34        OutputFormat::Html => format_html_multi(pages),
35        OutputFormat::Json => format_json_multi(pages),
36        OutputFormat::Markdown => {
37            pages
38                .iter()
39                .enumerate()
40                .map(|(i, p)| {
41                    let mut s = format!("## Page {}\n\n", i + 1);
42                    s.push_str(&format_markdown(p));
43                    s
44                })
45                .collect::<Vec<_>>()
46                .join("\n\n---\n\n")
47        }
48    }
49}
50
// ---------------------------------------------------------------------------
// Plain text
// ---------------------------------------------------------------------------
54
55fn format_plain(page: &PageText) -> String {
56    page.plain_text()
57}
58
// ---------------------------------------------------------------------------
// HTML
// ---------------------------------------------------------------------------
62
63fn format_html(page: &PageText) -> String {
64    let mut html = String::new();
65    html.push_str("<div class=\"page\">\n");
66
67    for block in &page.blocks {
68        html.push_str("  <p>");
69        for (i, line) in block.lines.iter().enumerate() {
70            if i > 0 {
71                html.push_str("<br/>\n    ");
72            }
73            html.push_str(&html_escape(&line.text));
74        }
75        html.push_str("</p>\n");
76    }
77
78    html.push_str("</div>");
79    html
80}
81
82fn format_html_multi(pages: &[PageText]) -> String {
83    let mut html = String::new();
84    html.push_str("<!DOCTYPE html>\n<html>\n<head>\n");
85    html.push_str("  <meta charset=\"utf-8\">\n");
86    html.push_str("  <title>Extracted Text</title>\n");
87    html.push_str("  <style>\n");
88    html.push_str("    .page { margin-bottom: 2em; padding-bottom: 1em; border-bottom: 1px solid #ccc; }\n");
89    html.push_str("    p { margin: 0.5em 0; }\n");
90    html.push_str("  </style>\n");
91    html.push_str("</head>\n<body>\n");
92
93    for (i, page) in pages.iter().enumerate() {
94        html.push_str(&format!("<h2>Page {}</h2>\n", i + 1));
95        html.push_str(&format_html(page));
96        html.push('\n');
97    }
98
99    html.push_str("</body>\n</html>");
100    html
101}
102
/// Escape the characters `&`, `<`, `>` and `"` as HTML entities.
///
/// Every other character, including `'`, is copied through unchanged. An `&`
/// that is already part of an entity in the input is escaped again
/// (`&amp;` becomes `&amp;amp;`).
fn html_escape(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
109
// ---------------------------------------------------------------------------
// JSON
// ---------------------------------------------------------------------------
113
114fn format_json(page: &PageText) -> String {
115    let mut json = String::new();
116    json.push_str("{\n");
117    json.push_str(&format!("  \"page_index\": {},\n", page.page_index));
118
119    // Blocks
120    if page.blocks.is_empty() {
121        json.push_str("  \"blocks\": [],\n");
122    } else {
123        json.push_str("  \"blocks\": [\n");
124        for (bi, block) in page.blocks.iter().enumerate() {
125            json.push_str("    {\n");
126            json.push_str(&format!("      \"text\": {},\n", json_string(&block.text)));
127
128            // Lines
129            json.push_str("      \"lines\": [\n");
130            for (li, line) in block.lines.iter().enumerate() {
131                json.push_str("        {\n");
132                json.push_str(&format!("          \"text\": {},\n", json_string(&line.text)));
133                json.push_str(&format!("          \"x\": {:.2},\n", line.x));
134                json.push_str(&format!("          \"y\": {:.2},\n", line.y));
135
136                // Words
137                json.push_str("          \"words\": [\n");
138                for (wi, word) in line.words.iter().enumerate() {
139                    json.push_str("            {\n");
140                    json.push_str(&format!("              \"text\": {},\n", json_string(&word.text)));
141                    json.push_str(&format!("              \"x\": {:.2},\n", word.x));
142                    json.push_str(&format!("              \"y\": {:.2},\n", word.y));
143                    json.push_str(&format!("              \"width\": {:.2},\n", word.width));
144                    json.push_str(&format!("              \"font_size\": {:.2}\n", word.font_size));
145                    json.push_str("            }");
146                    if wi + 1 < line.words.len() {
147                        json.push(',');
148                    }
149                    json.push('\n');
150                }
151                json.push_str("          ]\n");
152
153                json.push_str("        }");
154                if li + 1 < block.lines.len() {
155                    json.push(',');
156                }
157                json.push('\n');
158            }
159            json.push_str("      ]\n");
160
161            json.push_str("    }");
162            if bi + 1 < page.blocks.len() {
163                json.push(',');
164            }
165            json.push('\n');
166        }
167        json.push_str("  ],\n");
168    }
169
170    // Characters (compact)
171    json.push_str(&format!("  \"char_count\": {}\n", page.chars.len()));
172
173    json.push('}');
174    json
175}
176
177fn format_json_multi(pages: &[PageText]) -> String {
178    let mut json = String::new();
179    json.push_str("{\n  \"pages\": [\n");
180
181    for (i, page) in pages.iter().enumerate() {
182        json.push_str("    ");
183        // Indent the page JSON
184        let page_json = format_json(page);
185        for (li, line) in page_json.lines().enumerate() {
186            if li > 0 {
187                json.push_str("\n    ");
188            }
189            json.push_str(line);
190        }
191        if i + 1 < pages.len() {
192            json.push(',');
193        }
194        json.push('\n');
195    }
196
197    json.push_str("  ]\n}");
198    json
199}
200
/// Encode `s` as a JSON string literal, including the surrounding quotes.
///
/// `"` and `\` are backslash-escaped; newline, carriage return and tab use
/// their short escapes (`\n`, `\r`, `\t`); any other character below U+0020
/// is written as a lowercase `\u00xx` escape. Everything else is copied
/// verbatim.
fn json_string(s: &str) -> String {
    let mut quoted = String::with_capacity(s.len() + 2);
    quoted.push('"');

    for ch in s.chars() {
        // Characters that have a dedicated two-character escape.
        let short_escape = match ch {
            '"' => Some("\\\""),
            '\\' => Some("\\\\"),
            '\n' => Some("\\n"),
            '\r' => Some("\\r"),
            '\t' => Some("\\t"),
            _ => None,
        };

        match short_escape {
            Some(sequence) => quoted.push_str(sequence),
            // Remaining control characters must not appear raw in JSON strings.
            None if u32::from(ch) < 0x20 => {
                quoted.push_str(&format!("\\u{:04x}", u32::from(ch)));
            }
            None => quoted.push(ch),
        }
    }

    quoted.push('"');
    quoted
}
220
// ---------------------------------------------------------------------------
// Markdown
// ---------------------------------------------------------------------------
224
225fn format_markdown(page: &PageText) -> String {
226    let mut md = String::new();
227
228    for (i, block) in page.blocks.iter().enumerate() {
229        if i > 0 {
230            md.push_str("\n\n");
231        }
232        md.push_str(&block.text);
233    }
234
235    md
236}
237
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
241
#[cfg(test)]
mod tests {
    use super::*;
    use crate::text::{TextBlock, TextChar, TextLine, TextWord};

    /// Fixture: page 0 holding the single line "Hi there" (words "Hi" and
    /// "there"), present both in `lines` and inside one block, plus the two
    /// characters "H" and "i".
    fn make_test_page() -> PageText {
        // All fixture geometry shares y = 720.0 and font size 12.0.
        let glyph = |unicode: &'static str, x, width| TextChar {
            unicode: unicode.into(),
            x,
            y: 720.0,
            font_size: 12.0,
            font_name: "F1".into(),
            width,
        };
        let word = |text: &'static str, x, width| TextWord {
            text: text.into(),
            x,
            y: 720.0,
            width,
            font_size: 12.0,
        };
        let greeting_line = || TextLine {
            text: "Hi there".into(),
            words: vec![word("Hi", 72.0, 10.0), word("there", 90.0, 28.0)],
            x: 72.0,
            y: 720.0,
        };

        PageText {
            page_index: 0,
            chars: vec![glyph("H", 72.0, 7.0), glyph("i", 79.0, 3.0)],
            lines: vec![greeting_line()],
            blocks: vec![TextBlock {
                text: "Hi there".into(),
                lines: vec![greeting_line()],
            }],
        }
    }

    /// Fixture: page 0 with no characters, lines or blocks.
    fn blank_page() -> PageText {
        PageText {
            page_index: 0,
            chars: Vec::new(),
            lines: Vec::new(),
            blocks: Vec::new(),
        }
    }

    #[test]
    fn test_plain_text() {
        let rendered = format_page(&make_test_page(), OutputFormat::PlainText);
        assert_eq!(rendered, "Hi there");
    }

    #[test]
    fn test_html_output() {
        let rendered = format_page(&make_test_page(), OutputFormat::Html);
        for fragment in ["<div class=\"page\">", "<p>Hi there</p>", "</div>"] {
            assert!(rendered.contains(fragment));
        }
    }

    #[test]
    fn test_html_escaping() {
        let raw = "a < b & c > d";
        let page = PageText {
            blocks: vec![TextBlock {
                text: raw.into(),
                lines: vec![TextLine {
                    text: raw.into(),
                    words: Vec::new(),
                    x: 0.0,
                    y: 0.0,
                }],
            }],
            ..blank_page()
        };
        let rendered = format_page(&page, OutputFormat::Html);
        assert!(rendered.contains("a &lt; b &amp; c &gt; d"));
    }

    #[test]
    fn test_json_output() {
        let rendered = format_page(&make_test_page(), OutputFormat::Json);
        for fragment in [
            "\"page_index\": 0",
            "\"text\": \"Hi there\"",
            "\"blocks\"",
            "\"words\"",
        ] {
            assert!(rendered.contains(fragment));
        }
    }

    #[test]
    fn test_json_string_escaping() {
        let cases = [
            ("hello", "\"hello\""),
            ("a\"b", "\"a\\\"b\""),
            ("a\\b", "\"a\\\\b\""),
            ("a\nb", "\"a\\nb\""),
        ];
        for (input, expected) in cases {
            assert_eq!(json_string(input), expected);
        }
    }

    #[test]
    fn test_markdown_output() {
        let rendered = format_page(&make_test_page(), OutputFormat::Markdown);
        assert_eq!(rendered, "Hi there");
    }

    #[test]
    fn test_multi_page_html() {
        let pages = vec![make_test_page(), make_test_page()];
        let rendered = format_pages(&pages, OutputFormat::Html);
        for fragment in ["<!DOCTYPE html>", "Page 1", "Page 2"] {
            assert!(rendered.contains(fragment));
        }
    }

    #[test]
    fn test_multi_page_json() {
        let pages = vec![make_test_page()];
        let rendered = format_pages(&pages, OutputFormat::Json);
        assert!(rendered.contains("\"pages\""));
        assert!(rendered.contains("\"page_index\": 0"));
    }

    #[test]
    fn test_multi_page_markdown() {
        let pages = vec![make_test_page(), make_test_page()];
        let rendered = format_pages(&pages, OutputFormat::Markdown);
        for fragment in ["## Page 1", "## Page 2", "---"] {
            assert!(rendered.contains(fragment));
        }
    }

    #[test]
    fn test_empty_page() {
        let page = blank_page();
        assert_eq!(format_page(&page, OutputFormat::PlainText), "");
        assert!(format_page(&page, OutputFormat::Html).contains("<div class=\"page\">"));
        assert!(format_page(&page, OutputFormat::Json).contains("\"blocks\": []"));
    }
}