1use super::PageText;
4
/// Output formats supported by [`format_page`] and [`format_pages`].
///
/// `Hash` is derived alongside `Eq` so the format can be used directly as a
/// key in `HashMap`/`HashSet` (e.g. per-format caches or option tables).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum OutputFormat {
    /// Plain text as produced by `PageText::plain_text`.
    PlainText,
    /// HTML: one `<div class="page">` per page, one `<p>` per block.
    Html,
    /// JSON with the block / line / word hierarchy and word geometry.
    Json,
    /// Markdown: block texts separated by blank lines.
    Markdown,
}
13
14pub fn format_page(page: &PageText, format: OutputFormat) -> String {
16 match format {
17 OutputFormat::PlainText => format_plain(page),
18 OutputFormat::Html => format_html(page),
19 OutputFormat::Json => format_json(page),
20 OutputFormat::Markdown => format_markdown(page),
21 }
22}
23
24pub fn format_pages(pages: &[PageText], format: OutputFormat) -> String {
26 match format {
27 OutputFormat::PlainText => {
28 pages
29 .iter()
30 .map(format_plain)
31 .collect::<Vec<_>>()
32 .join("\n\n")
33 }
34 OutputFormat::Html => format_html_multi(pages),
35 OutputFormat::Json => format_json_multi(pages),
36 OutputFormat::Markdown => {
37 pages
38 .iter()
39 .enumerate()
40 .map(|(i, p)| {
41 let mut s = format!("## Page {}\n\n", i + 1);
42 s.push_str(&format_markdown(p));
43 s
44 })
45 .collect::<Vec<_>>()
46 .join("\n\n---\n\n")
47 }
48 }
49}
50
51fn format_plain(page: &PageText) -> String {
56 page.plain_text()
57}
58
59fn format_html(page: &PageText) -> String {
64 let mut html = String::new();
65 html.push_str("<div class=\"page\">\n");
66
67 for block in &page.blocks {
68 html.push_str(" <p>");
69 for (i, line) in block.lines.iter().enumerate() {
70 if i > 0 {
71 html.push_str("<br/>\n ");
72 }
73 html.push_str(&html_escape(&line.text));
74 }
75 html.push_str("</p>\n");
76 }
77
78 html.push_str("</div>");
79 html
80}
81
82fn format_html_multi(pages: &[PageText]) -> String {
83 let mut html = String::new();
84 html.push_str("<!DOCTYPE html>\n<html>\n<head>\n");
85 html.push_str(" <meta charset=\"utf-8\">\n");
86 html.push_str(" <title>Extracted Text</title>\n");
87 html.push_str(" <style>\n");
88 html.push_str(" .page { margin-bottom: 2em; padding-bottom: 1em; border-bottom: 1px solid #ccc; }\n");
89 html.push_str(" p { margin: 0.5em 0; }\n");
90 html.push_str(" </style>\n");
91 html.push_str("</head>\n<body>\n");
92
93 for (i, page) in pages.iter().enumerate() {
94 html.push_str(&format!("<h2>Page {}</h2>\n", i + 1));
95 html.push_str(&format_html(page));
96 html.push('\n');
97 }
98
99 html.push_str("</body>\n</html>");
100 html
101}
102
/// Escapes the characters that are significant in HTML markup.
///
/// `&`, `<`, `>` and `"` are replaced by `&amp;`, `&lt;`, `&gt;` and `&quot;`;
/// every other character is copied through unchanged. The input is processed
/// in a single pass, so an ampersand produced by one replacement is never
/// escaped a second time.
fn html_escape(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for c in s.chars() {
        match c {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            _ => escaped.push(c),
        }
    }
    escaped
}
109
110fn format_json(page: &PageText) -> String {
115 let mut json = String::new();
116 json.push_str("{\n");
117 json.push_str(&format!(" \"page_index\": {},\n", page.page_index));
118
119 if page.blocks.is_empty() {
121 json.push_str(" \"blocks\": [],\n");
122 } else {
123 json.push_str(" \"blocks\": [\n");
124 for (bi, block) in page.blocks.iter().enumerate() {
125 json.push_str(" {\n");
126 json.push_str(&format!(" \"text\": {},\n", json_string(&block.text)));
127
128 json.push_str(" \"lines\": [\n");
130 for (li, line) in block.lines.iter().enumerate() {
131 json.push_str(" {\n");
132 json.push_str(&format!(" \"text\": {},\n", json_string(&line.text)));
133 json.push_str(&format!(" \"x\": {:.2},\n", line.x));
134 json.push_str(&format!(" \"y\": {:.2},\n", line.y));
135
136 json.push_str(" \"words\": [\n");
138 for (wi, word) in line.words.iter().enumerate() {
139 json.push_str(" {\n");
140 json.push_str(&format!(" \"text\": {},\n", json_string(&word.text)));
141 json.push_str(&format!(" \"x\": {:.2},\n", word.x));
142 json.push_str(&format!(" \"y\": {:.2},\n", word.y));
143 json.push_str(&format!(" \"width\": {:.2},\n", word.width));
144 json.push_str(&format!(" \"font_size\": {:.2}\n", word.font_size));
145 json.push_str(" }");
146 if wi + 1 < line.words.len() {
147 json.push(',');
148 }
149 json.push('\n');
150 }
151 json.push_str(" ]\n");
152
153 json.push_str(" }");
154 if li + 1 < block.lines.len() {
155 json.push(',');
156 }
157 json.push('\n');
158 }
159 json.push_str(" ]\n");
160
161 json.push_str(" }");
162 if bi + 1 < page.blocks.len() {
163 json.push(',');
164 }
165 json.push('\n');
166 }
167 json.push_str(" ],\n");
168 }
169
170 json.push_str(&format!(" \"char_count\": {}\n", page.chars.len()));
172
173 json.push('}');
174 json
175}
176
177fn format_json_multi(pages: &[PageText]) -> String {
178 let mut json = String::new();
179 json.push_str("{\n \"pages\": [\n");
180
181 for (i, page) in pages.iter().enumerate() {
182 json.push_str(" ");
183 let page_json = format_json(page);
185 for (li, line) in page_json.lines().enumerate() {
186 if li > 0 {
187 json.push_str("\n ");
188 }
189 json.push_str(line);
190 }
191 if i + 1 < pages.len() {
192 json.push(',');
193 }
194 json.push('\n');
195 }
196
197 json.push_str(" ]\n}");
198 json
199}
200
/// Encodes `s` as a double-quoted JSON string literal.
///
/// Quotes, backslashes, newline, carriage return and tab use their short
/// escapes; any other character below U+0020 is written as `\uXXXX`.
/// Everything else is copied verbatim.
fn json_string(s: &str) -> String {
    let mut quoted = String::with_capacity(s.len() + 2);
    quoted.push('"');

    for ch in s.chars() {
        // Characters that have a dedicated two-character escape.
        let short_escape = match ch {
            '"' => Some("\\\""),
            '\\' => Some("\\\\"),
            '\n' => Some("\\n"),
            '\r' => Some("\\r"),
            '\t' => Some("\\t"),
            _ => None,
        };

        if let Some(escape) = short_escape {
            quoted.push_str(escape);
        } else if ch < ' ' {
            // Remaining control characters need the generic \u form.
            quoted.push_str(&format!("\\u{:04x}", u32::from(ch)));
        } else {
            quoted.push(ch);
        }
    }

    quoted.push('"');
    quoted
}
220
221fn format_markdown(page: &PageText) -> String {
226 let mut md = String::new();
227
228 for (i, block) in page.blocks.iter().enumerate() {
229 if i > 0 {
230 md.push_str("\n\n");
231 }
232 md.push_str(&block.text);
233 }
234
235 md
236}
237
#[cfg(test)]
mod tests {
    //! Unit tests for the single-page and multi-page formatters.

    use super::*;
    use crate::text::{TextBlock, TextChar, TextLine, TextWord};

    /// Builds the "Hi there" line used both in the fixture's flat `lines`
    /// list and inside its only block.
    fn hi_there_line() -> TextLine {
        TextLine {
            text: "Hi there".into(),
            words: vec![
                TextWord {
                    text: "Hi".into(),
                    x: 72.0,
                    y: 720.0,
                    width: 10.0,
                    font_size: 12.0,
                },
                TextWord {
                    text: "there".into(),
                    x: 90.0,
                    y: 720.0,
                    width: 28.0,
                    font_size: 12.0,
                },
            ],
            x: 72.0,
            y: 720.0,
        }
    }

    /// A minimal page: two characters, one line, and one block holding that line.
    fn make_test_page() -> PageText {
        PageText {
            page_index: 0,
            chars: vec![
                TextChar {
                    unicode: "H".into(),
                    x: 72.0,
                    y: 720.0,
                    font_size: 12.0,
                    font_name: "F1".into(),
                    width: 7.0,
                },
                TextChar {
                    unicode: "i".into(),
                    x: 79.0,
                    y: 720.0,
                    font_size: 12.0,
                    font_name: "F1".into(),
                    width: 3.0,
                },
            ],
            lines: vec![hi_there_line()],
            blocks: vec![TextBlock {
                text: "Hi there".into(),
                lines: vec![hi_there_line()],
            }],
        }
    }

    #[test]
    fn test_plain_text() {
        let page = make_test_page();
        let text = format_page(&page, OutputFormat::PlainText);
        assert_eq!(text, "Hi there");
    }

    #[test]
    fn test_html_output() {
        let page = make_test_page();
        let html = format_page(&page, OutputFormat::Html);
        assert!(html.contains("<div class=\"page\">"));
        assert!(html.contains("<p>Hi there</p>"));
        assert!(html.contains("</div>"));
    }

    #[test]
    fn test_html_escaping() {
        let page = PageText {
            page_index: 0,
            chars: Vec::new(),
            lines: Vec::new(),
            blocks: vec![TextBlock {
                text: "a < b & c > d".into(),
                lines: vec![TextLine {
                    text: "a < b & c > d".into(),
                    words: Vec::new(),
                    x: 0.0,
                    y: 0.0,
                }],
            }],
        };
        let html = format_page(&page, OutputFormat::Html);
        // The markup-significant characters must come out as entities and the
        // raw text must not leak through.
        assert!(html.contains("a &lt; b &amp; c &gt; d"));
        assert!(!html.contains("a < b & c > d"));
    }

    #[test]
    fn test_json_output() {
        let page = make_test_page();
        let json = format_page(&page, OutputFormat::Json);
        assert!(json.contains("\"page_index\": 0"));
        assert!(json.contains("\"text\": \"Hi there\""));
        assert!(json.contains("\"blocks\""));
        assert!(json.contains("\"words\""));
    }

    #[test]
    fn test_json_string_escaping() {
        assert_eq!(json_string("hello"), "\"hello\"");
        assert_eq!(json_string("a\"b"), "\"a\\\"b\"");
        assert_eq!(json_string("a\\b"), "\"a\\\\b\"");
        assert_eq!(json_string("a\nb"), "\"a\\nb\"");
        // Control characters without a short escape use the \u form.
        assert_eq!(json_string("a\tb"), "\"a\\tb\"");
        assert_eq!(json_string("a\u{1}b"), "\"a\\u0001b\"");
    }

    #[test]
    fn test_markdown_output() {
        let page = make_test_page();
        let md = format_page(&page, OutputFormat::Markdown);
        assert_eq!(md, "Hi there");
    }

    #[test]
    fn test_multi_page_html() {
        let pages = vec![make_test_page(), make_test_page()];
        let html = format_pages(&pages, OutputFormat::Html);
        assert!(html.contains("<!DOCTYPE html>"));
        assert!(html.contains("Page 1"));
        assert!(html.contains("Page 2"));
    }

    #[test]
    fn test_multi_page_json() {
        let pages = vec![make_test_page()];
        let json = format_pages(&pages, OutputFormat::Json);
        assert!(json.contains("\"pages\""));
        assert!(json.contains("\"page_index\": 0"));
    }

    #[test]
    fn test_multi_page_markdown() {
        let pages = vec![make_test_page(), make_test_page()];
        let md = format_pages(&pages, OutputFormat::Markdown);
        assert!(md.contains("## Page 1"));
        assert!(md.contains("## Page 2"));
        assert!(md.contains("---"));
    }

    #[test]
    fn test_empty_page() {
        let page = PageText {
            page_index: 0,
            chars: Vec::new(),
            lines: Vec::new(),
            blocks: Vec::new(),
        };
        assert_eq!(format_page(&page, OutputFormat::PlainText), "");
        assert!(format_page(&page, OutputFormat::Html).contains("<div class=\"page\">"));
        assert!(format_page(&page, OutputFormat::Json).contains("\"blocks\": []"));
    }
}