edgeparse_core/output/
text.rs1use crate::models::content::ContentElement;
4use crate::models::document::PdfDocument;
5use crate::models::table::TableTokenRow;
6use crate::EdgePdfError;
7
8pub fn to_text(doc: &PdfDocument) -> Result<String, EdgePdfError> {
13 let mut output = String::new();
14
15 if doc.kids.is_empty() {
16 output.push_str("[No content extracted]\n");
17 return Ok(output);
18 }
19
20 for element in &doc.kids {
21 render_element(&mut output, element);
22 }
23
24 Ok(output)
25}
26
27fn token_rows_text(rows: &[TableTokenRow]) -> String {
29 rows.iter()
30 .flat_map(|row| row.iter())
31 .map(|token| token.base.value.as_str())
32 .collect::<Vec<_>>()
33 .join(" ")
34}
35
36fn render_element(out: &mut String, element: &ContentElement) {
37 match element {
38 ContentElement::Heading(h) => {
39 let text = h.base.base.value();
40 let trimmed = text.trim();
41 if !trimmed.is_empty() {
42 out.push_str(trimmed);
43 out.push_str("\n\n");
44 }
45 }
46 ContentElement::Paragraph(p) => {
47 let text = p.base.value();
48 let trimmed = clean_text(&text);
49 if !trimmed.is_empty() {
50 out.push_str(&trimmed);
51 out.push_str("\n\n");
52 }
53 }
54 ContentElement::List(list) => {
55 for item in &list.list_items {
56 let label = token_rows_text(&item.label.content);
57 let body = token_rows_text(&item.body.content);
58 let label_trimmed = label.trim();
59 let body_trimmed = body.trim();
60 if !label_trimmed.is_empty() || !body_trimmed.is_empty() {
61 if !label_trimmed.is_empty() && !body_trimmed.is_empty() {
62 out.push_str(&format!(" {} {}\n", label_trimmed, body_trimmed));
63 } else if !body_trimmed.is_empty() {
64 out.push_str(&format!(" {}\n", body_trimmed));
65 } else {
66 out.push_str(&format!(" {}\n", label_trimmed));
67 }
68 }
69 }
70 out.push('\n');
71 }
72 ContentElement::Image(_) => {
73 out.push_str("[Image]\n\n");
74 }
75 ContentElement::HeaderFooter(_) => {
76 }
78 ContentElement::TextBlock(tb) => {
79 let text = tb.value();
80 let trimmed = clean_text(&text);
81 if !trimmed.is_empty() {
82 out.push_str(&trimmed);
83 out.push_str("\n\n");
84 }
85 }
86 ContentElement::TextLine(tl) => {
87 let text = tl.value();
88 let trimmed = text.trim();
89 if !trimmed.is_empty() {
90 out.push_str(trimmed);
91 out.push('\n');
92 }
93 }
94 ContentElement::TextChunk(tc) => {
95 out.push_str(&tc.value);
96 }
97 _ => {}
98 }
99}
100
/// Normalizes horizontal whitespace in `text`.
///
/// Leading and trailing whitespace is trimmed, and every run of spaces and/or
/// tabs inside the text is collapsed to a single space. Other characters,
/// including newlines, are copied through unchanged.
fn clean_text(text: &str) -> String {
    let core = text.trim();
    let mut cleaned = String::with_capacity(core.len());
    for ch in core.chars() {
        match ch {
            ' ' | '\t' => {
                // A space is only ever pushed for a space/tab, so a trailing
                // space means the previous character was already collapsed.
                if !cleaned.ends_with(' ') {
                    cleaned.push(' ');
                }
            }
            other => cleaned.push(other),
        }
    }
    cleaned
}
122
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly created document (no content added) renders the placeholder
    /// text rather than an empty string.
    #[test]
    fn test_empty_doc() {
        let empty = PdfDocument::new(String::from("test.pdf"));
        let rendered = to_text(&empty).unwrap();
        assert!(rendered.contains("No content extracted"));
    }
}