Skip to main content

edgeparse_core/output/
csv.rs

1//! CSV output generator.
2//!
3//! Extracts table data from a PdfDocument and renders each table
4//! as comma-separated values. Non-table content is output as single-column rows.
5
6use crate::models::content::ContentElement;
7use crate::models::document::PdfDocument;
8use crate::models::table::TableBorderCell;
9use crate::EdgePdfError;
10
11/// Generate CSV representation of a PdfDocument.
12///
13/// Each table becomes a block of CSV rows. Non-table text appears
14/// as single-column entries separated by blank lines between tables.
15///
16/// # Errors
17/// Returns `EdgePdfError::OutputError` on failures.
18pub fn to_csv(doc: &PdfDocument) -> Result<String, EdgePdfError> {
19    let mut output = String::new();
20
21    if doc.kids.is_empty() {
22        return Ok(output);
23    }
24
25    let mut first = true;
26    for element in &doc.kids {
27        match element {
28            ContentElement::TableBorder(table) => {
29                if !first {
30                    output.push('\n');
31                }
32                for row in &table.rows {
33                    let cells: Vec<String> = row
34                        .cells
35                        .iter()
36                        .map(|cell| csv_escape(&cell_text(cell)))
37                        .collect();
38                    output.push_str(&cells.join(","));
39                    output.push('\n');
40                }
41                first = false;
42            }
43            ContentElement::Paragraph(p) => {
44                let text = p.base.value();
45                let trimmed = text.trim();
46                if !trimmed.is_empty() {
47                    output.push_str(&csv_escape(trimmed));
48                    output.push('\n');
49                }
50            }
51            ContentElement::Heading(h) => {
52                let text = h.base.base.value();
53                let trimmed = text.trim();
54                if !trimmed.is_empty() {
55                    output.push_str(&csv_escape(trimmed));
56                    output.push('\n');
57                }
58            }
59            _ => {}
60        }
61    }
62
63    Ok(output)
64}
65
66/// Extract plain text from a table cell's content tokens.
67fn cell_text(cell: &TableBorderCell) -> String {
68    cell.content
69        .iter()
70        .map(|token| token.base.value.as_str())
71        .collect::<Vec<_>>()
72        .join(" ")
73        .trim()
74        .to_string()
75}
76
/// Render `value` as a single CSV field.
///
/// A value containing a comma, a double quote, a line feed, or a carriage
/// return is wrapped in double quotes, with every embedded double quote
/// doubled. Any other value is returned as an unmodified copy.
fn csv_escape(value: &str) -> String {
    let needs_quoting = value
        .chars()
        .any(|ch| matches!(ch, ',' | '"' | '\n' | '\r'));
    if !needs_quoting {
        return value.to_owned();
    }

    // Two extra bytes for the surrounding quotes; doubled quotes may grow it further.
    let mut quoted = String::with_capacity(value.len() + 2);
    quoted.push('"');
    for ch in value.chars() {
        if ch == '"' {
            // An embedded quote is written twice.
            quoted.push('"');
        }
        quoted.push(ch);
    }
    quoted.push('"');
    quoted
}
85
#[cfg(test)]
mod tests {
    use super::*;

    /// A value without special characters is returned unchanged.
    #[test]
    fn test_csv_escape_plain() {
        let escaped = csv_escape("hello");
        assert_eq!(escaped, "hello");
    }

    /// A comma forces the value to be wrapped in double quotes.
    #[test]
    fn test_csv_escape_comma() {
        let escaped = csv_escape("a,b");
        assert_eq!(escaped, "\"a,b\"");
    }

    /// Embedded double quotes are doubled and the value is wrapped.
    #[test]
    fn test_csv_escape_quotes() {
        let escaped = csv_escape("say \"hi\"");
        assert_eq!(escaped, "\"say \"\"hi\"\"\"");
    }

    /// A line feed is kept verbatim inside the quoted value.
    #[test]
    fn test_csv_escape_newline() {
        let escaped = csv_escape("line1\nline2");
        assert_eq!(escaped, "\"line1\nline2\"");
    }

    /// A freshly constructed document renders to an empty string.
    #[test]
    fn test_empty_doc() {
        let empty = PdfDocument::new(String::from("test.pdf"));
        let rendered = to_csv(&empty).unwrap();
        assert!(rendered.is_empty());
    }
}
116}