Skip to main content

// invoice_parser/extractors/xlsx.rs

1use crate::error::{InvoiceParserError, Result};
2use calamine::{open_workbook, Data, Range, Reader, Xlsx};
3use std::fs::File;
4use std::io::BufReader;
5use std::path::Path;
6
/// Stateless entry point for reading worksheet contents out of Excel workbooks.
///
/// The type carries no data; all functionality is exposed through associated
/// functions.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct XlsxExtractor;
8
9impl XlsxExtractor {
10    pub fn extract_sheets<P: AsRef<Path>>(path: P) -> Result<Vec<SheetData>> {
11        let path = path.as_ref();
12
13        if !path.exists() {
14            return Err(InvoiceParserError::FileReadError(std::io::Error::new(
15                std::io::ErrorKind::NotFound,
16                format!("File not found: {}", path.display()),
17            )));
18        }
19
20        let extension = path
21            .extension()
22            .and_then(|e| e.to_str())
23            .map(|e| e.to_lowercase());
24
25        match extension.as_deref() {
26            Some("xlsx") | Some("xls") | Some("xlsm") => {}
27            _ => {
28                return Err(InvoiceParserError::InvalidFileFormat {
29                    expected: "xlsx/xls/xlsm".to_string(),
30                    actual: extension.unwrap_or_else(|| "unknown".to_string()),
31                });
32            }
33        }
34
35        let mut workbook: Xlsx<BufReader<File>> =
36            open_workbook(path).map_err(|e: calamine::XlsxError| {
37                InvoiceParserError::XlsxParsingError(e.to_string())
38            })?;
39
40        let sheet_names: Vec<String> = workbook
41            .sheet_names()
42            .iter()
43            .map(|s| s.to_string())
44            .collect();
45        let mut sheets = Vec::new();
46
47        for name in sheet_names {
48            if let Ok(range) = workbook.worksheet_range(&name) {
49                sheets.push(SheetData::from_range(&name, &range));
50            }
51        }
52
53        if sheets.is_empty() {
54            return Err(InvoiceParserError::EmptyDocument);
55        }
56
57        Ok(sheets)
58    }
59}
60
/// The textual content of a single cell together with its position.
///
/// NOTE(review): nothing in this file constructs a `CellValue`, so whether
/// `row`/`col` are 0- or 1-based is not established here — confirm with callers.
#[derive(Debug, Clone, Default, PartialEq, Eq, Hash)]
pub struct CellValue {
    /// Row index of the cell.
    pub row: u32,
    /// Column index of the cell.
    pub col: u32,
    /// Cell content rendered as text.
    pub value: String,
}
67
/// The contents of one worksheet, with every cell rendered as a string.
///
/// All fields are public, so `row_count` and `col_count` are only guaranteed
/// to match `rows` at construction time.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct SheetData {
    /// Name of the worksheet.
    pub name: String,
    /// Cell text in row-major order: `rows[row][col]`.
    pub rows: Vec<Vec<String>>,
    /// Number of rows captured in `rows` when the sheet was built.
    pub row_count: usize,
    /// Width of the first row when the sheet was built (`0` for an empty sheet).
    pub col_count: usize,
}
75
76impl SheetData {
77    fn from_range(name: &str, range: &Range<Data>) -> Self {
78        let rows: Vec<Vec<String>> = range
79            .rows()
80            .map(|row| row.iter().map(Self::cell_to_string).collect())
81            .collect();
82
83        let row_count = rows.len();
84        let col_count = rows.first().map(|r| r.len()).unwrap_or(0);
85
86        Self {
87            name: name.to_string(),
88            rows,
89            row_count,
90            col_count,
91        }
92    }
93
94    fn cell_to_string(cell: &Data) -> String {
95        match cell {
96            Data::Empty => String::new(),
97            Data::String(s) => s.clone(),
98            Data::Float(f) => {
99                if f.fract() == 0.0 {
100                    format!("{:.0}", f)
101                } else {
102                    f.to_string()
103                }
104            }
105            Data::Int(i) => i.to_string(),
106            Data::Bool(b) => b.to_string(),
107            Data::Error(e) => format!("ERROR: {:?}", e),
108            Data::DateTime(dt) => format!("{}", dt),
109            Data::DateTimeIso(s) => s.clone(),
110            Data::DurationIso(s) => s.clone(),
111        }
112    }
113
114    pub fn get_cell(&self, row: usize, col: usize) -> Option<&str> {
115        self.rows
116            .get(row)
117            .and_then(|r| r.get(col))
118            .map(|s| s.as_str())
119    }
120
121    pub fn find_cell_containing(&self, pattern: &str) -> Option<(usize, usize)> {
122        for (row_idx, row) in self.rows.iter().enumerate() {
123            for (col_idx, cell) in row.iter().enumerate() {
124                if cell.to_lowercase().contains(&pattern.to_lowercase()) {
125                    return Some((row_idx, col_idx));
126                }
127            }
128        }
129        None
130    }
131
132    pub fn get_column(&self, col: usize) -> Vec<&str> {
133        self.rows
134            .iter()
135            .filter_map(|row| row.get(col).map(|s| s.as_str()))
136            .collect()
137    }
138
139    pub fn get_row(&self, row: usize) -> Option<&Vec<String>> {
140        self.rows.get(row)
141    }
142
143    pub fn to_text(&self) -> String {
144        self.rows
145            .iter()
146            .map(|row| row.join("\t"))
147            .collect::<Vec<_>>()
148            .join("\n")
149    }
150}