invoice_parser/extractors/
xlsx.rs1use crate::error::{InvoiceParserError, Result};
2use calamine::{open_workbook, Data, Range, Reader, Xlsx};
3use std::fs::File;
4use std::io::BufReader;
5use std::path::Path;
6
7pub struct XlsxExtractor;
8
9impl XlsxExtractor {
10 pub fn extract_sheets<P: AsRef<Path>>(path: P) -> Result<Vec<SheetData>> {
11 let path = path.as_ref();
12
13 if !path.exists() {
14 return Err(InvoiceParserError::FileReadError(std::io::Error::new(
15 std::io::ErrorKind::NotFound,
16 format!("File not found: {}", path.display()),
17 )));
18 }
19
20 let extension = path
21 .extension()
22 .and_then(|e| e.to_str())
23 .map(|e| e.to_lowercase());
24
25 match extension.as_deref() {
26 Some("xlsx") | Some("xls") | Some("xlsm") => {}
27 _ => {
28 return Err(InvoiceParserError::InvalidFileFormat {
29 expected: "xlsx/xls/xlsm".to_string(),
30 actual: extension.unwrap_or_else(|| "unknown".to_string()),
31 });
32 }
33 }
34
35 let mut workbook: Xlsx<BufReader<File>> =
36 open_workbook(path).map_err(|e: calamine::XlsxError| {
37 InvoiceParserError::XlsxParsingError(e.to_string())
38 })?;
39
40 let sheet_names: Vec<String> = workbook
41 .sheet_names()
42 .iter()
43 .map(|s| s.to_string())
44 .collect();
45 let mut sheets = Vec::new();
46
47 for name in sheet_names {
48 if let Ok(range) = workbook.worksheet_range(&name) {
49 sheets.push(SheetData::from_range(&name, &range));
50 }
51 }
52
53 if sheets.is_empty() {
54 return Err(InvoiceParserError::EmptyDocument);
55 }
56
57 Ok(sheets)
58 }
59}
60
/// The text of a single cell together with its row and column numbers.
///
/// NOTE(review): the visible code never constructs this type, so whether
/// `row` and `col` are zero- or one-based cannot be determined here —
/// confirm against the code that uses it.
#[derive(Clone, Debug)]
pub struct CellValue {
    /// Row number of the cell.
    pub row: u32,
    /// Column number of the cell.
    pub col: u32,
    /// Cell content as text.
    pub value: String,
}
67
68#[derive(Debug, Clone)]
69pub struct SheetData {
70 pub name: String,
71 pub rows: Vec<Vec<String>>,
72 pub row_count: usize,
73 pub col_count: usize,
74}
75
76impl SheetData {
77 fn from_range(name: &str, range: &Range<Data>) -> Self {
78 let rows: Vec<Vec<String>> = range
79 .rows()
80 .map(|row| row.iter().map(Self::cell_to_string).collect())
81 .collect();
82
83 let row_count = rows.len();
84 let col_count = rows.first().map(|r| r.len()).unwrap_or(0);
85
86 Self {
87 name: name.to_string(),
88 rows,
89 row_count,
90 col_count,
91 }
92 }
93
94 fn cell_to_string(cell: &Data) -> String {
95 match cell {
96 Data::Empty => String::new(),
97 Data::String(s) => s.clone(),
98 Data::Float(f) => {
99 if f.fract() == 0.0 {
100 format!("{:.0}", f)
101 } else {
102 f.to_string()
103 }
104 }
105 Data::Int(i) => i.to_string(),
106 Data::Bool(b) => b.to_string(),
107 Data::Error(e) => format!("ERROR: {:?}", e),
108 Data::DateTime(dt) => format!("{}", dt),
109 Data::DateTimeIso(s) => s.clone(),
110 Data::DurationIso(s) => s.clone(),
111 }
112 }
113
114 pub fn get_cell(&self, row: usize, col: usize) -> Option<&str> {
115 self.rows
116 .get(row)
117 .and_then(|r| r.get(col))
118 .map(|s| s.as_str())
119 }
120
121 pub fn find_cell_containing(&self, pattern: &str) -> Option<(usize, usize)> {
122 for (row_idx, row) in self.rows.iter().enumerate() {
123 for (col_idx, cell) in row.iter().enumerate() {
124 if cell.to_lowercase().contains(&pattern.to_lowercase()) {
125 return Some((row_idx, col_idx));
126 }
127 }
128 }
129 None
130 }
131
132 pub fn get_column(&self, col: usize) -> Vec<&str> {
133 self.rows
134 .iter()
135 .filter_map(|row| row.get(col).map(|s| s.as_str()))
136 .collect()
137 }
138
139 pub fn get_row(&self, row: usize) -> Option<&Vec<String>> {
140 self.rows.get(row)
141 }
142
143 pub fn to_text(&self) -> String {
144 self.rows
145 .iter()
146 .map(|row| row.join("\t"))
147 .collect::<Vec<_>>()
148 .join("\n")
149 }
150}