use crate::error::{InvoiceParserError, Result};
use calamine::{open_workbook, Data, Range, Reader, Xlsx};
use std::fs::File;
use std::io::BufReader;
use std::path::Path;
/// Unit struct that namespaces spreadsheet extraction; its associated function
/// `extract_sheets` loads the sheets of a workbook file into `SheetData` values.
pub struct XlsxExtractor;
impl XlsxExtractor {
pub fn extract_sheets<P: AsRef<Path>>(path: P) -> Result<Vec<SheetData>> {
let path = path.as_ref();
if !path.exists() {
return Err(InvoiceParserError::FileReadError(std::io::Error::new(
std::io::ErrorKind::NotFound,
format!("File not found: {}", path.display()),
)));
}
let extension = path
.extension()
.and_then(|e| e.to_str())
.map(|e| e.to_lowercase());
match extension.as_deref() {
Some("xlsx") | Some("xls") | Some("xlsm") => {}
_ => {
return Err(InvoiceParserError::InvalidFileFormat {
expected: "xlsx/xls/xlsm".to_string(),
actual: extension.unwrap_or_else(|| "unknown".to_string()),
});
}
}
let mut workbook: Xlsx<BufReader<File>> =
open_workbook(path).map_err(|e: calamine::XlsxError| {
InvoiceParserError::XlsxParsingError(e.to_string())
})?;
let sheet_names: Vec<String> = workbook
.sheet_names()
.iter()
.map(|s| s.to_string())
.collect();
let mut sheets = Vec::new();
for name in sheet_names {
if let Ok(range) = workbook.worksheet_range(&name) {
sheets.push(SheetData::from_range(&name, &range));
}
}
if sheets.is_empty() {
return Err(InvoiceParserError::EmptyDocument);
}
Ok(sheets)
}
}
/// Text of a single cell together with its row/column position.
///
/// NOTE(review): nothing in the visible part of this file constructs or reads
/// this type, so whether `row`/`col` are zero-based is not established here —
/// confirm against callers.
#[derive(Debug, Clone)]
pub struct CellValue {
    /// Row position of the cell.
    pub row: u32,
    /// Column position of the cell.
    pub col: u32,
    /// Cell content as text.
    pub value: String,
}
/// Contents of one worksheet with every cell rendered as a string.
#[derive(Debug, Clone)]
pub struct SheetData {
    /// Sheet name as passed to `from_range`.
    pub name: String,
    /// Cell texts in row-major order: `rows[row][col]`.
    pub rows: Vec<Vec<String>>,
    /// Number of rows; `from_range` sets this to `rows.len()`.
    pub row_count: usize,
    /// Number of columns; `from_range` sets this to the length of the first
    /// row, or 0 when there are no rows.
    pub col_count: usize,
}
impl SheetData {
    /// Builds a `SheetData` called `name` from a calamine cell range,
    /// converting every cell to its string form with `cell_to_string`.
    fn from_range(name: &str, range: &Range<Data>) -> Self {
        let rows: Vec<Vec<String>> = range
            .rows()
            .map(|row| row.iter().map(Self::cell_to_string).collect())
            .collect();
        let row_count = rows.len();
        // The column count is taken from the first row; 0 when the range has no rows.
        let col_count = rows.first().map_or(0, Vec::len);
        Self {
            name: name.to_string(),
            rows,
            row_count,
            col_count,
        }
    }

    /// Renders a single cell as text.
    ///
    /// * Empty cells become an empty string.
    /// * Floats with a zero fractional part are printed without decimals.
    /// * Error cells are rendered as `ERROR: ` followed by the error's `Debug` form.
    /// * Date/time cells use their `Display` form; ISO date-time and duration
    ///   strings are copied unchanged.
    fn cell_to_string(cell: &Data) -> String {
        match cell {
            Data::Empty => String::new(),
            Data::String(s) => s.clone(),
            Data::Float(f) => {
                if f.fract() == 0.0 {
                    format!("{:.0}", f)
                } else {
                    f.to_string()
                }
            }
            Data::Int(i) => i.to_string(),
            Data::Bool(b) => b.to_string(),
            Data::Error(e) => format!("ERROR: {:?}", e),
            Data::DateTime(dt) => dt.to_string(),
            Data::DateTimeIso(s) => s.clone(),
            Data::DurationIso(s) => s.clone(),
        }
    }

    /// Returns the text of the cell at (`row`, `col`), or `None` when either
    /// index is out of bounds.
    pub fn get_cell(&self, row: usize, col: usize) -> Option<&str> {
        self.rows
            .get(row)
            .and_then(|r| r.get(col))
            .map(String::as_str)
    }

    /// Finds the first cell, scanning row by row and left to right, whose
    /// text contains `pattern`, compared after lower-casing both sides.
    ///
    /// Returns the `(row, column)` indices of the match, or `None` if no
    /// cell matches.
    pub fn find_cell_containing(&self, pattern: &str) -> Option<(usize, usize)> {
        // Lower-case the pattern once instead of once per visited cell.
        let needle = pattern.to_lowercase();
        self.rows.iter().enumerate().find_map(|(row_idx, row)| {
            row.iter()
                .position(|cell| cell.to_lowercase().contains(needle.as_str()))
                .map(|col_idx| (row_idx, col_idx))
        })
    }

    /// Collects the text at column `col` from every row that has such a
    /// column; rows shorter than `col + 1` are skipped.
    pub fn get_column(&self, col: usize) -> Vec<&str> {
        self.rows
            .iter()
            .filter_map(|row| row.get(col).map(String::as_str))
            .collect()
    }

    /// Returns the row at index `row`, or `None` when out of bounds.
    pub fn get_row(&self, row: usize) -> Option<&Vec<String>> {
        self.rows.get(row)
    }

    /// Renders the sheet as plain text: cells separated by tabs, rows
    /// separated by newlines.
    pub fn to_text(&self) -> String {
        self.rows
            .iter()
            .map(|row| row.join("\t"))
            .collect::<Vec<_>>()
            .join("\n")
    }
}