br-invoice-parser 0.1.10

A Rust library for parsing invoices and bills from PDF and XLSX files
Documentation
use crate::error::{InvoiceParserError, Result};
use calamine::{open_workbook, Data, Range, Reader, Xlsx};
use std::fs::File;
use std::io::BufReader;
use std::path::Path;

/// Stateless entry point for reading spreadsheet workbooks into [`SheetData`].
///
/// The type carries no fields; its functionality is exposed through associated
/// functions such as [`XlsxExtractor::extract_sheets`].
pub struct XlsxExtractor;

impl XlsxExtractor {
    /// Opens the workbook at `path` and converts every readable sheet into a
    /// [`SheetData`].
    ///
    /// The file extension is compared case-insensitively against `xlsx`, `xls`
    /// and `xlsm` before the workbook is opened. Sheets whose cell range cannot
    /// be loaded are skipped rather than reported.
    ///
    /// NOTE(review): `xls` passes the extension check, but the workbook is
    /// always opened with the `Xlsx` reader — confirm legacy `.xls` files are
    /// actually readable through this path.
    ///
    /// # Errors
    ///
    /// * [`InvoiceParserError::FileReadError`] if `path` does not exist.
    /// * [`InvoiceParserError::InvalidFileFormat`] if the extension is missing
    ///   or is not one of the accepted values.
    /// * [`InvoiceParserError::XlsxParsingError`] if the workbook cannot be opened.
    /// * [`InvoiceParserError::EmptyDocument`] if no sheet could be read.
    pub fn extract_sheets<P: AsRef<Path>>(path: P) -> Result<Vec<SheetData>> {
        let source = path.as_ref();

        if !source.exists() {
            let not_found = std::io::Error::new(
                std::io::ErrorKind::NotFound,
                format!("File not found: {}", source.display()),
            );
            return Err(InvoiceParserError::FileReadError(not_found));
        }

        // Lower-case the extension so that e.g. `.XLSX` is accepted as well.
        let ext = source
            .extension()
            .and_then(|raw| raw.to_str())
            .map(str::to_lowercase);

        if !matches!(ext.as_deref(), Some("xlsx" | "xls" | "xlsm")) {
            return Err(InvoiceParserError::InvalidFileFormat {
                expected: "xlsx/xls/xlsm".to_string(),
                actual: ext.unwrap_or_else(|| "unknown".to_string()),
            });
        }

        let mut workbook: Xlsx<BufReader<File>> = open_workbook(source)
            .map_err(|err: calamine::XlsxError| {
                InvoiceParserError::XlsxParsingError(err.to_string())
            })?;

        // Copy the names out first: reading a range needs `&mut workbook`.
        let names: Vec<String> = workbook
            .sheet_names()
            .iter()
            .map(|n| n.to_string())
            .collect();

        // Best effort: a sheet that fails to load is dropped, not fatal.
        let sheets: Vec<SheetData> = names
            .iter()
            .filter_map(|name| {
                workbook
                    .worksheet_range(name)
                    .ok()
                    .map(|range| SheetData::from_range(name, &range))
            })
            .collect();

        if sheets.is_empty() {
            Err(InvoiceParserError::EmptyDocument)
        } else {
            Ok(sheets)
        }
    }
}

/// The text of a single cell together with its row and column position.
///
/// NOTE(review): nothing in the visible part of this file constructs or reads
/// this type, so whether `row`/`col` are zero- or one-based is not established
/// here — confirm against the callers.
#[derive(Debug, Clone)]
pub struct CellValue {
    /// Row position of the cell.
    pub row: u32,
    /// Column position of the cell.
    pub col: u32,
    /// Cell content as text.
    pub value: String,
}

/// The contents of one worksheet, with every cell rendered as a string.
///
/// The field descriptions below reflect how `SheetData::from_range` fills them
/// in; since the fields are public, values built elsewhere may differ.
#[derive(Debug, Clone)]
pub struct SheetData {
    /// Name of the sheet this data was read from.
    pub name: String,
    /// Cell text in row-major order: `rows[row][col]`.
    pub rows: Vec<Vec<String>>,
    /// Number of entries in `rows`.
    pub row_count: usize,
    /// Length of the first row, or `0` when there are no rows.
    pub col_count: usize,
}

impl SheetData {
    /// Builds a `SheetData` called `name` from a cell `range`.
    ///
    /// Every cell is converted with [`SheetData::cell_to_string`]. `row_count`
    /// is the number of rows produced and `col_count` is the length of the
    /// first row (`0` when the range yields no rows).
    fn from_range(name: &str, range: &Range<Data>) -> Self {
        let rows: Vec<Vec<String>> = range
            .rows()
            .map(|row| row.iter().map(Self::cell_to_string).collect())
            .collect();

        let row_count = rows.len();
        let col_count = rows.first().map_or(0, Vec::len);

        Self {
            name: name.to_string(),
            rows,
            row_count,
            col_count,
        }
    }

    /// Renders a single cell as text.
    ///
    /// * Empty cells become the empty string.
    /// * Floats without a fractional part are printed with zero decimals;
    ///   other floats use their default string form.
    /// * Error cells become `"ERROR: "` followed by the error's `Debug` output.
    /// * Date-time cells use the value's `Display` implementation; ISO
    ///   date-time and duration cells are passed through unchanged.
    fn cell_to_string(cell: &Data) -> String {
        match cell {
            Data::Empty => String::new(),
            Data::String(s) => s.clone(),
            Data::Float(f) => {
                if f.fract() == 0.0 {
                    format!("{:.0}", f)
                } else {
                    f.to_string()
                }
            }
            Data::Int(i) => i.to_string(),
            Data::Bool(b) => b.to_string(),
            Data::Error(e) => format!("ERROR: {:?}", e),
            Data::DateTime(dt) => dt.to_string(),
            Data::DateTimeIso(s) => s.clone(),
            Data::DurationIso(s) => s.clone(),
        }
    }

    /// Returns the text at (`row`, `col`), or `None` if either index is out of
    /// bounds. Both indices are zero-based positions into `rows`.
    pub fn get_cell(&self, row: usize, col: usize) -> Option<&str> {
        self.rows.get(row)?.get(col).map(String::as_str)
    }

    /// Finds the first cell whose text contains `pattern`, compared after
    /// lower-casing both sides.
    ///
    /// Cells are scanned row by row, left to right. Returns the zero-based
    /// `(row, col)` of the first match, or `None` when nothing matches.
    pub fn find_cell_containing(&self, pattern: &str) -> Option<(usize, usize)> {
        // Lower-case the pattern once up front instead of once per cell.
        let needle = pattern.to_lowercase();

        self.rows.iter().enumerate().find_map(|(row_idx, row)| {
            row.iter()
                .position(|cell| cell.to_lowercase().contains(&needle))
                .map(|col_idx| (row_idx, col_idx))
        })
    }

    /// Collects the text of column `col` from every row that has such a column.
    ///
    /// Rows shorter than `col + 1` are skipped, so for ragged data the result
    /// may contain fewer entries than there are rows.
    pub fn get_column(&self, col: usize) -> Vec<&str> {
        self.rows
            .iter()
            .filter_map(|row| row.get(col).map(String::as_str))
            .collect()
    }

    /// Returns the row at zero-based index `row`, or `None` if out of bounds.
    pub fn get_row(&self, row: usize) -> Option<&Vec<String>> {
        self.rows.get(row)
    }

    /// Renders the sheet as text: cells separated by tabs, rows separated by
    /// newlines, with no trailing newline.
    pub fn to_text(&self) -> String {
        self.rows
            .iter()
            .map(|row| row.join("\t"))
            .collect::<Vec<_>>()
            .join("\n")
    }
}