br-invoice-parser 0.1.10

A Rust library for parsing invoices and bills from PDF and XLSX files
Documentation
use crate::error::{InvoiceParserError, Result};
use std::path::Path;

/// Stateless entry point for PDF text extraction.
///
/// This is a unit struct: it holds no data and only groups the associated
/// functions `extract_text` and `extract_text_from_bytes`.
pub struct PdfExtractor;

impl PdfExtractor {
    pub fn extract_text<P: AsRef<Path>>(path: P) -> Result<String> {
        let path = path.as_ref();

        if !path.exists() {
            return Err(InvoiceParserError::FileReadError(std::io::Error::new(
                std::io::ErrorKind::NotFound,
                format!("File not found: {}", path.display()),
            )));
        }

        let extension = path
            .extension()
            .and_then(|e| e.to_str())
            .map(|e| e.to_lowercase());

        if extension.as_deref() != Some("pdf") {
            return Err(InvoiceParserError::InvalidFileFormat {
                expected: "pdf".to_string(),
                actual: extension.unwrap_or_else(|| "unknown".to_string()),
            });
        }

        pdf_extract::extract_text(path)
            .map_err(|e| InvoiceParserError::PdfExtractionError(e.to_string()))
    }

    pub fn extract_text_from_bytes(bytes: &[u8]) -> Result<String> {
        pdf_extract::extract_text_from_mem(bytes)
            .map_err(|e| InvoiceParserError::PdfExtractionError(e.to_string()))
    }
}

/// Text belonging to a single page of an extracted PDF.
pub struct ExtractedPage {
    /// 1-based position of this page among the form-feed-separated chunks of
    /// the extracted text.
    pub page_number: usize,
    /// Raw text of the page, stored untrimmed.
    pub text: String,
}

/// Extracted text of a PDF, available both page by page and as a whole.
pub struct PdfDocument {
    /// Pages obtained by splitting `full_text`; chunks that contain only
    /// whitespace are not included.
    pub pages: Vec<ExtractedPage>,
    /// Complete extracted text, exactly as returned by the extractor.
    pub full_text: String,
}

impl PdfDocument {
    /// Builds a document from the PDF file at `path`.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `PdfExtractor::extract_text`.
    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
        PdfExtractor::extract_text(path).map(Self::from_text)
    }

    /// Builds a document from a PDF held in memory.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by
    /// `PdfExtractor::extract_text_from_bytes`.
    pub fn from_bytes(bytes: &[u8]) -> Result<Self> {
        PdfExtractor::extract_text_from_bytes(bytes).map(Self::from_text)
    }

    /// Assembles a document from already-extracted text.
    fn from_text(full_text: String) -> Self {
        let pages = Self::split_into_pages(&full_text);
        Self { pages, full_text }
    }

    /// Splits `text` on the form-feed character (U+000C) into pages.
    ///
    /// Page numbers are assigned from each chunk's position *before* blank
    /// chunks are dropped, so numbering may contain gaps when a chunk holds
    /// only whitespace.
    // NOTE(review): this assumes the extractor separates pages with form
    // feeds; confirm against the `pdf_extract` version in use.
    fn split_into_pages(text: &str) -> Vec<ExtractedPage> {
        let mut pages = Vec::new();

        for (index, chunk) in text.split('\x0C').enumerate() {
            // Whitespace-only chunks are skipped but still consume a number.
            if chunk.trim().is_empty() {
                continue;
            }
            pages.push(ExtractedPage {
                page_number: index + 1,
                text: chunk.to_string(),
            });
        }

        pages
    }

    /// Number of entries in `pages`, i.e. pages that contain some
    /// non-whitespace text.
    pub fn page_count(&self) -> usize {
        self.pages.len()
    }
}