Skip to main content

invoice_parser/parsers/
invoice.rs

1use crate::error::{InvoiceParserError, Result};
2use crate::extractors::{PdfDocument, SheetData, XlsxExtractor};
3use crate::models::{DocumentFormat, Invoice, ParseResult};
4use crate::parsers::{aws_direct, common, ecloudvalley, format_detector};
5use std::path::Path;
6
7pub struct InvoiceParser;
8
9impl InvoiceParser {
10    pub fn parse_file<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
11        let path = path.as_ref();
12        let extension = path
13            .extension()
14            .and_then(|e| e.to_str())
15            .map(|e| e.to_lowercase());
16
17        match extension.as_deref() {
18            Some("pdf") => Self::parse_pdf(path),
19            Some("xlsx") | Some("xls") | Some("xlsm") => Self::parse_xlsx(path),
20            Some(ext) => Err(InvoiceParserError::UnsupportedFileType(ext.to_string())),
21            None => Err(InvoiceParserError::UnsupportedFileType(
22                "unknown".to_string(),
23            )),
24        }
25    }
26
27    pub fn parse_pdf<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
28        let doc = PdfDocument::from_file(&path)?;
29        let invoice = Self::parse_text(&doc.full_text)?;
30
31        Ok(ParseResult::single(invoice).with_source(path.as_ref().display().to_string()))
32    }
33
34    pub fn parse_pdf_bytes(bytes: &[u8]) -> Result<ParseResult> {
35        let doc = PdfDocument::from_bytes(bytes)?;
36        let invoice = Self::parse_text(&doc.full_text)?;
37
38        Ok(ParseResult::single(invoice))
39    }
40
41    pub fn parse_xlsx<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
42        let sheets = XlsxExtractor::extract_sheets(&path)?;
43        let mut invoices = Vec::new();
44
45        for sheet in &sheets {
46            if let Ok(invoice) = Self::parse_sheet(sheet) {
47                invoices.push(invoice);
48            }
49        }
50
51        if invoices.is_empty() {
52            let combined_text: String = sheets
53                .iter()
54                .map(|s| s.to_text())
55                .collect::<Vec<_>>()
56                .join("\n");
57            let invoice = Self::parse_text(&combined_text)?;
58            invoices.push(invoice);
59        }
60
61        Ok(ParseResult::multiple(invoices).with_source(path.as_ref().display().to_string()))
62    }
63
64    pub fn parse_text(text: &str) -> Result<Invoice> {
65        if text.trim().is_empty() {
66            return Err(InvoiceParserError::EmptyDocument);
67        }
68
69        let format = format_detector::detect_format(text);
70        let invoice = Self::parse_by_format(text, format);
71
72        Ok(invoice)
73    }
74
75    fn parse_by_format(text: &str, format: DocumentFormat) -> Invoice {
76        match format {
77            DocumentFormat::AwsDirect => aws_direct::parse(text),
78            DocumentFormat::ECloudValleyAws => ecloudvalley::parse(text),
79            _ => Self::parse_generic(text, format),
80        }
81    }
82
83    fn parse_generic(text: &str, format: DocumentFormat) -> Invoice {
84        let mut invoice = Invoice::new();
85        invoice.document_format = format;
86        common::fill_common_fields(&mut invoice, text);
87        invoice
88    }
89
90    fn parse_sheet(sheet: &SheetData) -> Result<Invoice> {
91        let text = sheet.to_text();
92        Self::parse_text(&text)
93    }
94}