Skip to main content

invoice_parser/parsers/
invoice.rs

1use crate::error::{InvoiceParserError, Result};
2use crate::extractors::{PdfDocument, SheetData, XlsxExtractor};
3use crate::models::{DocumentFormat, Invoice, ParseResult};
4use crate::parsers::{
5    aliyun_direct, aws_direct, common, ecloudvalley, format_detector, microfusion, ucloud,
6};
7use std::path::Path;
8
/// Stateless entry point for invoice parsing.
///
/// This is a unit struct: it holds no data and exists only to group the
/// parsing functions defined as associated functions on it.
#[derive(Debug, Clone, Copy, Default)]
pub struct InvoiceParser;
10
11impl InvoiceParser {
12    pub fn parse_file<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
13        let path = path.as_ref();
14        let extension = path
15            .extension()
16            .and_then(|e| e.to_str())
17            .map(|e| e.to_lowercase());
18
19        match extension.as_deref() {
20            Some("pdf") => Self::parse_pdf(path),
21            Some("xlsx") | Some("xls") | Some("xlsm") => Self::parse_xlsx(path),
22            Some(ext) => Err(InvoiceParserError::UnsupportedFileType(ext.to_string())),
23            None => Err(InvoiceParserError::UnsupportedFileType(
24                "unknown".to_string(),
25            )),
26        }
27    }
28
29    pub fn parse_pdf<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
30        let doc = PdfDocument::from_file(&path)?;
31        let invoice = Self::parse_text(&doc.full_text)?;
32
33        Ok(ParseResult::single(invoice).with_source(path.as_ref().display().to_string()))
34    }
35
36    pub fn parse_pdf_bytes(bytes: &[u8]) -> Result<ParseResult> {
37        let doc = PdfDocument::from_bytes(bytes)?;
38        let invoice = Self::parse_text(&doc.full_text)?;
39
40        Ok(ParseResult::single(invoice))
41    }
42
43    pub fn parse_xlsx<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
44        let sheets = XlsxExtractor::extract_sheets(&path)?;
45        let mut invoices = Vec::new();
46
47        for sheet in &sheets {
48            if let Ok(invoice) = Self::parse_sheet(sheet) {
49                invoices.push(invoice);
50            }
51        }
52
53        if invoices.is_empty() {
54            let combined_text: String = sheets
55                .iter()
56                .map(|s| s.to_text())
57                .collect::<Vec<_>>()
58                .join("\n");
59            let invoice = Self::parse_text(&combined_text)?;
60            invoices.push(invoice);
61        }
62
63        Ok(ParseResult::multiple(invoices).with_source(path.as_ref().display().to_string()))
64    }
65
66    pub fn parse_text(text: &str) -> Result<Invoice> {
67        if text.trim().is_empty() {
68            return Err(InvoiceParserError::EmptyDocument);
69        }
70
71        let format = format_detector::detect_format(text);
72        let invoice = Self::parse_by_format(text, format);
73
74        Ok(invoice)
75    }
76
77    fn parse_by_format(text: &str, format: DocumentFormat) -> Invoice {
78        match format {
79            DocumentFormat::AwsDirect => aws_direct::parse(text),
80            DocumentFormat::ECloudValleyAws => ecloudvalley::parse(text),
81            DocumentFormat::MicrofusionAliyun => microfusion::parse(text),
82            DocumentFormat::AliyunDirect => aliyun_direct::parse(text),
83            DocumentFormat::UCloud => ucloud::parse(text),
84            _ => Self::parse_generic(text, format),
85        }
86    }
87
88    fn parse_generic(text: &str, format: DocumentFormat) -> Invoice {
89        let mut invoice = Invoice::new();
90        invoice.document_format = format;
91        common::fill_common_fields(&mut invoice, text);
92        invoice
93    }
94
95    fn parse_sheet(sheet: &SheetData) -> Result<Invoice> {
96        let text = sheet.to_text();
97        Self::parse_text(&text)
98    }
99}