Skip to main content

invoice_parser/parsers/
invoice.rs

1use crate::error::{InvoiceParserError, Result};
2use crate::extractors::{PdfDocument, SheetData, XlsxExtractor};
3use crate::models::{DocumentFormat, Invoice, ParseResult};
4use crate::parsers::{
5    aliyun_direct, aws_direct, common, ecloudvalley, format_detector, microfusion, ucloud,
6};
7use lazy_static::lazy_static;
8use regex::Regex;
9use std::path::Path;
10
11lazy_static! {
12    static ref UCLOUD_FILENAME_PATTERN: Regex = Regex::new(r"^(\d+)_").unwrap();
13}
14
15pub struct InvoiceParser;
16
17impl InvoiceParser {
18    pub fn parse_file<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
19        let path = path.as_ref();
20        let extension = path
21            .extension()
22            .and_then(|e| e.to_str())
23            .map(|e| e.to_lowercase());
24
25        match extension.as_deref() {
26            Some("pdf") => Self::parse_pdf(path),
27            Some("xlsx") | Some("xls") | Some("xlsm") => Self::parse_xlsx(path),
28            Some(ext) => Err(InvoiceParserError::UnsupportedFileType(ext.to_string())),
29            None => Err(InvoiceParserError::UnsupportedFileType(
30                "unknown".to_string(),
31            )),
32        }
33    }
34
35    pub fn parse_pdf<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
36        let doc = PdfDocument::from_file(&path)?;
37        let invoice = Self::parse_text(&doc.full_text)?;
38
39        Ok(ParseResult::single(invoice).with_source(path.as_ref().display().to_string()))
40    }
41
42    pub fn parse_pdf_bytes(bytes: &[u8]) -> Result<ParseResult> {
43        let doc = PdfDocument::from_bytes(bytes)?;
44        let invoice = Self::parse_text(&doc.full_text)?;
45
46        Ok(ParseResult::single(invoice))
47    }
48
49    pub fn parse_xlsx<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
50        let sheets = XlsxExtractor::extract_sheets(&path)?;
51
52        let combined_text: String = sheets
53            .iter()
54            .map(|s| s.to_text())
55            .collect::<Vec<_>>()
56            .join("\n");
57
58        let format = format_detector::detect_format(&combined_text);
59
60        if format == DocumentFormat::UCloud {
61            let mut invoice = Self::parse_text(&combined_text)?;
62
63            if let Some(filename) = path.as_ref().file_name().and_then(|f| f.to_str()) {
64                if let Some(caps) = UCLOUD_FILENAME_PATTERN.captures(filename) {
65                    let customer_id = caps.get(1).map(|m| m.as_str().to_string());
66                    invoice.customer_id = customer_id.clone();
67                    invoice.account_name = customer_id;
68                }
69            }
70
71            return Ok(
72                ParseResult::single(invoice).with_source(path.as_ref().display().to_string())
73            );
74        }
75
76        let mut invoices = Vec::new();
77        for sheet in &sheets {
78            if let Ok(invoice) = Self::parse_sheet(sheet) {
79                invoices.push(invoice);
80            }
81        }
82
83        if invoices.is_empty() {
84            let invoice = Self::parse_text(&combined_text)?;
85            invoices.push(invoice);
86        }
87
88        Ok(ParseResult::multiple(invoices).with_source(path.as_ref().display().to_string()))
89    }
90
91    pub fn parse_text(text: &str) -> Result<Invoice> {
92        if text.trim().is_empty() {
93            return Err(InvoiceParserError::EmptyDocument);
94        }
95
96        let format = format_detector::detect_format(text);
97        let invoice = Self::parse_by_format(text, format);
98
99        Ok(invoice)
100    }
101
102    fn parse_by_format(text: &str, format: DocumentFormat) -> Invoice {
103        match format {
104            DocumentFormat::AwsDirect => aws_direct::parse(text),
105            DocumentFormat::ECloudValleyAws => ecloudvalley::parse(text),
106            DocumentFormat::MicrofusionAliyun => microfusion::parse(text),
107            DocumentFormat::AliyunDirect => aliyun_direct::parse(text),
108            DocumentFormat::UCloud => ucloud::parse(text),
109            _ => Self::parse_generic(text, format),
110        }
111    }
112
113    fn parse_generic(text: &str, format: DocumentFormat) -> Invoice {
114        let mut invoice = Invoice::new();
115        invoice.document_format = format;
116        common::fill_common_fields(&mut invoice, text);
117        invoice
118    }
119
120    fn parse_sheet(sheet: &SheetData) -> Result<Invoice> {
121        let text = sheet.to_text();
122        Self::parse_text(&text)
123    }
124}