Skip to main content

invoice_parser/parsers/
invoice.rs

1use crate::error::{InvoiceParserError, Result};
2use crate::extractors::{PdfDocument, SheetData, XlsxExtractor};
3use crate::models::{DocumentFormat, Invoice, ParseResult};
4use crate::parsers::{
5    aliyun_direct, aws_direct, common, ecloudvalley, format_detector, lokalise, microfusion, mux,
6    sentry, ucloud,
7};
8use lazy_static::lazy_static;
9use regex::Regex;
10use std::path::Path;
11
12lazy_static! {
13    static ref UCLOUD_FILENAME_PATTERN: Regex = Regex::new(r"^(\d+)_").unwrap();
14}
15
16pub struct InvoiceParser;
17
18impl InvoiceParser {
19    pub fn parse_file<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
20        let path = path.as_ref();
21        let extension = path
22            .extension()
23            .and_then(|e| e.to_str())
24            .map(|e| e.to_lowercase());
25
26        match extension.as_deref() {
27            Some("pdf") => Self::parse_pdf(path),
28            Some("xlsx") | Some("xls") | Some("xlsm") => Self::parse_xlsx(path),
29            Some(ext) => Err(InvoiceParserError::UnsupportedFileType(ext.to_string())),
30            None => Err(InvoiceParserError::UnsupportedFileType(
31                "unknown".to_string(),
32            )),
33        }
34    }
35
36    pub fn parse_pdf<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
37        let doc = PdfDocument::from_file(&path)?;
38        let invoice = Self::parse_text(&doc.full_text)?;
39
40        Ok(ParseResult::single(invoice).with_source(path.as_ref().display().to_string()))
41    }
42
43    pub fn parse_pdf_bytes(bytes: &[u8]) -> Result<ParseResult> {
44        let doc = PdfDocument::from_bytes(bytes)?;
45        let invoice = Self::parse_text(&doc.full_text)?;
46
47        Ok(ParseResult::single(invoice))
48    }
49
50    pub fn parse_xlsx<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
51        let sheets = XlsxExtractor::extract_sheets(&path)?;
52
53        let combined_text: String = sheets
54            .iter()
55            .map(|s| s.to_text())
56            .collect::<Vec<_>>()
57            .join("\n");
58
59        let format = format_detector::detect_format(&combined_text);
60
61        if format == DocumentFormat::UCloud {
62            let mut invoice = Self::parse_text(&combined_text)?;
63
64            if let Some(filename) = path.as_ref().file_name().and_then(|f| f.to_str()) {
65                if let Some(caps) = UCLOUD_FILENAME_PATTERN.captures(filename) {
66                    let customer_id = caps.get(1).map(|m| m.as_str().to_string());
67                    invoice.customer_id = customer_id.clone();
68                    invoice.account_name = customer_id;
69                }
70            }
71
72            return Ok(
73                ParseResult::single(invoice).with_source(path.as_ref().display().to_string())
74            );
75        }
76
77        let mut invoices = Vec::new();
78        for sheet in &sheets {
79            if let Ok(invoice) = Self::parse_sheet(sheet) {
80                invoices.push(invoice);
81            }
82        }
83
84        if invoices.is_empty() {
85            let invoice = Self::parse_text(&combined_text)?;
86            invoices.push(invoice);
87        }
88
89        Ok(ParseResult::multiple(invoices).with_source(path.as_ref().display().to_string()))
90    }
91
92    pub fn parse_text(text: &str) -> Result<Invoice> {
93        if text.trim().is_empty() {
94            return Err(InvoiceParserError::EmptyDocument);
95        }
96
97        let format = format_detector::detect_format(text);
98        let invoice = Self::parse_by_format(text, format);
99
100        Ok(invoice)
101    }
102
103    fn parse_by_format(text: &str, format: DocumentFormat) -> Invoice {
104        match format {
105            DocumentFormat::AwsDirect => aws_direct::parse(text),
106            DocumentFormat::ECloudValleyAws => ecloudvalley::parse(text),
107            DocumentFormat::MicrofusionAliyun => microfusion::parse(text),
108            DocumentFormat::AliyunDirect => aliyun_direct::parse(text),
109            DocumentFormat::UCloud => ucloud::parse(text),
110            DocumentFormat::Lokalise => lokalise::parse(text),
111            DocumentFormat::Sentry => sentry::parse(text),
112            DocumentFormat::Mux => mux::parse(text),
113            _ => Self::parse_generic(text, format),
114        }
115    }
116
117    fn parse_generic(text: &str, format: DocumentFormat) -> Invoice {
118        let mut invoice = Invoice::new();
119        invoice.document_format = format;
120        common::fill_common_fields(&mut invoice, text);
121        invoice
122    }
123
124    fn parse_sheet(sheet: &SheetData) -> Result<Invoice> {
125        let text = sheet.to_text();
126        Self::parse_text(&text)
127    }
128}