invoice_parser/parsers/
invoice.rs1use crate::error::{InvoiceParserError, Result};
2use crate::extractors::{PdfDocument, SheetData, XlsxExtractor};
3use crate::models::{DocumentFormat, Invoice, ParseResult};
4use crate::parsers::{
5 aliyun_direct, aws_direct, common, ecloudvalley, format_detector, microfusion, ucloud,
6};
7use lazy_static::lazy_static;
8use regex::Regex;
9use std::path::Path;
10
11lazy_static! {
12 static ref UCLOUD_FILENAME_PATTERN: Regex = Regex::new(r"^(\d+)_").unwrap();
13}
14
/// Stateless entry point for invoice parsing.
///
/// The type carries no data; all functionality is exposed through associated
/// functions (`InvoiceParser::parse_file`, `InvoiceParser::parse_text`, ...).
/// The common traits are derived so the type can be logged, copied and
/// default-constructed like any other plain value.
#[derive(Debug, Clone, Copy, Default)]
pub struct InvoiceParser;
16
17impl InvoiceParser {
18 pub fn parse_file<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
19 let path = path.as_ref();
20 let extension = path
21 .extension()
22 .and_then(|e| e.to_str())
23 .map(|e| e.to_lowercase());
24
25 match extension.as_deref() {
26 Some("pdf") => Self::parse_pdf(path),
27 Some("xlsx") | Some("xls") | Some("xlsm") => Self::parse_xlsx(path),
28 Some(ext) => Err(InvoiceParserError::UnsupportedFileType(ext.to_string())),
29 None => Err(InvoiceParserError::UnsupportedFileType(
30 "unknown".to_string(),
31 )),
32 }
33 }
34
35 pub fn parse_pdf<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
36 let doc = PdfDocument::from_file(&path)?;
37 let invoice = Self::parse_text(&doc.full_text)?;
38
39 Ok(ParseResult::single(invoice).with_source(path.as_ref().display().to_string()))
40 }
41
42 pub fn parse_pdf_bytes(bytes: &[u8]) -> Result<ParseResult> {
43 let doc = PdfDocument::from_bytes(bytes)?;
44 let invoice = Self::parse_text(&doc.full_text)?;
45
46 Ok(ParseResult::single(invoice))
47 }
48
49 pub fn parse_xlsx<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
50 let sheets = XlsxExtractor::extract_sheets(&path)?;
51
52 let combined_text: String = sheets
53 .iter()
54 .map(|s| s.to_text())
55 .collect::<Vec<_>>()
56 .join("\n");
57
58 let format = format_detector::detect_format(&combined_text);
59
60 if format == DocumentFormat::UCloud {
61 let mut invoice = Self::parse_text(&combined_text)?;
62
63 if let Some(filename) = path.as_ref().file_name().and_then(|f| f.to_str()) {
64 if let Some(caps) = UCLOUD_FILENAME_PATTERN.captures(filename) {
65 let customer_id = caps.get(1).map(|m| m.as_str().to_string());
66 invoice.customer_id = customer_id.clone();
67 invoice.account_name = customer_id;
68 }
69 }
70
71 return Ok(
72 ParseResult::single(invoice).with_source(path.as_ref().display().to_string())
73 );
74 }
75
76 let mut invoices = Vec::new();
77 for sheet in &sheets {
78 if let Ok(invoice) = Self::parse_sheet(sheet) {
79 invoices.push(invoice);
80 }
81 }
82
83 if invoices.is_empty() {
84 let invoice = Self::parse_text(&combined_text)?;
85 invoices.push(invoice);
86 }
87
88 Ok(ParseResult::multiple(invoices).with_source(path.as_ref().display().to_string()))
89 }
90
91 pub fn parse_text(text: &str) -> Result<Invoice> {
92 if text.trim().is_empty() {
93 return Err(InvoiceParserError::EmptyDocument);
94 }
95
96 let format = format_detector::detect_format(text);
97 let invoice = Self::parse_by_format(text, format);
98
99 Ok(invoice)
100 }
101
102 fn parse_by_format(text: &str, format: DocumentFormat) -> Invoice {
103 match format {
104 DocumentFormat::AwsDirect => aws_direct::parse(text),
105 DocumentFormat::ECloudValleyAws => ecloudvalley::parse(text),
106 DocumentFormat::MicrofusionAliyun => microfusion::parse(text),
107 DocumentFormat::AliyunDirect => aliyun_direct::parse(text),
108 DocumentFormat::UCloud => ucloud::parse(text),
109 _ => Self::parse_generic(text, format),
110 }
111 }
112
113 fn parse_generic(text: &str, format: DocumentFormat) -> Invoice {
114 let mut invoice = Invoice::new();
115 invoice.document_format = format;
116 common::fill_common_fields(&mut invoice, text);
117 invoice
118 }
119
120 fn parse_sheet(sheet: &SheetData) -> Result<Invoice> {
121 let text = sheet.to_text();
122 Self::parse_text(&text)
123 }
124}