invoice_parser/parsers/
invoice.rs1use crate::error::{InvoiceParserError, Result};
2use crate::extractors::{PdfDocument, SheetData, XlsxExtractor};
3use crate::models::{DocumentFormat, Invoice, ParseResult};
4use crate::parsers::{
5 aliyun_direct, aws_direct, common, ecloudvalley, format_detector, lokalise, microfusion, mux,
6 sentry, ucloud,
7};
8use lazy_static::lazy_static;
9use regex::Regex;
10use std::path::Path;
11
lazy_static! {
    /// Matches one or more digits at the very start of a string that are
    /// immediately followed by an underscore; capture group 1 holds the digits.
    ///
    /// `InvoiceParser::parse_xlsx` applies this to the workbook's file name
    /// for UCloud documents and uses the captured digits as both the customer
    /// id and the account name of the resulting invoice.
    static ref UCLOUD_FILENAME_PATTERN: Regex = Regex::new(r"^(\d+)_").unwrap();
}
15
/// Stateless entry point for invoice parsing.
///
/// The type carries no data; its associated functions parse invoices from
/// PDF files, PDF bytes, spreadsheet workbooks, or already-extracted text.
pub struct InvoiceParser;
17
18impl InvoiceParser {
19 pub fn parse_file<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
20 let path = path.as_ref();
21 let extension = path
22 .extension()
23 .and_then(|e| e.to_str())
24 .map(|e| e.to_lowercase());
25
26 match extension.as_deref() {
27 Some("pdf") => Self::parse_pdf(path),
28 Some("xlsx") | Some("xls") | Some("xlsm") => Self::parse_xlsx(path),
29 Some(ext) => Err(InvoiceParserError::UnsupportedFileType(ext.to_string())),
30 None => Err(InvoiceParserError::UnsupportedFileType(
31 "unknown".to_string(),
32 )),
33 }
34 }
35
36 pub fn parse_pdf<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
37 let doc = PdfDocument::from_file(&path)?;
38 let invoice = Self::parse_text(&doc.full_text)?;
39
40 Ok(ParseResult::single(invoice).with_source(path.as_ref().display().to_string()))
41 }
42
43 pub fn parse_pdf_bytes(bytes: &[u8]) -> Result<ParseResult> {
44 let doc = PdfDocument::from_bytes(bytes)?;
45 let invoice = Self::parse_text(&doc.full_text)?;
46
47 Ok(ParseResult::single(invoice))
48 }
49
50 pub fn parse_xlsx<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
51 let sheets = XlsxExtractor::extract_sheets(&path)?;
52
53 let combined_text: String = sheets
54 .iter()
55 .map(|s| s.to_text())
56 .collect::<Vec<_>>()
57 .join("\n");
58
59 let format = format_detector::detect_format(&combined_text);
60
61 if format == DocumentFormat::UCloud {
62 let mut invoice = Self::parse_text(&combined_text)?;
63
64 if let Some(filename) = path.as_ref().file_name().and_then(|f| f.to_str()) {
65 if let Some(caps) = UCLOUD_FILENAME_PATTERN.captures(filename) {
66 let customer_id = caps.get(1).map(|m| m.as_str().to_string());
67 invoice.customer_id = customer_id.clone();
68 invoice.account_name = customer_id;
69 }
70 }
71
72 return Ok(
73 ParseResult::single(invoice).with_source(path.as_ref().display().to_string())
74 );
75 }
76
77 let mut invoices = Vec::new();
78 for sheet in &sheets {
79 if let Ok(invoice) = Self::parse_sheet(sheet) {
80 invoices.push(invoice);
81 }
82 }
83
84 if invoices.is_empty() {
85 let invoice = Self::parse_text(&combined_text)?;
86 invoices.push(invoice);
87 }
88
89 Ok(ParseResult::multiple(invoices).with_source(path.as_ref().display().to_string()))
90 }
91
92 pub fn parse_text(text: &str) -> Result<Invoice> {
93 if text.trim().is_empty() {
94 return Err(InvoiceParserError::EmptyDocument);
95 }
96
97 let format = format_detector::detect_format(text);
98 let invoice = Self::parse_by_format(text, format);
99
100 Ok(invoice)
101 }
102
103 fn parse_by_format(text: &str, format: DocumentFormat) -> Invoice {
104 match format {
105 DocumentFormat::AwsDirect => aws_direct::parse(text),
106 DocumentFormat::ECloudValleyAws => ecloudvalley::parse(text),
107 DocumentFormat::MicrofusionAliyun => microfusion::parse(text),
108 DocumentFormat::AliyunDirect => aliyun_direct::parse(text),
109 DocumentFormat::UCloud => ucloud::parse(text),
110 DocumentFormat::Lokalise => lokalise::parse(text),
111 DocumentFormat::Sentry => sentry::parse(text),
112 DocumentFormat::Mux => mux::parse(text),
113 _ => Self::parse_generic(text, format),
114 }
115 }
116
117 fn parse_generic(text: &str, format: DocumentFormat) -> Invoice {
118 let mut invoice = Invoice::new();
119 invoice.document_format = format;
120 common::fill_common_fields(&mut invoice, text);
121 invoice
122 }
123
124 fn parse_sheet(sheet: &SheetData) -> Result<Invoice> {
125 let text = sheet.to_text();
126 Self::parse_text(&text)
127 }
128}