invoice_parser/parsers/
invoice.rs1use crate::error::{InvoiceParserError, Result};
2use crate::extractors::{PdfDocument, SheetData, XlsxExtractor};
3use crate::models::{DocumentFormat, Invoice, ParseResult};
4use crate::parsers::{
5 aliyun_direct, aws_direct, common, ecloudvalley, format_detector, microfusion, ucloud,
6};
7use std::path::Path;
8
/// Stateless entry point for invoice parsing.
///
/// The type carries no data; all functionality is exposed through associated
/// functions (for example `InvoiceParser::parse_file`). The derived traits make
/// the marker type printable, freely copyable and default-constructible so it
/// can be embedded in other types without ceremony.
#[derive(Debug, Clone, Copy, Default)]
pub struct InvoiceParser;
10
11impl InvoiceParser {
12 pub fn parse_file<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
13 let path = path.as_ref();
14 let extension = path
15 .extension()
16 .and_then(|e| e.to_str())
17 .map(|e| e.to_lowercase());
18
19 match extension.as_deref() {
20 Some("pdf") => Self::parse_pdf(path),
21 Some("xlsx") | Some("xls") | Some("xlsm") => Self::parse_xlsx(path),
22 Some(ext) => Err(InvoiceParserError::UnsupportedFileType(ext.to_string())),
23 None => Err(InvoiceParserError::UnsupportedFileType(
24 "unknown".to_string(),
25 )),
26 }
27 }
28
29 pub fn parse_pdf<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
30 let doc = PdfDocument::from_file(&path)?;
31 let invoice = Self::parse_text(&doc.full_text)?;
32
33 Ok(ParseResult::single(invoice).with_source(path.as_ref().display().to_string()))
34 }
35
36 pub fn parse_pdf_bytes(bytes: &[u8]) -> Result<ParseResult> {
37 let doc = PdfDocument::from_bytes(bytes)?;
38 let invoice = Self::parse_text(&doc.full_text)?;
39
40 Ok(ParseResult::single(invoice))
41 }
42
43 pub fn parse_xlsx<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
44 let sheets = XlsxExtractor::extract_sheets(&path)?;
45 let mut invoices = Vec::new();
46
47 for sheet in &sheets {
48 if let Ok(invoice) = Self::parse_sheet(sheet) {
49 invoices.push(invoice);
50 }
51 }
52
53 if invoices.is_empty() {
54 let combined_text: String = sheets
55 .iter()
56 .map(|s| s.to_text())
57 .collect::<Vec<_>>()
58 .join("\n");
59 let invoice = Self::parse_text(&combined_text)?;
60 invoices.push(invoice);
61 }
62
63 Ok(ParseResult::multiple(invoices).with_source(path.as_ref().display().to_string()))
64 }
65
66 pub fn parse_text(text: &str) -> Result<Invoice> {
67 if text.trim().is_empty() {
68 return Err(InvoiceParserError::EmptyDocument);
69 }
70
71 let format = format_detector::detect_format(text);
72 let invoice = Self::parse_by_format(text, format);
73
74 Ok(invoice)
75 }
76
77 fn parse_by_format(text: &str, format: DocumentFormat) -> Invoice {
78 match format {
79 DocumentFormat::AwsDirect => aws_direct::parse(text),
80 DocumentFormat::ECloudValleyAws => ecloudvalley::parse(text),
81 DocumentFormat::MicrofusionAliyun => microfusion::parse(text),
82 DocumentFormat::AliyunDirect => aliyun_direct::parse(text),
83 DocumentFormat::UCloud => ucloud::parse(text),
84 _ => Self::parse_generic(text, format),
85 }
86 }
87
88 fn parse_generic(text: &str, format: DocumentFormat) -> Invoice {
89 let mut invoice = Invoice::new();
90 invoice.document_format = format;
91 common::fill_common_fields(&mut invoice, text);
92 invoice
93 }
94
95 fn parse_sheet(sheet: &SheetData) -> Result<Invoice> {
96 let text = sheet.to_text();
97 Self::parse_text(&text)
98 }
99}