invoice_parser/parsers/
invoice.rs1use crate::error::{InvoiceParserError, Result};
2use crate::extractors::{PdfDocument, SheetData, XlsxExtractor};
3use crate::models::{DocumentFormat, Invoice, ParseResult};
4use crate::parsers::{aws_direct, common, ecloudvalley, format_detector};
5use std::path::Path;
6
/// Stateless entry point for invoice parsing.
///
/// The type carries no data; all functionality is exposed as associated
/// functions (for example `InvoiceParser::parse_file`), so a value never has to
/// be constructed. The common traits are derived so the type can still be
/// debug-printed, copied, or embedded in structs that derive them.
#[derive(Debug, Clone, Copy, Default)]
pub struct InvoiceParser;
8
9impl InvoiceParser {
10 pub fn parse_file<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
11 let path = path.as_ref();
12 let extension = path
13 .extension()
14 .and_then(|e| e.to_str())
15 .map(|e| e.to_lowercase());
16
17 match extension.as_deref() {
18 Some("pdf") => Self::parse_pdf(path),
19 Some("xlsx") | Some("xls") | Some("xlsm") => Self::parse_xlsx(path),
20 Some(ext) => Err(InvoiceParserError::UnsupportedFileType(ext.to_string())),
21 None => Err(InvoiceParserError::UnsupportedFileType(
22 "unknown".to_string(),
23 )),
24 }
25 }
26
27 pub fn parse_pdf<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
28 let doc = PdfDocument::from_file(&path)?;
29 let invoice = Self::parse_text(&doc.full_text)?;
30
31 Ok(ParseResult::single(invoice).with_source(path.as_ref().display().to_string()))
32 }
33
34 pub fn parse_pdf_bytes(bytes: &[u8]) -> Result<ParseResult> {
35 let doc = PdfDocument::from_bytes(bytes)?;
36 let invoice = Self::parse_text(&doc.full_text)?;
37
38 Ok(ParseResult::single(invoice))
39 }
40
41 pub fn parse_xlsx<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
42 let sheets = XlsxExtractor::extract_sheets(&path)?;
43 let mut invoices = Vec::new();
44
45 for sheet in &sheets {
46 if let Ok(invoice) = Self::parse_sheet(sheet) {
47 invoices.push(invoice);
48 }
49 }
50
51 if invoices.is_empty() {
52 let combined_text: String = sheets
53 .iter()
54 .map(|s| s.to_text())
55 .collect::<Vec<_>>()
56 .join("\n");
57 let invoice = Self::parse_text(&combined_text)?;
58 invoices.push(invoice);
59 }
60
61 Ok(ParseResult::multiple(invoices).with_source(path.as_ref().display().to_string()))
62 }
63
64 pub fn parse_text(text: &str) -> Result<Invoice> {
65 if text.trim().is_empty() {
66 return Err(InvoiceParserError::EmptyDocument);
67 }
68
69 let format = format_detector::detect_format(text);
70 let invoice = Self::parse_by_format(text, format);
71
72 Ok(invoice)
73 }
74
75 fn parse_by_format(text: &str, format: DocumentFormat) -> Invoice {
76 match format {
77 DocumentFormat::AwsDirect => aws_direct::parse(text),
78 DocumentFormat::ECloudValleyAws => ecloudvalley::parse(text),
79 _ => Self::parse_generic(text, format),
80 }
81 }
82
83 fn parse_generic(text: &str, format: DocumentFormat) -> Invoice {
84 let mut invoice = Invoice::new();
85 invoice.document_format = format;
86 common::fill_common_fields(&mut invoice, text);
87 invoice
88 }
89
90 fn parse_sheet(sheet: &SheetData) -> Result<Invoice> {
91 let text = sheet.to_text();
92 Self::parse_text(&text)
93 }
94}