invoice_parser/parsers/
invoice.rs1use crate::error::{InvoiceParserError, Result};
2use crate::extractors::{PdfDocument, SheetData, XlsxExtractor};
3use crate::models::{DocumentFormat, Invoice, ParseResult};
4use crate::parsers::{
5 aliyun_direct, aliyun_usage_detail, atlassian, aws_direct, azure_csp, azure_plan_daily,
6 cdn_overage, cdn_traffic, chargebee, cloudmile, common, contentsquare, datastar, digicentre_hk,
7 ecloudvalley, edgenext, format_detector, generic_consultant, google_workspace_billing, hubspot,
8 lokalise, metaage_akamai, microfusion, microfusion_gcp_usage, mlytics_consolidated, mux,
9 reachtop, sentry, slack, tencent_edgeone, ucloud, vnetwork, vnis_invoice, vnis_summary,
10};
11use lazy_static::lazy_static;
12use regex::Regex;
13use std::path::Path;
14
15lazy_static! {
16 static ref UCLOUD_FILENAME_PATTERN: Regex = Regex::new(r"^(\d+)_").unwrap();
17 static ref AZURE_CSP_FILENAME_PATTERN: Regex =
19 Regex::new(r"^(.+?)_Azure_(\d{4})(\d{2})").unwrap();
20}
21
/// Stateless entry point for invoice parsing.
///
/// The type carries no data; all functionality is exposed through associated
/// functions on its `impl` block.
#[derive(Debug, Clone, Copy, Default)]
pub struct InvoiceParser;
23
24impl InvoiceParser {
25 pub fn parse_file<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
26 let path = path.as_ref();
27 let extension = path
28 .extension()
29 .and_then(|e| e.to_str())
30 .map(|e| e.to_lowercase());
31
32 match extension.as_deref() {
33 Some("pdf") => Self::parse_pdf(path),
34 Some("xlsx") | Some("xls") | Some("xlsm") => Self::parse_xlsx(path),
35 Some(ext) => Err(InvoiceParserError::UnsupportedFileType(ext.to_string())),
36 None => Err(InvoiceParserError::UnsupportedFileType(
37 "unknown".to_string(),
38 )),
39 }
40 }
41
42 pub fn parse_pdf<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
43 let doc = PdfDocument::from_file(&path)?;
44 let mut invoice = Self::parse_text(&doc.full_text)?;
45
46 if invoice.document_format == DocumentFormat::AzureCsp {
47 if let Some(filename) = path.as_ref().file_name().and_then(|f| f.to_str()) {
48 if let Some(caps) = AZURE_CSP_FILENAME_PATTERN.captures(filename) {
49 let account_name = caps.get(1).map(|m| m.as_str().to_string());
50 let year = caps.get(2).map(|m| m.as_str()).unwrap_or("");
51 let month = caps.get(3).map(|m| m.as_str()).unwrap_or("");
52 invoice.account_name = account_name;
53 invoice.billing_period = Some(format!("{}-{}", year, month));
54 }
55 }
56 }
57
58 Ok(ParseResult::single(invoice).with_source(path.as_ref().display().to_string()))
59 }
60
61 pub fn parse_pdf_bytes(bytes: &[u8]) -> Result<ParseResult> {
62 let doc = PdfDocument::from_bytes(bytes)?;
63 let invoice = Self::parse_text(&doc.full_text)?;
64
65 Ok(ParseResult::single(invoice))
66 }
67
68 pub fn parse_xlsx<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
69 let sheets = XlsxExtractor::extract_sheets(&path)?;
70
71 let combined_text: String = sheets
72 .iter()
73 .map(|s| s.to_text())
74 .collect::<Vec<_>>()
75 .join("\n");
76
77 let format = format_detector::detect_format(&combined_text);
78
79 if format == DocumentFormat::UCloud {
80 let mut invoice = Self::parse_text(&combined_text)?;
81
82 if let Some(filename) = path.as_ref().file_name().and_then(|f| f.to_str()) {
83 if let Some(caps) = UCLOUD_FILENAME_PATTERN.captures(filename) {
84 let customer_id = caps.get(1).map(|m| m.as_str().to_string());
85 invoice.customer_id = customer_id.clone();
86 invoice.account_name = customer_id;
87 }
88 }
89
90 return Ok(
91 ParseResult::single(invoice).with_source(path.as_ref().display().to_string())
92 );
93 }
94
95 let mut invoices = Vec::new();
96 for sheet in &sheets {
97 if let Ok(invoice) = Self::parse_sheet(sheet) {
98 invoices.push(invoice);
99 }
100 }
101
102 Self::merge_gcp_usage_discount(&mut invoices);
103
104 if invoices.is_empty() {
105 let invoice = Self::parse_text(&combined_text)?;
106 invoices.push(invoice);
107 }
108
109 Ok(ParseResult::multiple(invoices).with_source(path.as_ref().display().to_string()))
110 }
111
112 pub fn parse_text(text: &str) -> Result<Invoice> {
113 if text.trim().is_empty() {
114 return Err(InvoiceParserError::EmptyDocument);
115 }
116
117 let format = format_detector::detect_format(text);
118 let invoice = Self::parse_by_format(text, format);
119
120 Ok(invoice)
121 }
122
123 fn parse_by_format(text: &str, format: DocumentFormat) -> Invoice {
124 match format {
125 DocumentFormat::AwsDirect => aws_direct::parse(text),
126 DocumentFormat::ECloudValleyAws => ecloudvalley::parse(text),
127 DocumentFormat::MicrofusionAliyun => microfusion::parse(text),
128 DocumentFormat::AliyunDirect => aliyun_direct::parse(text),
129 DocumentFormat::UCloud => ucloud::parse(text),
130 DocumentFormat::Lokalise => lokalise::parse(text),
131 DocumentFormat::Sentry => sentry::parse(text),
132 DocumentFormat::Mux => mux::parse(text),
133 DocumentFormat::MlyticsConsolidated => mlytics_consolidated::parse(text),
134 DocumentFormat::AzureCsp => azure_csp::parse(text),
135 DocumentFormat::AliyunUsageDetail => aliyun_usage_detail::parse(text),
136 DocumentFormat::MicrofusionGcpUsage => microfusion_gcp_usage::parse(text),
137 DocumentFormat::Chargebee => chargebee::parse(text),
138 DocumentFormat::Edgenext => edgenext::parse(text),
139 DocumentFormat::DataStar => datastar::parse(text),
140 DocumentFormat::DigicentreHk => digicentre_hk::parse(text),
141 DocumentFormat::CloudMile => cloudmile::parse(text),
142 DocumentFormat::MetaageAkamai => metaage_akamai::parse(text),
143 DocumentFormat::VnisInvoice => vnis_invoice::parse(text),
144 DocumentFormat::TencentEdgeOne => tencent_edgeone::parse(text),
145 DocumentFormat::AzurePlanDaily => azure_plan_daily::parse(text),
146 DocumentFormat::GoogleWorkspaceBilling => google_workspace_billing::parse(text),
147 DocumentFormat::HubSpot => hubspot::parse(text),
148 DocumentFormat::Reachtop => reachtop::parse(text),
149 DocumentFormat::GenericConsultant => generic_consultant::parse(text),
150 DocumentFormat::CdnOverageDetail => cdn_overage::parse(text),
151 DocumentFormat::Atlassian => atlassian::parse(text),
152 DocumentFormat::Contentsquare => contentsquare::parse(text),
153 DocumentFormat::Slack => slack::parse(text),
154 DocumentFormat::VNetwork => vnetwork::parse(text),
155 DocumentFormat::VnisSummary => vnis_summary::parse(text),
156 DocumentFormat::CdnTraffic => cdn_traffic::parse(text),
157 DocumentFormat::NonInvoice => {
158 let mut invoice = Invoice::new();
159 invoice.document_format = DocumentFormat::NonInvoice;
160 invoice.raw_text = Some(text.to_string());
161 invoice
162 }
163 _ => Self::parse_generic(text, format),
164 }
165 }
166
167 fn parse_generic(text: &str, format: DocumentFormat) -> Invoice {
168 let mut invoice = Invoice::new();
169 invoice.document_format = format;
170 common::fill_common_fields(&mut invoice, text);
171 invoice
172 }
173
174 fn parse_sheet(sheet: &SheetData) -> Result<Invoice> {
175 let text = sheet.to_text();
176 Self::parse_text(&text)
177 }
178
179 fn merge_gcp_usage_discount(invoices: &mut Vec<Invoice>) {
180 let discount = invoices
181 .iter()
182 .filter(|i| i.document_format == DocumentFormat::Unknown)
183 .filter_map(|i| {
184 i.raw_text
185 .as_ref()
186 .and_then(|t| microfusion_gcp_usage::parse_discount(t))
187 })
188 .sum::<f64>();
189
190 if discount.abs() < 0.001 {
191 return;
192 }
193
194 if let Some(gcp) = invoices
195 .iter_mut()
196 .find(|i| i.document_format == DocumentFormat::MicrofusionGcpUsage)
197 {
198 gcp.discount_amount = Some(discount);
199 gcp.total_amount = ((gcp.total_amount + discount) * 100.0).round() / 100.0;
200 }
201
202 invoices.retain(|i| {
203 !(i.document_format == DocumentFormat::Unknown
204 && i.raw_text
205 .as_ref()
206 .map(|t| microfusion_gcp_usage::parse_discount(t).is_some())
207 .unwrap_or(false))
208 });
209 }
210}