1use crate::error::{InvoiceParserError, Result};
2use crate::extractors::{PdfDocument, SheetData, XlsxExtractor};
3use crate::models::{DocumentFormat, Invoice, ParseResult};
4use crate::parsers::{
5 aliyun_direct, aliyun_usage_detail, atlassian, aws_direct, azure_csp, azure_plan_daily,
6 cdn_overage, cdn_traffic, chargebee, cloudmile, common, contentsquare, datastar, digicentre_hk,
7 ecloudvalley, edgenext, format_detector, generic_consultant, google_workspace_billing, hubspot,
8 lokalise, metaage_akamai, microfusion, microfusion_gcp_usage, mlytics_consolidated, mux,
9 reachtop, sentry, slack, tencent_edgeone, ucloud, vnetwork, vnis_invoice, vnis_summary,
10};
11use lazy_static::lazy_static;
12use regex::Regex;
13use std::path::Path;
14
15lazy_static! {
16 static ref UCLOUD_FILENAME_PATTERN: Regex = Regex::new(r"^(\d+)_").unwrap();
17 static ref AZURE_CSP_FILENAME_PATTERN: Regex =
19 Regex::new(r"^(.+?)_Azure_(\d{4})(\d{2})").unwrap();
20}
21
/// Stateless entry point for invoice parsing.
///
/// This is a unit struct: it holds no data, and its functionality is exposed through
/// associated functions such as `InvoiceParser::parse_file`.
pub struct InvoiceParser;
23
24impl InvoiceParser {
25 pub fn parse_file<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
26 let path = path.as_ref();
27 let extension = path
28 .extension()
29 .and_then(|e| e.to_str())
30 .map(|e| e.to_lowercase());
31
32 match extension.as_deref() {
33 Some("pdf") => Self::parse_pdf(path),
34 Some("xlsx") | Some("xls") | Some("xlsm") => Self::parse_xlsx(path),
35 Some(ext) => Err(InvoiceParserError::UnsupportedFileType(ext.to_string())),
36 None => Err(InvoiceParserError::UnsupportedFileType(
37 "unknown".to_string(),
38 )),
39 }
40 }
41
42 pub fn parse_pdf<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
43 let doc = PdfDocument::from_file(&path)?;
44 let text = &doc.full_text;
45 if text.trim().is_empty() {
46 return Err(InvoiceParserError::EmptyDocument);
47 }
48
49 let filename = path.as_ref().file_name().and_then(|f| f.to_str());
50 let format = format_detector::detect_format_with_filename(text, filename);
51 let mut invoice = Self::parse_by_format(text, format);
52
53 if invoice.document_format == DocumentFormat::AzureCsp {
54 if let Some(fname) = filename {
55 if let Some(caps) = AZURE_CSP_FILENAME_PATTERN.captures(fname) {
56 let account_name = caps.get(1).map(|m| m.as_str().to_string());
57 let year = caps.get(2).map(|m| m.as_str()).unwrap_or("");
58 let month = caps.get(3).map(|m| m.as_str()).unwrap_or("");
59 invoice.account_name = account_name;
60 invoice.billing_period = Some(format!("{}-{}", year, month));
61 }
62 }
63 }
64
65 Ok(ParseResult::single(invoice).with_source(path.as_ref().display().to_string()))
66 }
67
68 pub fn parse_pdf_bytes(bytes: &[u8]) -> Result<ParseResult> {
69 let doc = PdfDocument::from_bytes(bytes)?;
70 let invoice = Self::parse_text(&doc.full_text)?;
71
72 Ok(ParseResult::single(invoice))
73 }
74
75 pub fn parse_xlsx<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
76 let sheets = XlsxExtractor::extract_sheets(&path)?;
77
78 let combined_text: String = sheets
79 .iter()
80 .map(|s| s.to_text())
81 .collect::<Vec<_>>()
82 .join("\n");
83
84 let filename = path.as_ref().file_name().and_then(|f| f.to_str());
85 if let Some(fname) = filename {
88 let fname_lower = fname.to_lowercase();
89 if fname_lower.contains("mlytics inv#")
90 || fname_lower.contains("mlytics invoice_#")
91 || fname_lower.contains("mlytics invoice #")
92 || fname_lower.contains("bytes_by_traffic")
93 {
94 let invoice = Invoice { document_format: DocumentFormat::NonInvoice, ..Default::default() };
95 return Ok(ParseResult::single(invoice).with_source(path.as_ref().display().to_string()));
96 }
97 }
98 let format = format_detector::detect_format(&combined_text);
99 if format == DocumentFormat::UCloud {
100 let mut invoice = Self::parse_text(&combined_text)?;
101
102 if let Some(fname) = filename {
103 if let Some(caps) = UCLOUD_FILENAME_PATTERN.captures(fname) {
104 let customer_id = caps.get(1).map(|m| m.as_str().to_string());
105 invoice.customer_id = customer_id.clone();
106 invoice.account_name = customer_id;
107 }
108 }
109
110 return Ok(
111 ParseResult::single(invoice).with_source(path.as_ref().display().to_string())
112 );
113 }
114
115 let mut invoices = Vec::new();
116 for sheet in &sheets {
117 if let Ok(invoice) = Self::parse_sheet(sheet) {
118 invoices.push(invoice);
119 }
120 }
121
122 Self::merge_gcp_usage_discount(&mut invoices);
123
124 if invoices.is_empty() {
125 let invoice = Self::parse_text(&combined_text)?;
126 invoices.push(invoice);
127 }
128
129 Ok(ParseResult::multiple(invoices).with_source(path.as_ref().display().to_string()))
130 }
131
132 pub fn parse_text(text: &str) -> Result<Invoice> {
133 if text.trim().is_empty() {
134 return Err(InvoiceParserError::EmptyDocument);
135 }
136
137 let format = format_detector::detect_format(text);
138 let invoice = Self::parse_by_format(text, format);
139
140 Ok(invoice)
141 }
142
143 fn parse_by_format(text: &str, format: DocumentFormat) -> Invoice {
144 match format {
145 DocumentFormat::AwsDirect => aws_direct::parse(text),
146 DocumentFormat::ECloudValleyAws => ecloudvalley::parse(text),
147 DocumentFormat::MicrofusionAliyun => microfusion::parse(text),
148 DocumentFormat::AliyunDirect => aliyun_direct::parse(text),
149 DocumentFormat::UCloud => ucloud::parse(text),
150 DocumentFormat::Lokalise => lokalise::parse(text),
151 DocumentFormat::Sentry => sentry::parse(text),
152 DocumentFormat::Mux => mux::parse(text),
153 DocumentFormat::MlyticsConsolidated => mlytics_consolidated::parse(text),
154 DocumentFormat::AzureCsp => azure_csp::parse(text),
155 DocumentFormat::AliyunUsageDetail => aliyun_usage_detail::parse(text),
156 DocumentFormat::MicrofusionGcpUsage => microfusion_gcp_usage::parse(text),
157 DocumentFormat::Chargebee => chargebee::parse(text),
158 DocumentFormat::Edgenext => edgenext::parse(text),
159 DocumentFormat::DataStar => datastar::parse(text),
160 DocumentFormat::DigicentreHk => digicentre_hk::parse(text),
161 DocumentFormat::CloudMile => cloudmile::parse(text),
162 DocumentFormat::MetaageAkamai => metaage_akamai::parse(text),
163 DocumentFormat::VnisInvoice => vnis_invoice::parse(text),
164 DocumentFormat::TencentEdgeOne => tencent_edgeone::parse(text),
165 DocumentFormat::AzurePlanDaily => azure_plan_daily::parse(text),
166 DocumentFormat::GoogleWorkspaceBilling => google_workspace_billing::parse(text),
167 DocumentFormat::HubSpot => hubspot::parse(text),
168 DocumentFormat::Reachtop => reachtop::parse(text),
169 DocumentFormat::GenericConsultant => generic_consultant::parse(text),
170 DocumentFormat::CdnOverageDetail => cdn_overage::parse(text),
171 DocumentFormat::Atlassian => atlassian::parse(text),
172 DocumentFormat::Contentsquare => contentsquare::parse(text),
173 DocumentFormat::Slack => slack::parse(text),
174 DocumentFormat::VNetwork => vnetwork::parse(text),
175 DocumentFormat::VnisSummary => vnis_summary::parse(text),
176 DocumentFormat::CdnTraffic => cdn_traffic::parse(text),
177 DocumentFormat::NonInvoice => {
178 let mut invoice = Invoice::new();
179 invoice.document_format = DocumentFormat::NonInvoice;
180 invoice.raw_text = Some(text.to_string());
181 invoice
182 }
183 _ => Self::parse_generic(text, format),
184 }
185 }
186
187 fn parse_generic(text: &str, format: DocumentFormat) -> Invoice {
188 let mut invoice = Invoice::new();
189 invoice.document_format = format;
190 common::fill_common_fields(&mut invoice, text);
191 invoice.line_items = common::extract_reseller_line_items(text);
193 invoice
194 }
195
196 fn parse_sheet(sheet: &SheetData) -> Result<Invoice> {
197 let text = sheet.to_text();
198 Self::parse_text(&text)
199 }
200
201 fn merge_gcp_usage_discount(invoices: &mut Vec<Invoice>) {
202 let discount = invoices
203 .iter()
204 .filter(|i| i.document_format == DocumentFormat::Unknown)
205 .filter_map(|i| {
206 i.raw_text
207 .as_ref()
208 .and_then(|t| microfusion_gcp_usage::parse_discount(t))
209 })
210 .sum::<f64>();
211
212 if discount.abs() < 0.001 {
213 return;
214 }
215
216 if let Some(gcp) = invoices
217 .iter_mut()
218 .find(|i| i.document_format == DocumentFormat::MicrofusionGcpUsage)
219 {
220 gcp.discount_amount = Some(discount);
221 gcp.total_amount = ((gcp.total_amount + discount) * 100.0).round() / 100.0;
222 }
223
224 invoices.retain(|i| {
225 !(i.document_format == DocumentFormat::Unknown
226 && i.raw_text
227 .as_ref()
228 .map(|t| microfusion_gcp_usage::parse_discount(t).is_some())
229 .unwrap_or(false))
230 });
231 }
232}