Skip to main content

invoice_parser/parsers/
invoice.rs

1use crate::error::{InvoiceParserError, Result};
2use crate::extractors::{PdfDocument, SheetData, XlsxExtractor};
3use crate::models::{DocumentFormat, Invoice, ParseResult};
4use crate::parsers::{
5    aliyun_direct, aliyun_usage_detail, atlassian, aws_direct, azure_csp, azure_plan_daily,
6    cdn_overage, cdn_traffic, chargebee, cloudmile, common, contentsquare, datastar, digicentre_hk,
7    ecloudvalley, edgenext, format_detector, generic_consultant, google_workspace_billing, hubspot,
8    lokalise, metaage_akamai, microfusion, microfusion_gcp_usage, mlytics_consolidated, mux,
9    reachtop, sentry, slack, tencent_edgeone, ucloud, vnetwork, vnis_invoice, vnis_summary,
10};
11use lazy_static::lazy_static;
12use regex::Regex;
13use std::path::Path;
14
15lazy_static! {
16    static ref UCLOUD_FILENAME_PATTERN: Regex = Regex::new(r"^(\d+)_").unwrap();
17    // "HPK_Azure_202511.pdf" → billing_period=2025-11, account_name=HPK
18    static ref AZURE_CSP_FILENAME_PATTERN: Regex =
19        Regex::new(r"^(.+?)_Azure_(\d{4})(\d{2})").unwrap();
20}
21
/// Stateless entry point for invoice parsing.
///
/// The type carries no data; all functionality is exposed through associated
/// functions such as `parse_file`, `parse_pdf`, `parse_xlsx` and `parse_text`.
pub struct InvoiceParser;
23
24impl InvoiceParser {
25    pub fn parse_file<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
26        let path = path.as_ref();
27        let extension = path
28            .extension()
29            .and_then(|e| e.to_str())
30            .map(|e| e.to_lowercase());
31
32        match extension.as_deref() {
33            Some("pdf") => Self::parse_pdf(path),
34            Some("xlsx") | Some("xls") | Some("xlsm") => Self::parse_xlsx(path),
35            Some(ext) => Err(InvoiceParserError::UnsupportedFileType(ext.to_string())),
36            None => Err(InvoiceParserError::UnsupportedFileType(
37                "unknown".to_string(),
38            )),
39        }
40    }
41
42    pub fn parse_pdf<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
43        let doc = PdfDocument::from_file(&path)?;
44        let mut invoice = Self::parse_text(&doc.full_text)?;
45
46        if invoice.document_format == DocumentFormat::AzureCsp {
47            if let Some(filename) = path.as_ref().file_name().and_then(|f| f.to_str()) {
48                if let Some(caps) = AZURE_CSP_FILENAME_PATTERN.captures(filename) {
49                    let account_name = caps.get(1).map(|m| m.as_str().to_string());
50                    let year = caps.get(2).map(|m| m.as_str()).unwrap_or("");
51                    let month = caps.get(3).map(|m| m.as_str()).unwrap_or("");
52                    invoice.account_name = account_name;
53                    invoice.billing_period = Some(format!("{}-{}", year, month));
54                }
55            }
56        }
57
58        Ok(ParseResult::single(invoice).with_source(path.as_ref().display().to_string()))
59    }
60
61    pub fn parse_pdf_bytes(bytes: &[u8]) -> Result<ParseResult> {
62        let doc = PdfDocument::from_bytes(bytes)?;
63        let invoice = Self::parse_text(&doc.full_text)?;
64
65        Ok(ParseResult::single(invoice))
66    }
67
68    pub fn parse_xlsx<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
69        let sheets = XlsxExtractor::extract_sheets(&path)?;
70
71        let combined_text: String = sheets
72            .iter()
73            .map(|s| s.to_text())
74            .collect::<Vec<_>>()
75            .join("\n");
76
77        let format = format_detector::detect_format(&combined_text);
78
79        if format == DocumentFormat::UCloud {
80            let mut invoice = Self::parse_text(&combined_text)?;
81
82            if let Some(filename) = path.as_ref().file_name().and_then(|f| f.to_str()) {
83                if let Some(caps) = UCLOUD_FILENAME_PATTERN.captures(filename) {
84                    let customer_id = caps.get(1).map(|m| m.as_str().to_string());
85                    invoice.customer_id = customer_id.clone();
86                    invoice.account_name = customer_id;
87                }
88            }
89
90            return Ok(
91                ParseResult::single(invoice).with_source(path.as_ref().display().to_string())
92            );
93        }
94
95        let mut invoices = Vec::new();
96        for sheet in &sheets {
97            if let Ok(invoice) = Self::parse_sheet(sheet) {
98                invoices.push(invoice);
99            }
100        }
101
102        Self::merge_gcp_usage_discount(&mut invoices);
103
104        if invoices.is_empty() {
105            let invoice = Self::parse_text(&combined_text)?;
106            invoices.push(invoice);
107        }
108
109        Ok(ParseResult::multiple(invoices).with_source(path.as_ref().display().to_string()))
110    }
111
112    pub fn parse_text(text: &str) -> Result<Invoice> {
113        if text.trim().is_empty() {
114            return Err(InvoiceParserError::EmptyDocument);
115        }
116
117        let format = format_detector::detect_format(text);
118        let invoice = Self::parse_by_format(text, format);
119
120        Ok(invoice)
121    }
122
123    fn parse_by_format(text: &str, format: DocumentFormat) -> Invoice {
124        match format {
125            DocumentFormat::AwsDirect => aws_direct::parse(text),
126            DocumentFormat::ECloudValleyAws => ecloudvalley::parse(text),
127            DocumentFormat::MicrofusionAliyun => microfusion::parse(text),
128            DocumentFormat::AliyunDirect => aliyun_direct::parse(text),
129            DocumentFormat::UCloud => ucloud::parse(text),
130            DocumentFormat::Lokalise => lokalise::parse(text),
131            DocumentFormat::Sentry => sentry::parse(text),
132            DocumentFormat::Mux => mux::parse(text),
133            DocumentFormat::MlyticsConsolidated => mlytics_consolidated::parse(text),
134            DocumentFormat::AzureCsp => azure_csp::parse(text),
135            DocumentFormat::AliyunUsageDetail => aliyun_usage_detail::parse(text),
136            DocumentFormat::MicrofusionGcpUsage => microfusion_gcp_usage::parse(text),
137            DocumentFormat::Chargebee => chargebee::parse(text),
138            DocumentFormat::Edgenext => edgenext::parse(text),
139            DocumentFormat::DataStar => datastar::parse(text),
140            DocumentFormat::DigicentreHk => digicentre_hk::parse(text),
141            DocumentFormat::CloudMile => cloudmile::parse(text),
142            DocumentFormat::MetaageAkamai => metaage_akamai::parse(text),
143            DocumentFormat::VnisInvoice => vnis_invoice::parse(text),
144            DocumentFormat::TencentEdgeOne => tencent_edgeone::parse(text),
145            DocumentFormat::AzurePlanDaily => azure_plan_daily::parse(text),
146            DocumentFormat::GoogleWorkspaceBilling => google_workspace_billing::parse(text),
147            DocumentFormat::HubSpot => hubspot::parse(text),
148            DocumentFormat::Reachtop => reachtop::parse(text),
149            DocumentFormat::GenericConsultant => generic_consultant::parse(text),
150            DocumentFormat::CdnOverageDetail => cdn_overage::parse(text),
151            DocumentFormat::Atlassian => atlassian::parse(text),
152            DocumentFormat::Contentsquare => contentsquare::parse(text),
153            DocumentFormat::Slack => slack::parse(text),
154            DocumentFormat::VNetwork => vnetwork::parse(text),
155            DocumentFormat::VnisSummary => vnis_summary::parse(text),
156            DocumentFormat::CdnTraffic => cdn_traffic::parse(text),
157            DocumentFormat::NonInvoice => {
158                let mut invoice = Invoice::new();
159                invoice.document_format = DocumentFormat::NonInvoice;
160                invoice.raw_text = Some(text.to_string());
161                invoice
162            }
163            _ => Self::parse_generic(text, format),
164        }
165    }
166
167    fn parse_generic(text: &str, format: DocumentFormat) -> Invoice {
168        let mut invoice = Invoice::new();
169        invoice.document_format = format;
170        common::fill_common_fields(&mut invoice, text);
171        invoice
172    }
173
174    fn parse_sheet(sheet: &SheetData) -> Result<Invoice> {
175        let text = sheet.to_text();
176        Self::parse_text(&text)
177    }
178
179    fn merge_gcp_usage_discount(invoices: &mut Vec<Invoice>) {
180        let discount = invoices
181            .iter()
182            .filter(|i| i.document_format == DocumentFormat::Unknown)
183            .filter_map(|i| {
184                i.raw_text
185                    .as_ref()
186                    .and_then(|t| microfusion_gcp_usage::parse_discount(t))
187            })
188            .sum::<f64>();
189
190        if discount.abs() < 0.001 {
191            return;
192        }
193
194        if let Some(gcp) = invoices
195            .iter_mut()
196            .find(|i| i.document_format == DocumentFormat::MicrofusionGcpUsage)
197        {
198            gcp.discount_amount = Some(discount);
199            gcp.total_amount = ((gcp.total_amount + discount) * 100.0).round() / 100.0;
200        }
201
202        invoices.retain(|i| {
203            !(i.document_format == DocumentFormat::Unknown
204                && i.raw_text
205                    .as_ref()
206                    .map(|t| microfusion_gcp_usage::parse_discount(t).is_some())
207                    .unwrap_or(false))
208        });
209    }
210}