Skip to main content

invoice_parser/parsers/
invoice.rs

1use crate::error::{InvoiceParserError, Result};
2use crate::extractors::{PdfDocument, SheetData, XlsxExtractor};
3use crate::models::{DocumentFormat, Invoice, ParseResult};
4use crate::parsers::{
5    aliyun_direct, aliyun_usage_detail, atlassian, aws_direct, azure_csp, azure_plan_daily,
6    cdn_overage, cdn_traffic, chargebee, cloudmile, common, contentsquare, datastar, digicentre_hk,
7    ecloudvalley, edgenext, format_detector, generic_consultant, google_workspace_billing, hubspot,
8    lokalise, metaage_akamai, microfusion, microfusion_gcp_usage, mlytics_consolidated, mux,
9    reachtop, sentry, slack, tencent_edgeone, ucloud, vnetwork, vnis_invoice, vnis_summary,
10};
11use lazy_static::lazy_static;
12use regex::Regex;
13use std::path::Path;
14
15lazy_static! {
16    static ref UCLOUD_FILENAME_PATTERN: Regex = Regex::new(r"^(\d+)_").unwrap();
17    // "HPK_Azure_202511.pdf" → billing_period=2025-11, account_name=HPK
18    static ref AZURE_CSP_FILENAME_PATTERN: Regex =
19        Regex::new(r"^(.+?)_Azure_(\d{4})(\d{2})").unwrap();
20}
21
/// Stateless entry point for invoice parsing.
///
/// This is a unit struct: it carries no data and exists only as a namespace for
/// its associated functions.
pub struct InvoiceParser;
23
24impl InvoiceParser {
25    pub fn parse_file<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
26        let path = path.as_ref();
27        let extension = path
28            .extension()
29            .and_then(|e| e.to_str())
30            .map(|e| e.to_lowercase());
31
32        match extension.as_deref() {
33            Some("pdf") => Self::parse_pdf(path),
34            Some("xlsx") | Some("xls") | Some("xlsm") => Self::parse_xlsx(path),
35            Some(ext) => Err(InvoiceParserError::UnsupportedFileType(ext.to_string())),
36            None => Err(InvoiceParserError::UnsupportedFileType(
37                "unknown".to_string(),
38            )),
39        }
40    }
41
42    pub fn parse_pdf<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
43        let doc = PdfDocument::from_file(&path)?;
44        let text = &doc.full_text;
45        if text.trim().is_empty() {
46            return Err(InvoiceParserError::EmptyDocument);
47        }
48
49        let filename = path.as_ref().file_name().and_then(|f| f.to_str());
50        let format = format_detector::detect_format_with_filename(text, filename);
51        let mut invoice = Self::parse_by_format(text, format);
52
53        if invoice.document_format == DocumentFormat::AzureCsp {
54            if let Some(fname) = filename {
55                if let Some(caps) = AZURE_CSP_FILENAME_PATTERN.captures(fname) {
56                    let account_name = caps.get(1).map(|m| m.as_str().to_string());
57                    let year = caps.get(2).map(|m| m.as_str()).unwrap_or("");
58                    let month = caps.get(3).map(|m| m.as_str()).unwrap_or("");
59                    invoice.account_name = account_name;
60                    invoice.billing_period = Some(format!("{}-{}", year, month));
61                }
62            }
63        }
64
65        Ok(ParseResult::single(invoice).with_source(path.as_ref().display().to_string()))
66    }
67
68    pub fn parse_pdf_bytes(bytes: &[u8]) -> Result<ParseResult> {
69        let doc = PdfDocument::from_bytes(bytes)?;
70        let invoice = Self::parse_text(&doc.full_text)?;
71
72        Ok(ParseResult::single(invoice))
73    }
74
75    pub fn parse_xlsx<P: AsRef<Path>>(path: P) -> Result<ParseResult> {
76        let sheets = XlsxExtractor::extract_sheets(&path)?;
77
78        let combined_text: String = sheets
79            .iter()
80            .map(|s| s.to_text())
81            .collect::<Vec<_>>()
82            .join("\n");
83
84        let filename = path.as_ref().file_name().and_then(|f| f.to_str());
85        // Filename-only NonInvoice check (e.g. "mlytics inv#", "bytes_by_traffic")
86        // Text-based NonInvoice is handled per-sheet via parse_text → detect_format
87        if let Some(fname) = filename {
88            let fname_lower = fname.to_lowercase();
89            if fname_lower.contains("mlytics inv#")
90                || fname_lower.contains("mlytics invoice_#")
91                || fname_lower.contains("mlytics invoice #")
92                || fname_lower.contains("bytes_by_traffic")
93            {
94                let invoice = Invoice { document_format: DocumentFormat::NonInvoice, ..Default::default() };
95                return Ok(ParseResult::single(invoice).with_source(path.as_ref().display().to_string()));
96            }
97        }
98        let format = format_detector::detect_format(&combined_text);
99        if format == DocumentFormat::UCloud {
100            let mut invoice = Self::parse_text(&combined_text)?;
101
102            if let Some(fname) = filename {
103                if let Some(caps) = UCLOUD_FILENAME_PATTERN.captures(fname) {
104                    let customer_id = caps.get(1).map(|m| m.as_str().to_string());
105                    invoice.customer_id = customer_id.clone();
106                    invoice.account_name = customer_id;
107                }
108            }
109
110            return Ok(
111                ParseResult::single(invoice).with_source(path.as_ref().display().to_string())
112            );
113        }
114
115        let mut invoices = Vec::new();
116        for sheet in &sheets {
117            if let Ok(invoice) = Self::parse_sheet(sheet) {
118                invoices.push(invoice);
119            }
120        }
121
122        Self::merge_gcp_usage_discount(&mut invoices);
123
124        if invoices.is_empty() {
125            let invoice = Self::parse_text(&combined_text)?;
126            invoices.push(invoice);
127        }
128
129        Ok(ParseResult::multiple(invoices).with_source(path.as_ref().display().to_string()))
130    }
131
132    pub fn parse_text(text: &str) -> Result<Invoice> {
133        if text.trim().is_empty() {
134            return Err(InvoiceParserError::EmptyDocument);
135        }
136
137        let format = format_detector::detect_format(text);
138        let invoice = Self::parse_by_format(text, format);
139
140        Ok(invoice)
141    }
142
143    fn parse_by_format(text: &str, format: DocumentFormat) -> Invoice {
144        match format {
145            DocumentFormat::AwsDirect => aws_direct::parse(text),
146            DocumentFormat::ECloudValleyAws => ecloudvalley::parse(text),
147            DocumentFormat::MicrofusionAliyun => microfusion::parse(text),
148            DocumentFormat::AliyunDirect => aliyun_direct::parse(text),
149            DocumentFormat::UCloud => ucloud::parse(text),
150            DocumentFormat::Lokalise => lokalise::parse(text),
151            DocumentFormat::Sentry => sentry::parse(text),
152            DocumentFormat::Mux => mux::parse(text),
153            DocumentFormat::MlyticsConsolidated => mlytics_consolidated::parse(text),
154            DocumentFormat::AzureCsp => azure_csp::parse(text),
155            DocumentFormat::AliyunUsageDetail => aliyun_usage_detail::parse(text),
156            DocumentFormat::MicrofusionGcpUsage => microfusion_gcp_usage::parse(text),
157            DocumentFormat::Chargebee => chargebee::parse(text),
158            DocumentFormat::Edgenext => edgenext::parse(text),
159            DocumentFormat::DataStar => datastar::parse(text),
160            DocumentFormat::DigicentreHk => digicentre_hk::parse(text),
161            DocumentFormat::CloudMile => cloudmile::parse(text),
162            DocumentFormat::MetaageAkamai => metaage_akamai::parse(text),
163            DocumentFormat::VnisInvoice => vnis_invoice::parse(text),
164            DocumentFormat::TencentEdgeOne => tencent_edgeone::parse(text),
165            DocumentFormat::AzurePlanDaily => azure_plan_daily::parse(text),
166            DocumentFormat::GoogleWorkspaceBilling => google_workspace_billing::parse(text),
167            DocumentFormat::HubSpot => hubspot::parse(text),
168            DocumentFormat::Reachtop => reachtop::parse(text),
169            DocumentFormat::GenericConsultant => generic_consultant::parse(text),
170            DocumentFormat::CdnOverageDetail => cdn_overage::parse(text),
171            DocumentFormat::Atlassian => atlassian::parse(text),
172            DocumentFormat::Contentsquare => contentsquare::parse(text),
173            DocumentFormat::Slack => slack::parse(text),
174            DocumentFormat::VNetwork => vnetwork::parse(text),
175            DocumentFormat::VnisSummary => vnis_summary::parse(text),
176            DocumentFormat::CdnTraffic => cdn_traffic::parse(text),
177            DocumentFormat::NonInvoice => {
178                let mut invoice = Invoice::new();
179                invoice.document_format = DocumentFormat::NonInvoice;
180                invoice.raw_text = Some(text.to_string());
181                invoice
182            }
183            _ => Self::parse_generic(text, format),
184        }
185    }
186
187    fn parse_generic(text: &str, format: DocumentFormat) -> Invoice {
188        let mut invoice = Invoice::new();
189        invoice.document_format = format;
190        common::fill_common_fields(&mut invoice, text);
191        // Try reseller line item extraction as fallback
192        invoice.line_items = common::extract_reseller_line_items(text);
193        invoice
194    }
195
196    fn parse_sheet(sheet: &SheetData) -> Result<Invoice> {
197        let text = sheet.to_text();
198        Self::parse_text(&text)
199    }
200
201    fn merge_gcp_usage_discount(invoices: &mut Vec<Invoice>) {
202        let discount = invoices
203            .iter()
204            .filter(|i| i.document_format == DocumentFormat::Unknown)
205            .filter_map(|i| {
206                i.raw_text
207                    .as_ref()
208                    .and_then(|t| microfusion_gcp_usage::parse_discount(t))
209            })
210            .sum::<f64>();
211
212        if discount.abs() < 0.001 {
213            return;
214        }
215
216        if let Some(gcp) = invoices
217            .iter_mut()
218            .find(|i| i.document_format == DocumentFormat::MicrofusionGcpUsage)
219        {
220            gcp.discount_amount = Some(discount);
221            gcp.total_amount = ((gcp.total_amount + discount) * 100.0).round() / 100.0;
222        }
223
224        invoices.retain(|i| {
225            !(i.document_format == DocumentFormat::Unknown
226                && i.raw_text
227                    .as_ref()
228                    .map(|t| microfusion_gcp_usage::parse_discount(t).is_some())
229                    .unwrap_or(false))
230        });
231    }
232}