// invoice_parser/extractors/pdf.rs
use crate::error::{InvoiceParserError, Result};
use std::path::Path;
3
/// Stateless entry point for pulling plain text out of PDF input.
///
/// The type carries no data; its associated functions are called directly on
/// the type (for example `PdfExtractor::extract_text(path)`).
#[derive(Debug, Clone, Copy, Default)]
pub struct PdfExtractor;
5
6impl PdfExtractor {
7 pub fn extract_text<P: AsRef<Path>>(path: P) -> Result<String> {
8 let path = path.as_ref();
9
10 if !path.exists() {
11 return Err(InvoiceParserError::FileReadError(std::io::Error::new(
12 std::io::ErrorKind::NotFound,
13 format!("File not found: {}", path.display()),
14 )));
15 }
16
17 let extension = path
18 .extension()
19 .and_then(|e| e.to_str())
20 .map(|e| e.to_lowercase());
21
22 if extension.as_deref() != Some("pdf") {
23 return Err(InvoiceParserError::InvalidFileFormat {
24 expected: "pdf".to_string(),
25 actual: extension.unwrap_or_else(|| "unknown".to_string()),
26 });
27 }
28
29 pdf_extract::extract_text(path)
30 .map_err(|e| InvoiceParserError::PdfExtractionError(e.to_string()))
31 }
32
33 pub fn extract_text_from_bytes(bytes: &[u8]) -> Result<String> {
34 pdf_extract::extract_text_from_mem(bytes)
35 .map_err(|e| InvoiceParserError::PdfExtractionError(e.to_string()))
36 }
37}
38
/// Text belonging to a single page of an extracted document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExtractedPage {
    /// Page index; `PdfDocument`'s constructors number pages starting at 1.
    pub page_number: usize,
    /// Text of this page.
    pub text: String,
}

/// Extracted text of a document, kept both whole and broken into pages.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PdfDocument {
    /// Per-page text. When built through `from_file` / `from_bytes`, pages that
    /// contain only whitespace are left out.
    pub pages: Vec<ExtractedPage>,
    /// The complete extracted text, exactly as returned by the extractor.
    pub full_text: String,
}
48
49impl PdfDocument {
50 pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
51 let full_text = PdfExtractor::extract_text(path)?;
52 let pages = Self::split_into_pages(&full_text);
53
54 Ok(Self { pages, full_text })
55 }
56
57 pub fn from_bytes(bytes: &[u8]) -> Result<Self> {
58 let full_text = PdfExtractor::extract_text_from_bytes(bytes)?;
59 let pages = Self::split_into_pages(&full_text);
60
61 Ok(Self { pages, full_text })
62 }
63
64 fn split_into_pages(text: &str) -> Vec<ExtractedPage> {
65 text.split('\x0C')
66 .enumerate()
67 .map(|(i, page_text)| ExtractedPage {
68 page_number: i + 1,
69 text: page_text.to_string(),
70 })
71 .filter(|p| !p.text.trim().is_empty())
72 .collect()
73 }
74
75 pub fn page_count(&self) -> usize {
76 self.pages.len()
77 }
78}