invoice_parser/extractors/
pdf.rs1use crate::error::{InvoiceParserError, Result};
2use std::path::Path;
3
/// Stateless entry point for extracting plain text from PDF input.
///
/// The type carries no data; all functionality is exposed through associated
/// functions, so it never needs to be instantiated.
#[derive(Debug, Clone, Copy, Default)]
pub struct PdfExtractor;
5
6impl PdfExtractor {
7 pub fn extract_text<P: AsRef<Path>>(path: P) -> Result<String> {
8 let path = path.as_ref();
9
10 if !path.exists() {
11 return Err(InvoiceParserError::FileReadError(std::io::Error::new(
12 std::io::ErrorKind::NotFound,
13 format!("File not found: {}", path.display()),
14 )));
15 }
16
17 let extension = path
18 .extension()
19 .and_then(|e| e.to_str())
20 .map(|e| e.to_lowercase());
21
22 if extension.as_deref() != Some("pdf") {
23 return Err(InvoiceParserError::InvalidFileFormat {
24 expected: "pdf".to_string(),
25 actual: extension.unwrap_or_else(|| "unknown".to_string()),
26 });
27 }
28
29 match std::panic::catch_unwind(|| pdf_extract::extract_text(path)) {
30 Ok(Ok(text)) => Ok(text),
31 Ok(Err(e)) => Err(InvoiceParserError::PdfExtractionError(e.to_string())),
32 Err(e) => {
33 let msg = if let Some(s) = e.downcast_ref::<String>() {
34 s.clone()
35 } else if let Some(s) = e.downcast_ref::<&str>() {
36 s.to_string()
37 } else {
38 "unknown panic in pdf_extract".to_string()
39 };
40 Err(InvoiceParserError::PdfExtractionError(msg))
41 }
42 }
43 }
44
45 pub fn extract_text_from_bytes(bytes: &[u8]) -> Result<String> {
46 match std::panic::catch_unwind(|| pdf_extract::extract_text_from_mem(bytes)) {
47 Ok(Ok(text)) => Ok(text),
48 Ok(Err(e)) => Err(InvoiceParserError::PdfExtractionError(e.to_string())),
49 Err(e) => {
50 let msg = if let Some(s) = e.downcast_ref::<String>() {
51 s.clone()
52 } else if let Some(s) = e.downcast_ref::<&str>() {
53 s.to_string()
54 } else {
55 "unknown panic in pdf_extract".to_string()
56 };
57 Err(InvoiceParserError::PdfExtractionError(msg))
58 }
59 }
60 }
61}
62
/// Text belonging to a single page of an extracted PDF.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExtractedPage {
    /// 1-based position of the page within the extracted text.
    pub page_number: usize,
    /// Raw text of the page, exactly as it appeared between page separators.
    pub text: String,
}

/// Extracted text of a PDF, available both per page and as a whole.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PdfDocument {
    /// Pages that contain at least one non-whitespace character.
    pub pages: Vec<ExtractedPage>,
    /// Complete extracted text, including any page separator characters.
    pub full_text: String,
}
72
73impl PdfDocument {
74 pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
75 let full_text = PdfExtractor::extract_text(path)?;
76 let pages = Self::split_into_pages(&full_text);
77
78 Ok(Self { pages, full_text })
79 }
80
81 pub fn from_bytes(bytes: &[u8]) -> Result<Self> {
82 let full_text = PdfExtractor::extract_text_from_bytes(bytes)?;
83 let pages = Self::split_into_pages(&full_text);
84
85 Ok(Self { pages, full_text })
86 }
87
88 fn split_into_pages(text: &str) -> Vec<ExtractedPage> {
89 text.split('\x0C')
90 .enumerate()
91 .map(|(i, page_text)| ExtractedPage {
92 page_number: i + 1,
93 text: page_text.to_string(),
94 })
95 .filter(|p| !p.text.trim().is_empty())
96 .collect()
97 }
98
99 pub fn page_count(&self) -> usize {
100 self.pages.len()
101 }
102}