Skip to main content

invoice_parser/extractors/
pdf.rs

1use crate::error::{InvoiceParserError, Result};
2use std::path::Path;
3
4pub struct PdfExtractor;
5
6impl PdfExtractor {
7    pub fn extract_text<P: AsRef<Path>>(path: P) -> Result<String> {
8        let path = path.as_ref();
9
10        if !path.exists() {
11            return Err(InvoiceParserError::FileReadError(std::io::Error::new(
12                std::io::ErrorKind::NotFound,
13                format!("File not found: {}", path.display()),
14            )));
15        }
16
17        let extension = path
18            .extension()
19            .and_then(|e| e.to_str())
20            .map(|e| e.to_lowercase());
21
22        if extension.as_deref() != Some("pdf") {
23            return Err(InvoiceParserError::InvalidFileFormat {
24                expected: "pdf".to_string(),
25                actual: extension.unwrap_or_else(|| "unknown".to_string()),
26            });
27        }
28
29        match std::panic::catch_unwind(|| pdf_extract::extract_text(path)) {
30            Ok(Ok(text)) => Ok(text),
31            Ok(Err(e)) => Err(InvoiceParserError::PdfExtractionError(e.to_string())),
32            Err(e) => {
33                let msg = if let Some(s) = e.downcast_ref::<String>() {
34                    s.clone()
35                } else if let Some(s) = e.downcast_ref::<&str>() {
36                    s.to_string()
37                } else {
38                    "unknown panic in pdf_extract".to_string()
39                };
40                Err(InvoiceParserError::PdfExtractionError(msg))
41            }
42        }
43    }
44
45    pub fn extract_text_from_bytes(bytes: &[u8]) -> Result<String> {
46        match std::panic::catch_unwind(|| pdf_extract::extract_text_from_mem(bytes)) {
47            Ok(Ok(text)) => Ok(text),
48            Ok(Err(e)) => Err(InvoiceParserError::PdfExtractionError(e.to_string())),
49            Err(e) => {
50                let msg = if let Some(s) = e.downcast_ref::<String>() {
51                    s.clone()
52                } else if let Some(s) = e.downcast_ref::<&str>() {
53                    s.to_string()
54                } else {
55                    "unknown panic in pdf_extract".to_string()
56                };
57                Err(InvoiceParserError::PdfExtractionError(msg))
58            }
59        }
60    }
61}
62
/// The text of a single page of a PDF.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExtractedPage {
    /// Position of the page within the document it was taken from.
    pub page_number: usize,
    /// Text content of the page.
    pub text: String,
}

/// A PDF represented both as one text blob and as a list of pages.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PdfDocument {
    /// Per-page text.
    pub pages: Vec<ExtractedPage>,
    /// The complete extracted text of the document.
    pub full_text: String,
}
72
73impl PdfDocument {
74    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
75        let full_text = PdfExtractor::extract_text(path)?;
76        let pages = Self::split_into_pages(&full_text);
77
78        Ok(Self { pages, full_text })
79    }
80
81    pub fn from_bytes(bytes: &[u8]) -> Result<Self> {
82        let full_text = PdfExtractor::extract_text_from_bytes(bytes)?;
83        let pages = Self::split_into_pages(&full_text);
84
85        Ok(Self { pages, full_text })
86    }
87
88    fn split_into_pages(text: &str) -> Vec<ExtractedPage> {
89        text.split('\x0C')
90            .enumerate()
91            .map(|(i, page_text)| ExtractedPage {
92                page_number: i + 1,
93                text: page_text.to_string(),
94            })
95            .filter(|p| !p.text.trim().is_empty())
96            .collect()
97    }
98
99    pub fn page_count(&self) -> usize {
100        self.pages.len()
101    }
102}