Skip to main content

invoice_parser/extractors/
pdf.rs

1use crate::error::{InvoiceParserError, Result};
2use std::path::Path;
3
/// Stateless entry point for pulling plain text out of PDF input.
///
/// The type carries no data; all functionality is exposed through associated
/// functions, so it is trivially copyable and default-constructible.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct PdfExtractor;
5
6impl PdfExtractor {
7    pub fn extract_text<P: AsRef<Path>>(path: P) -> Result<String> {
8        let path = path.as_ref();
9
10        if !path.exists() {
11            return Err(InvoiceParserError::FileReadError(std::io::Error::new(
12                std::io::ErrorKind::NotFound,
13                format!("File not found: {}", path.display()),
14            )));
15        }
16
17        let extension = path
18            .extension()
19            .and_then(|e| e.to_str())
20            .map(|e| e.to_lowercase());
21
22        if extension.as_deref() != Some("pdf") {
23            return Err(InvoiceParserError::InvalidFileFormat {
24                expected: "pdf".to_string(),
25                actual: extension.unwrap_or_else(|| "unknown".to_string()),
26            });
27        }
28
29        pdf_extract::extract_text(path)
30            .map_err(|e| InvoiceParserError::PdfExtractionError(e.to_string()))
31    }
32
33    pub fn extract_text_from_bytes(bytes: &[u8]) -> Result<String> {
34        pdf_extract::extract_text_from_mem(bytes)
35            .map_err(|e| InvoiceParserError::PdfExtractionError(e.to_string()))
36    }
37}
38
/// Text belonging to a single page of a PDF document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExtractedPage {
    /// Page index as assigned by the code that built this value; the
    /// `PdfDocument` constructors in this file number pages starting at 1.
    pub page_number: usize,
    /// Text of the page, stored exactly as it was extracted (not trimmed).
    pub text: String,
}

/// Extracted text of a PDF, available both per page and as one string.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PdfDocument {
    /// Per-page text entries.
    pub pages: Vec<ExtractedPage>,
    /// The complete extracted text the pages were derived from.
    pub full_text: String,
}
48
49impl PdfDocument {
50    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
51        let full_text = PdfExtractor::extract_text(path)?;
52        let pages = Self::split_into_pages(&full_text);
53
54        Ok(Self { pages, full_text })
55    }
56
57    pub fn from_bytes(bytes: &[u8]) -> Result<Self> {
58        let full_text = PdfExtractor::extract_text_from_bytes(bytes)?;
59        let pages = Self::split_into_pages(&full_text);
60
61        Ok(Self { pages, full_text })
62    }
63
64    fn split_into_pages(text: &str) -> Vec<ExtractedPage> {
65        text.split('\x0C')
66            .enumerate()
67            .map(|(i, page_text)| ExtractedPage {
68                page_number: i + 1,
69                text: page_text.to_string(),
70            })
71            .filter(|p| !p.text.trim().is_empty())
72            .collect()
73    }
74
75    pub fn page_count(&self) -> usize {
76        self.pages.len()
77    }
78}