use crate::error::{InvoiceParserError, Result};
use std::path::Path;
/// Stateless entry point for pulling plain text out of PDF input.
///
/// The type carries no data; all functionality is exposed through associated
/// functions. The common marker traits are derived so the type can be logged,
/// copied, and default-constructed like any other public value.
#[derive(Debug, Clone, Copy, Default)]
pub struct PdfExtractor;
impl PdfExtractor {
    /// Extracts the text of the PDF file located at `path`.
    ///
    /// The path is validated before the PDF backend is invoked: it must exist
    /// and its extension must be `pdf` (compared case-insensitively).
    ///
    /// # Errors
    ///
    /// * [`InvoiceParserError::FileReadError`] wrapping a `NotFound` I/O error
    ///   when `path` does not exist.
    /// * [`InvoiceParserError::InvalidFileFormat`] when the extension is not
    ///   `pdf`; `actual` holds the lowercased extension, or `"unknown"` when
    ///   the extension is missing or is not valid UTF-8.
    /// * [`InvoiceParserError::PdfExtractionError`] carrying the backend's
    ///   error message when `pdf_extract::extract_text` fails.
    pub fn extract_text<P: AsRef<Path>>(path: P) -> Result<String> {
        let pdf_path = path.as_ref();

        if !pdf_path.exists() {
            let not_found = std::io::Error::new(
                std::io::ErrorKind::NotFound,
                format!("File not found: {}", pdf_path.display()),
            );
            return Err(InvoiceParserError::FileReadError(not_found));
        }

        // Lowercase the extension so `.PDF` and `.Pdf` are accepted as well.
        let ext = pdf_path
            .extension()
            .and_then(|raw| raw.to_str())
            .map(str::to_lowercase);

        if !matches!(ext.as_deref(), Some("pdf")) {
            return Err(InvoiceParserError::InvalidFileFormat {
                expected: "pdf".to_string(),
                actual: ext.unwrap_or_else(|| "unknown".to_string()),
            });
        }

        let extracted = pdf_extract::extract_text(pdf_path);
        extracted.map_err(|err| InvoiceParserError::PdfExtractionError(err.to_string()))
    }

    /// Extracts the text of a PDF document held in memory.
    ///
    /// No validation is performed on `bytes` here; they are handed directly to
    /// `pdf_extract::extract_text_from_mem`.
    ///
    /// # Errors
    ///
    /// Returns [`InvoiceParserError::PdfExtractionError`] carrying the
    /// backend's error message when extraction fails.
    pub fn extract_text_from_bytes(bytes: &[u8]) -> Result<String> {
        let extracted = pdf_extract::extract_text_from_mem(bytes);
        extracted.map_err(|err| InvoiceParserError::PdfExtractionError(err.to_string()))
    }
}
/// Text belonging to a single page of an extracted document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExtractedPage {
    /// 1-based position of this page within the extracted text.
    pub page_number: usize,
    /// Raw text of the page, stored without trimming.
    pub text: String,
}

/// Extracted text of a document, available both per page and as a whole.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PdfDocument {
    /// Pages that contain non-whitespace text, in document order.
    pub pages: Vec<ExtractedPage>,
    /// The complete extracted text, exactly as returned by the extractor.
    pub full_text: String,
}
impl PdfDocument {
    /// Reads the PDF at `path` and builds a document from its extracted text.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`PdfExtractor::extract_text`].
    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
        PdfExtractor::extract_text(path).map(Self::from_text)
    }

    /// Builds a document from the raw bytes of an in-memory PDF.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by
    /// [`PdfExtractor::extract_text_from_bytes`].
    pub fn from_bytes(bytes: &[u8]) -> Result<Self> {
        PdfExtractor::extract_text_from_bytes(bytes).map(Self::from_text)
    }

    /// Assembles a document from already-extracted text.
    fn from_text(full_text: String) -> Self {
        Self {
            pages: Self::split_into_pages(&full_text),
            full_text,
        }
    }

    /// Splits `text` into pages on the form-feed character (`'\x0C'`).
    ///
    /// Segments are numbered from 1 in the order they appear. Segments that
    /// contain only whitespace are dropped *after* numbering, so the returned
    /// page numbers may have gaps.
    // NOTE(review): assumes the extractor separates pages with form feeds;
    // confirm against the PDF backend's output.
    fn split_into_pages(text: &str) -> Vec<ExtractedPage> {
        (1..)
            .zip(text.split('\x0C'))
            .filter(|(_, segment)| !segment.trim().is_empty())
            .map(|(page_number, segment)| ExtractedPage {
                page_number,
                text: segment.to_string(),
            })
            .collect()
    }

    /// Returns the number of pages retained in `pages`.
    ///
    /// Whitespace-only pages are not stored, so they are not counted.
    pub fn page_count(&self) -> usize {
        self.pages.len()
    }
}