use paladin_core::platform::container::document::{
Document, DocumentError, DocumentMetadata, Page,
};
use std::path::Path;
/// Stateless extractor that turns PDF input (a file path or raw bytes) into a
/// `Document` made of `Page`s plus `DocumentMetadata`.
///
/// The type carries no fields, so constructing or cloning it is free.
/// It is marked `#[doc(hidden)]`, which keeps it out of the rendered rustdoc.
#[doc(hidden)]
#[derive(Debug, Clone, Default)]
pub struct PdfExtractor;
impl PdfExtractor {
    /// Creates a new extractor. Equivalent to `PdfExtractor::default()`.
    pub fn new() -> Self {
        Self
    }

    /// Reads the PDF at `path` and extracts its text into a [`Document`].
    ///
    /// # Errors
    ///
    /// Returns [`DocumentError::IoError`] when the file cannot be read (the
    /// original `io::ErrorKind` is preserved), or any error produced by
    /// [`Self::extract_bytes`].
    pub fn extract(&self, path: &Path) -> Result<Document, DocumentError> {
        let bytes = std::fs::read(path).map_err(|e| {
            DocumentError::IoError(std::io::Error::new(
                e.kind(),
                format!("Failed to read PDF file: {}", e),
            ))
        })?;
        self.extract_bytes(&bytes)
    }

    /// Extracts text from an in-memory PDF and splits it into pages.
    ///
    /// # Errors
    ///
    /// Extraction failures are classified by inspecting the error message:
    /// * [`DocumentError::EncryptedPdf`] if it mentions "encrypted" or "password",
    /// * [`DocumentError::CorruptedFile`] if it mentions "invalid" or "corrupt",
    /// * [`DocumentError::ExtractionFailed`] otherwise.
    pub fn extract_bytes(&self, bytes: &[u8]) -> Result<Document, DocumentError> {
        let extracted_text = pdf_extract::extract_text_from_mem(bytes)
            .map_err(|e| Self::classify_extraction_error(e.to_string()))?;
        let pages = self.parse_text_into_pages(&extracted_text);
        let metadata = self.extract_metadata_from_bytes(bytes, pages.len())?;
        Ok(Document::new(pages, metadata))
    }

    /// Maps an extraction error message onto the matching [`DocumentError`].
    ///
    /// Matching is case-insensitive: the message wording comes from the PDF
    /// backend and may be capitalized (e.g. a sentence starting with
    /// "Invalid ..."), which a case-sensitive check would miss.
    fn classify_extraction_error(message: String) -> DocumentError {
        let lowered = message.to_lowercase();
        let mentions =
            |needles: &[&str]| needles.iter().any(|needle| lowered.contains(*needle));

        if mentions(&["encrypted", "password"]) {
            DocumentError::EncryptedPdf
        } else if mentions(&["invalid", "corrupt"]) {
            DocumentError::CorruptedFile(message)
        } else {
            DocumentError::ExtractionFailed(message)
        }
    }

    /// Splits extracted text into pages.
    ///
    /// Strategy, in order:
    /// 1. whitespace-only text yields no pages;
    /// 2. if the text contains form feeds (`\x0C`), each segment is a page;
    /// 3. otherwise, if it contains a triple newline, each segment is a page;
    /// 4. otherwise the whole text is page 1.
    ///
    /// Blank segments are dropped but still consume a page number, so the
    /// resulting numbers can have gaps.
    fn parse_text_into_pages(&self, text: &str) -> Vec<Page> {
        if text.trim().is_empty() {
            return Vec::new();
        }
        if text.contains('\x0C') {
            return self.numbered_pages(text.split('\x0C'));
        }
        if text.contains("\n\n\n") {
            return self.numbered_pages(text.split("\n\n\n"));
        }
        vec![Page::new(1, self.preserve_structure(text))]
    }

    /// Builds one page per non-blank segment, numbered by the segment's
    /// 1-based position in the original split.
    fn numbered_pages<'a>(&self, segments: impl Iterator<Item = &'a str>) -> Vec<Page> {
        segments
            .enumerate()
            .filter(|(_, segment)| !segment.trim().is_empty())
            .map(|(idx, segment)| Page::new(idx + 1, self.preserve_structure(segment)))
            .collect()
    }

    /// Normalizes whitespace while keeping paragraph boundaries.
    ///
    /// Every line is trimmed; consecutive non-blank lines are joined with a
    /// single space, and any run of blank lines between two paragraphs becomes
    /// exactly one empty line (`"\n\n"`). Leading and trailing blank lines
    /// produce no output.
    fn preserve_structure(&self, text: &str) -> String {
        let mut result = String::with_capacity(text.len());
        // Set when a blank line follows some content. The break is only
        // written once more content arrives, so trailing blank lines never
        // leave a dangling newline at the end of the result.
        let mut pending_break = false;

        for line in text.lines().map(str::trim) {
            if line.is_empty() {
                pending_break = !result.is_empty();
                continue;
            }
            if pending_break {
                result.push_str("\n\n");
            } else if !result.is_empty() {
                result.push(' ');
            }
            result.push_str(line);
            pending_break = false;
        }
        result
    }

    /// Builds the document metadata.
    ///
    /// Currently only `page_count` is filled in; `_bytes` is not inspected and
    /// `title`, `author` and `creation_date` are always `None`. This function
    /// never returns `Err` at present.
    fn extract_metadata_from_bytes(
        &self,
        _bytes: &[u8],
        page_count: usize,
    ) -> Result<DocumentMetadata, DocumentError> {
        Ok(DocumentMetadata {
            title: None,
            author: None,
            page_count,
            creation_date: None,
        })
    }
}
#[cfg(test)]
mod tests {
    // Unit tests for `PdfExtractor`: construction, error mapping for bad
    // input, page splitting heuristics, whitespace normalization and the
    // metadata stub.
    use super::*;

    #[test]
    fn test_pdf_extractor_creation() {
        let extractor = PdfExtractor::new();
        assert!(matches!(extractor, PdfExtractor));
    }

    #[test]
    fn test_pdf_extractor_default() {
        let extractor = PdfExtractor;
        assert!(matches!(extractor, PdfExtractor));
    }

    #[test]
    fn test_extract_bytes_invalid_pdf() {
        let extractor = PdfExtractor::new();
        let invalid_bytes = b"This is not a valid PDF";
        let result = extractor.extract_bytes(invalid_bytes);
        // Which of the two variants is returned depends on the wording of the
        // backend's error message, so both are accepted.
        assert!(
            matches!(
                result,
                Err(DocumentError::ExtractionFailed(_) | DocumentError::CorruptedFile(_))
            ),
            "Expected ExtractionFailed or CorruptedFile error"
        );
    }

    #[test]
    fn test_extract_nonexistent_file() {
        let extractor = PdfExtractor::new();
        let path = Path::new("/nonexistent/file.pdf");
        let result = extractor.extract(path);
        assert!(
            matches!(result, Err(DocumentError::IoError(_))),
            "Expected IoError for nonexistent file"
        );
    }

    #[test]
    fn test_parse_text_into_pages_single_page() {
        let extractor = PdfExtractor::new();
        let text = "This is page one content.";
        let pages = extractor.parse_text_into_pages(text);
        assert_eq!(pages.len(), 1);
        assert_eq!(pages[0].number, 1);
        assert!(pages[0].content.contains("page one"));
    }

    #[test]
    fn test_parse_text_into_pages_empty_text() {
        let extractor = PdfExtractor::new();
        let text = "";
        let pages = extractor.parse_text_into_pages(text);
        assert_eq!(pages.len(), 0);
    }

    #[test]
    fn test_parse_text_into_pages_with_form_feed() {
        let extractor = PdfExtractor::new();
        let text = "Page one content.\x0CPage two content.\x0CPage three content.";
        let pages = extractor.parse_text_into_pages(text);
        assert_eq!(pages.len(), 3);
        assert_eq!(pages[0].number, 1);
        assert!(pages[0].content.contains("Page one"));
        assert_eq!(pages[1].number, 2);
        assert!(pages[1].content.contains("Page two"));
        assert_eq!(pages[2].number, 3);
        assert!(pages[2].content.contains("Page three"));
    }

    #[test]
    fn test_parse_text_into_pages_with_triple_newlines() {
        let extractor = PdfExtractor::new();
        let text = "Section one content.\n\n\nSection two content.\n\n\nSection three content.";
        let pages = extractor.parse_text_into_pages(text);
        // Two triple-newline separators split the text into three sections.
        assert_eq!(pages.len(), 3);
        assert_eq!(pages[0].number, 1);
        assert!(pages[0].content.contains("Section one"));
        assert_eq!(pages[1].number, 2);
        assert!(pages[1].content.contains("Section two"));
        assert_eq!(pages[2].number, 3);
        assert!(pages[2].content.contains("Section three"));
    }

    #[test]
    fn test_preserve_structure_basic() {
        let extractor = PdfExtractor::new();
        let text = "Line one\nLine two\n\nNew paragraph\nContinuation";
        let result = extractor.preserve_structure(text);
        // Lines of one paragraph are joined by a space; paragraphs are
        // separated by a single empty line.
        assert_eq!(result, "Line one Line two\n\nNew paragraph Continuation");
    }

    #[test]
    fn test_preserve_structure_excessive_whitespace() {
        let extractor = PdfExtractor::new();
        let text = "Line one \n\n\n\n Line two";
        let result = extractor.preserve_structure(text);
        assert!(result.contains("Line one"));
        assert!(result.contains("Line two"));
        // "Line one" legitimately contains a single space, so the check is
        // for runs of spaces and for collapsed blank lines.
        assert!(!result.contains("  "));
        assert!(!result.contains("\n\n\n"));
        assert_eq!(result, "Line one\n\nLine two");
    }

    #[test]
    fn test_preserve_structure_empty() {
        let extractor = PdfExtractor::new();
        let text = "";
        let result = extractor.preserve_structure(text);
        assert_eq!(result, "");
    }

    #[test]
    fn test_preserve_structure_only_whitespace() {
        let extractor = PdfExtractor::new();
        let text = " \n\n \n ";
        let result = extractor.preserve_structure(text);
        assert_eq!(result, "");
    }

    #[test]
    fn test_extract_metadata_basic() {
        let extractor = PdfExtractor::new();
        let metadata = extractor.extract_metadata_from_bytes(&[], 5).unwrap();
        assert_eq!(metadata.page_count, 5);
        assert!(metadata.title.is_none());
        assert!(metadata.author.is_none());
        assert!(metadata.creation_date.is_none());
    }

    #[test]
    fn test_extract_metadata_zero_pages() {
        let extractor = PdfExtractor::new();
        let metadata = extractor.extract_metadata_from_bytes(&[], 0).unwrap();
        assert_eq!(metadata.page_count, 0);
    }

    #[test]
    fn test_extract_metadata_large_document() {
        let extractor = PdfExtractor::new();
        let metadata = extractor.extract_metadata_from_bytes(&[], 1000).unwrap();
        assert_eq!(metadata.page_count, 1000);
    }
}