use super::bindings::{PdfiumHandle, bind_pdfium};
use super::error::{PdfError, Result};
use crate::core::config::PageConfig;
use crate::pdf::metadata::PdfExtractionMetadata;
use crate::types::{PageBoundary, PageContent};
use memchr::memmem;
use pdfium_render::prelude::*;
use std::borrow::Cow;
/// Output of a text extraction pass over a document, as a tuple of:
/// the concatenated text, the optional per-page byte boundaries within that text,
/// and the optional per-page content records.
type PdfTextExtractionResult = (String, Option<Vec<PageBoundary>>, Option<Vec<PageContent>>);
/// Extracts text from in-memory PDF data through a Pdfium handle.
pub struct PdfTextExtractor<'a> {
// Handle used to load documents from byte slices.
pdfium: PdfiumHandle<'a>,
}
impl PdfTextExtractor<'static> {
    /// Creates an extractor by binding to Pdfium.
    ///
    /// # Errors
    ///
    /// Propagates whatever error `bind_pdfium` reports when the binding cannot be obtained.
    pub fn new() -> Result<Self> {
        let handle = bind_pdfium(PdfError::TextExtractionFailed, "text extraction")?;
        Ok(Self { pdfium: handle })
    }
}
impl PdfTextExtractor<'_> {
pub fn extract_text(&self, pdf_bytes: &[u8]) -> Result<String> {
self.extract_text_with_password(pdf_bytes, None)
}
pub fn extract_text_with_password(&self, pdf_bytes: &[u8], password: Option<&str>) -> Result<String> {
let document = self.pdfium.load_pdf_from_byte_slice(pdf_bytes, password).map_err(|e| {
let err_msg = super::error::format_pdfium_error(e);
if (err_msg.contains("password") || err_msg.contains("Password")) && password.is_some() {
PdfError::InvalidPassword
} else if err_msg.contains("password") || err_msg.contains("Password") {
PdfError::PasswordRequired
} else {
PdfError::InvalidPdf(err_msg)
}
})?;
let (content, _, _) = extract_text_from_pdf_document(&document, None, None)?;
Ok(content)
}
pub fn extract_text_with_passwords(&self, pdf_bytes: &[u8], passwords: &[&str]) -> Result<String> {
let mut last_error = None;
for password in passwords {
match self.extract_text_with_password(pdf_bytes, Some(password)) {
Ok(text) => return Ok(text),
Err(e) => {
last_error = Some(e);
continue;
}
}
}
if let Some(err) = last_error {
return Err(err);
}
self.extract_text(pdf_bytes)
}
pub fn get_page_count(&self, pdf_bytes: &[u8]) -> Result<usize> {
let document = self.pdfium.load_pdf_from_byte_slice(pdf_bytes, None).map_err(|e| {
let err_msg = super::error::format_pdfium_error(e);
if err_msg.contains("password") || err_msg.contains("Password") {
PdfError::PasswordRequired
} else {
PdfError::InvalidPdf(err_msg)
}
})?;
Ok(document.pages().len() as usize)
}
}
/// Convenience wrapper: creates a fresh [`PdfTextExtractor`] and extracts the text of
/// `pdf_bytes` without a password.
///
/// # Errors
///
/// Fails if the extractor cannot be created or if extraction fails.
pub fn extract_text_from_pdf(pdf_bytes: &[u8]) -> Result<String> {
    PdfTextExtractor::new()?.extract_text(pdf_bytes)
}
/// Convenience wrapper: creates a fresh [`PdfTextExtractor`] and extracts the text of
/// `pdf_bytes` using the given `password`.
///
/// # Errors
///
/// Fails if the extractor cannot be created or if extraction fails.
pub fn extract_text_from_pdf_with_password(pdf_bytes: &[u8], password: &str) -> Result<String> {
    PdfTextExtractor::new()?.extract_text_with_password(pdf_bytes, Some(password))
}
/// Convenience wrapper: creates a fresh [`PdfTextExtractor`] and tries each entry of
/// `passwords` in order until one extraction succeeds.
///
/// # Errors
///
/// Fails if the extractor cannot be created or if no attempt succeeds.
pub fn extract_text_from_pdf_with_passwords(pdf_bytes: &[u8], passwords: &[&str]) -> Result<String> {
    PdfTextExtractor::new()?.extract_text_with_passwords(pdf_bytes, passwords)
}
/// Output of a combined text + metadata extraction, as a tuple of:
/// the concatenated text, the optional per-page byte boundaries within that text,
/// the optional per-page content records, and the document metadata.
pub type PdfUnifiedExtractionResult = (
String,
Option<Vec<PageBoundary>>,
Option<Vec<PageContent>>,
PdfExtractionMetadata,
);
/// Extracts text and metadata from an already-loaded document.
///
/// The page configuration is taken from `extraction_config.pages` when a configuration is
/// given. The extracted text and page boundaries are then handed to the metadata extractor.
///
/// # Errors
///
/// Propagates failures from text extraction and from metadata extraction.
pub fn extract_text_and_metadata_from_pdf_document(
    document: &PdfDocument<'_>,
    extraction_config: Option<&crate::core::config::ExtractionConfig>,
) -> Result<PdfUnifiedExtractionResult> {
    let page_config = match extraction_config {
        Some(cfg) => cfg.pages.as_ref(),
        None => None,
    };

    let (text, boundaries, page_contents) =
        extract_text_from_pdf_document(document, page_config, extraction_config)?;

    let page_boundaries = boundaries.as_deref();
    let metadata = crate::pdf::metadata::extract_metadata_from_document_impl(document, page_boundaries, &text)?;

    Ok((text, boundaries, page_contents, metadata))
}
/// Extracts the text of an already-loaded document.
///
/// With a `page_config`, page boundaries (and optionally per-page content) are tracked;
/// without one, only the concatenated text is produced.
///
/// # Errors
///
/// Propagates failures from the selected extraction path.
pub fn extract_text_from_pdf_document(
    document: &PdfDocument<'_>,
    page_config: Option<&PageConfig>,
    extraction_config: Option<&crate::core::config::ExtractionConfig>,
) -> Result<PdfTextExtractionResult> {
    match page_config {
        Some(config) => extract_text_lazy_with_tracking(document, config, extraction_config),
        None => extract_text_lazy_fast_path(document),
    }
}
/// Returns `pdf_bytes` with every `/Rotate <value>` entry overwritten by spaces.
///
/// The input is scanned as raw bytes. Each entry is blanked in place, so the returned
/// buffer always has the same length as the input (presumably so that byte offsets stored
/// elsewhere in the file stay valid; confirm against the caller).
///
/// An entry is only blanked when it is unambiguous:
/// - `/Rotate` must be followed by whitespace; `/Rotate90` is a different PDF name and is
///   left untouched;
/// - the value must be an integer with at least one digit (optional `+`/`-` sign), or an
///   indirect reference of the form `<obj> <gen> R`, which is blanked as a whole so no
///   dangling `<gen> R` is left behind;
/// - the value must end at whitespace, a PDF delimiter, or the end of input, so a real
///   number such as `90.5` is not cut in half.
///
/// Returns `Cow::Borrowed` when nothing was changed and `Cow::Owned` otherwise.
///
/// NOTE(review): the scan does not parse PDF structure, so a `/Rotate <int>` byte sequence
/// that happens to appear inside a stream or string would be blanked as well.
pub(crate) fn strip_page_rotation(pdf_bytes: &[u8]) -> Cow<'_, [u8]> {
    const ROTATE_KEY: &[u8] = b"/Rotate";

    if !has_rotate_marker(pdf_bytes) {
        return Cow::Borrowed(pdf_bytes);
    }

    let mut patched = pdf_bytes.to_vec();
    let mut modified = false;
    let mut pos = 0;

    while let Some(offset) = find_rotate_offset(&patched, pos) {
        let key_end = offset + ROTATE_KEY.len();
        // Always resume after the key so the scan makes progress even when nothing is blanked.
        pos = key_end;

        if let Some(entry_end) = rotate_entry_end(&patched, key_end) {
            patched[offset..entry_end].fill(b' ');
            modified = true;
            pos = entry_end;
        }
    }

    if modified {
        Cow::Owned(patched)
    } else {
        Cow::Borrowed(pdf_bytes)
    }
}

/// Returns the exclusive end of the `/Rotate` value that starts after `key_end`, or `None`
/// when the bytes after the key do not form a value that is safe to blank.
fn rotate_entry_end(bytes: &[u8], key_end: usize) -> Option<usize> {
    let val_start = skip_bytes_while(bytes, key_end, |b| b.is_ascii_whitespace());
    if val_start == key_end {
        // No separator after the key: this is a longer name (e.g. `/Rotate90`) or the end of input.
        return None;
    }

    let signed = matches!(bytes.get(val_start), Some(b'-' | b'+'));
    let digits_start = val_start + usize::from(signed);
    let digits_end = skip_bytes_while(bytes, digits_start, |b| b.is_ascii_digit());
    if digits_end == digits_start {
        // A bare sign (or no number at all) is not a value.
        return None;
    }

    // Object numbers are unsigned, so only an unsigned integer can start a reference.
    let reference_end = if signed {
        None
    } else {
        indirect_reference_end(bytes, digits_end)
    };
    let end = reference_end.unwrap_or(digits_end);

    is_pdf_token_end(bytes, end).then_some(end)
}

/// Given the end of an object number, returns the index just past the `R` of a
/// `<obj> <gen> R` indirect reference, or `None` if the following bytes are not one.
fn indirect_reference_end(bytes: &[u8], obj_num_end: usize) -> Option<usize> {
    let gen_start = skip_bytes_while(bytes, obj_num_end, |b| b.is_ascii_whitespace());
    if gen_start == obj_num_end {
        return None;
    }
    let gen_end = skip_bytes_while(bytes, gen_start, |b| b.is_ascii_digit());
    if gen_end == gen_start {
        return None;
    }
    let keyword = skip_bytes_while(bytes, gen_end, |b| b.is_ascii_whitespace());
    if keyword == gen_end {
        return None;
    }
    (bytes.get(keyword) == Some(&b'R')).then_some(keyword + 1)
}

/// Returns the first index at or after `start` whose byte does not satisfy `pred`
/// (`bytes.len()` if every remaining byte does). `start` must not exceed `bytes.len()`.
fn skip_bytes_while(bytes: &[u8], start: usize, pred: impl Fn(u8) -> bool) -> usize {
    bytes[start..]
        .iter()
        .position(|&b| !pred(b))
        .map_or(bytes.len(), |skipped| start + skipped)
}

/// Whether a token may end right before `idx`: end of input, whitespace (including NUL),
/// or one of the PDF delimiter characters.
fn is_pdf_token_end(bytes: &[u8], idx: usize) -> bool {
    match bytes.get(idx) {
        None => true,
        Some(&b) => {
            b == 0
                || b.is_ascii_whitespace()
                || matches!(
                    b,
                    b'(' | b')' | b'<' | b'>' | b'[' | b']' | b'{' | b'}' | b'/' | b'%'
                )
        }
    }
}
/// Reports whether the byte sequence `/Rotate` occurs anywhere in `bytes`.
fn has_rotate_marker(bytes: &[u8]) -> bool {
    matches!(memmem::find(bytes, b"/Rotate"), Some(_))
}
/// Finds the first `/Rotate` at or after `start` and returns its offset within `bytes`.
///
/// # Panics
///
/// Panics if `start` is greater than `bytes.len()` (slice indexing).
fn find_rotate_offset(bytes: &[u8], start: usize) -> Option<usize> {
    let tail = &bytes[start..];
    let relative = memmem::find(tail, b"/Rotate")?;
    Some(start + relative)
}
/// Concatenates the text of every page, separated by a blank line (`"\n\n"`), without
/// recording page boundaries or per-page content.
///
/// After the fifth page, the average size of the pages seen so far is used to reserve
/// capacity for the remaining pages (plus 10%), to limit reallocations on long documents.
///
/// # Errors
///
/// Returns `PdfError::TextExtractionFailed` if the text of any page cannot be obtained.
fn extract_text_lazy_fast_path(document: &PdfDocument<'_>) -> Result<PdfTextExtractionResult> {
    // Number of leading pages whose sizes feed the capacity estimate.
    const SAMPLE_PAGES: usize = 5;

    let total_pages = document.pages().len() as usize;
    let mut combined = String::new();
    let mut sampled_bytes = 0usize;
    let mut sampled_pages = 0;

    for (index, page) in document.pages().iter().enumerate() {
        let page_text_handle = page
            .text()
            .map_err(|e| PdfError::TextExtractionFailed(format!("Page text extraction failed: {}", e)))?;
        let page_text = page_text_handle.all();

        if index != 0 {
            combined.push_str("\n\n");
        }
        combined.push_str(&page_text);

        if index < SAMPLE_PAGES {
            sampled_bytes += page_text.len();
            sampled_pages += 1;
        }

        let sampling_finished = index + 1 == SAMPLE_PAGES;
        if sampling_finished && sampled_pages > 0 && total_pages > SAMPLE_PAGES {
            let mean_page_size = sampled_bytes / sampled_pages;
            let remaining_estimate = mean_page_size * (total_pages - SAMPLE_PAGES);
            combined.reserve(remaining_estimate + (remaining_estimate / 10));
        }
    }

    Ok((combined, None, None))
}
/// Concatenates the text of every page while recording where each page's text lies in
/// the combined string.
///
/// For every page:
/// - if `config.insert_page_markers` is set, `config.marker_format` with `{page_num}`
///   replaced by the 1-based page number is written first; otherwise pages after the first
///   are separated by `"\n\n"`;
/// - a [`PageBoundary`] is recorded whose byte range covers the page text only (not the
///   marker or separator);
/// - if `config.extract_pages` is set, a [`PageContent`] is recorded, including a
///   blank-page flag and, when hierarchy extraction is enabled in
///   `extraction_config.pdf_options.hierarchy`, the page hierarchy.
///
/// After the fifth page, the average size of the pages seen so far is used to reserve
/// capacity for the remaining pages.
///
/// # Errors
///
/// Returns `PdfError::TextExtractionFailed` if the text of any page cannot be obtained,
/// and propagates failures from hierarchy extraction.
fn extract_text_lazy_with_tracking(
    document: &PdfDocument<'_>,
    config: &PageConfig,
    extraction_config: Option<&crate::core::config::ExtractionConfig>,
) -> Result<PdfTextExtractionResult> {
    let mut content = String::new();
    let page_count = document.pages().len() as usize;
    let mut boundaries = Vec::with_capacity(page_count);
    let mut page_contents = if config.extract_pages {
        Some(Vec::with_capacity(page_count))
    } else {
        None
    };

    // Borrow the hierarchy settings only when they are present and enabled. A borrow is
    // all `extract_page_hierarchy` needs, so the configuration is not cloned.
    let hierarchy_config = extraction_config
        .and_then(|cfg| cfg.pdf_options.as_ref())
        .and_then(|pdf_cfg| pdf_cfg.hierarchy.as_ref())
        .filter(|h_cfg| h_cfg.enabled);

    let mut total_sample_size = 0usize;
    let mut sample_count = 0;

    for (page_idx, page) in document.pages().iter().enumerate() {
        let page_number = page_idx + 1;
        let text = page
            .text()
            .map_err(|e| PdfError::TextExtractionFailed(format!("Page text extraction failed: {}", e)))?;
        let page_text_ref = text.all();
        let page_size = page_text_ref.len();

        // The first five pages feed the capacity estimate below.
        if page_idx < 5 {
            total_sample_size += page_size;
            sample_count += 1;
        }

        if config.insert_page_markers {
            let marker = config.marker_format.replace("{page_num}", &page_number.to_string());
            content.push_str(&marker);
        } else if page_idx > 0 {
            content.push_str("\n\n");
        }

        // Offsets are taken after the marker/separator so the range covers page text only.
        let byte_start = content.len();
        content.push_str(&page_text_ref);
        let byte_end = content.len();
        boundaries.push(PageBoundary {
            byte_start,
            byte_end,
            page_number,
        });

        if let Some(ref mut pages) = page_contents {
            let hierarchy = match hierarchy_config {
                Some(h_cfg) => extract_page_hierarchy(&page, Some(h_cfg))?,
                None => None,
            };
            let is_blank = Some(crate::extraction::blank_detection::is_page_text_blank(&page_text_ref));
            pages.push(PageContent {
                page_number,
                content: page_text_ref.to_owned(),
                tables: Vec::new(),
                images: Vec::new(),
                hierarchy,
                is_blank,
            });
        }

        if page_idx == 4 && page_count > 5 && sample_count > 0 {
            let avg_page_size = total_sample_size / sample_count;
            let estimated_remaining = avg_page_size * (page_count - 5);
            // Rough allowance for the separator or marker written before each remaining page.
            let separator_overhead = (page_count - 5) * 3;
            content.reserve(estimated_remaining + separator_overhead + (estimated_remaining / 10));
        }
    }

    Ok((content, Some(boundaries), page_contents))
}
/// Builds the heading/body hierarchy of a single page from its font sizes.
///
/// Returns `Ok(None)` when no configuration is given, when it is disabled, or when the
/// page yields no characters, no text blocks, or no font-size clusters. Otherwise every
/// text block is labelled with the cluster whose centroid is closest to its font size
/// (the first cluster wins ties), hierarchy levels are assigned from those labels, and the
/// result is converted into [`crate::types::PageHierarchy`]. Bounding boxes are included
/// only when `include_bbox` is set in the configuration.
///
/// # Errors
///
/// Propagates failures from character extraction and from font-size clustering.
fn extract_page_hierarchy(
    page: &pdfium_render::prelude::PdfPage,
    hierarchy_config: Option<&crate::core::config::HierarchyConfig>,
) -> Result<Option<crate::types::PageHierarchy>> {
    use crate::pdf::hierarchy::{
        HierarchyLevel, assign_hierarchy_levels, cluster_font_sizes, extract_chars_with_fonts, merge_chars_into_blocks,
    };
    use crate::types::HierarchicalBlock;

    let Some(config) = hierarchy_config.filter(|cfg| cfg.enabled) else {
        return Ok(None);
    };

    let chars = extract_chars_with_fonts(page)?;
    if chars.is_empty() {
        return Ok(None);
    }

    let text_blocks = merge_chars_into_blocks(chars);
    if text_blocks.is_empty() {
        return Ok(None);
    }

    // Never ask for more clusters than there are blocks.
    let cluster_count = config.k_clusters.min(text_blocks.len());
    let clusters = cluster_font_sizes(&text_blocks, cluster_count)?;
    if clusters.is_empty() {
        return Ok(None);
    }

    // Label each block with the index of the nearest centroid. The strict `<` keeps the
    // earliest cluster on ties and falls back to cluster 0 when no distance compares lower.
    let labels = text_blocks
        .iter()
        .map(|block| {
            clusters
                .iter()
                .enumerate()
                .fold((0u32, f32::INFINITY), |(nearest, nearest_dist), (idx, cluster)| {
                    let dist = (block.font_size - cluster.centroid).abs();
                    if dist < nearest_dist {
                        (idx as u32, dist)
                    } else {
                        (nearest, nearest_dist)
                    }
                })
                .0
        })
        .collect();
    let kmeans_result = crate::pdf::hierarchy::KMeansResult { labels };

    let mut blocks: Vec<HierarchicalBlock> = Vec::new();
    for hb in assign_hierarchy_levels(&text_blocks, &kmeans_result) {
        let level_name: &str = match hb.hierarchy_level {
            HierarchyLevel::H1 => "h1",
            HierarchyLevel::H2 => "h2",
            HierarchyLevel::H3 => "h3",
            HierarchyLevel::H4 => "h4",
            HierarchyLevel::H5 => "h5",
            HierarchyLevel::H6 => "h6",
            HierarchyLevel::Body => "body",
        };
        let bbox = config
            .include_bbox
            .then_some((hb.bbox.left, hb.bbox.top, hb.bbox.right, hb.bbox.bottom));

        blocks.push(HierarchicalBlock {
            text: hb.text,
            font_size: hb.font_size,
            level: level_name.to_string(),
            bbox,
        });
    }

    Ok(Some(crate::types::PageHierarchy {
        block_count: blocks.len(),
        blocks,
    }))
}
#[cfg(test)]
mod tests {
    use super::*;
    use serial_test::serial;

    /// Asserts that stripping `pdf` produced a modified copy with no `/Rotate` left in it.
    fn assert_rotation_removed(pdf: &[u8]) {
        let stripped = strip_page_rotation(pdf);
        assert!(matches!(stripped, Cow::Owned(_)));
        assert!(!has_rotate_marker(&stripped));
    }

    // The extractor can be constructed.
    #[test]
    #[serial]
    fn test_extractor_creation() {
        assert!(PdfTextExtractor::new().is_ok());
    }

    // Empty input is rejected.
    #[test]
    #[serial]
    fn test_extract_empty_pdf() {
        let extractor = PdfTextExtractor::new().unwrap();
        assert!(extractor.extract_text(b"").is_err());
    }

    // Non-PDF input is reported as `InvalidPdf`.
    #[test]
    #[serial]
    fn test_extract_invalid_pdf() {
        let extractor = PdfTextExtractor::new().unwrap();
        let outcome = extractor.extract_text(b"not a pdf");
        assert!(matches!(outcome, Err(PdfError::InvalidPdf(_))));
    }

    // A header-only document, if rejected, fails with one of the two load error kinds.
    #[test]
    #[serial]
    fn test_password_required_detection() {
        let extractor = PdfTextExtractor::new().unwrap();
        let header_only = b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n";
        if let Err(err) = extractor.extract_text(header_only) {
            assert!(matches!(err, PdfError::PasswordRequired | PdfError::InvalidPdf(_)));
        }
    }

    // With no passwords to try, the password-less attempt still fails on invalid input.
    #[test]
    #[serial]
    fn test_extract_text_with_passwords_empty_list() {
        let extractor = PdfTextExtractor::new().unwrap();
        assert!(extractor.extract_text_with_passwords(b"not a pdf", &[]).is_err());
    }

    // Input without `/Rotate` is returned borrowed.
    #[test]
    fn test_strip_page_rotation_no_rotate() {
        let pdf = b"%PDF-1.4\n1 0 obj\n<< /Type /Page >>\nendobj";
        assert!(matches!(strip_page_rotation(pdf), Cow::Borrowed(_)));
    }

    #[test]
    fn test_strip_page_rotation_90() {
        assert_rotation_removed(b"%PDF-1.4\n1 0 obj\n<< /Type /Page /Rotate 90 >>\nendobj");
    }

    #[test]
    fn test_strip_page_rotation_270() {
        assert_rotation_removed(b"%PDF-1.4\n1 0 obj\n<< /Type /Page /Rotate 270 >>\nendobj");
    }

    #[test]
    fn test_strip_page_rotation_multiple() {
        assert_rotation_removed(b"%PDF-1.4\n1 0 obj\n<< /Rotate 90 >>\n2 0 obj\n<< /Rotate 180 >>\nendobj");
    }
}
// Regression test that extracts the same document repeatedly with one extractor and
// compares both the results and the timings of the calls.
#[cfg(test)]
mod cache_regression_tests {
use super::*;
use serial_test::serial;
use std::time::Instant;
// Extracts the same PDF three times and checks that the text is identical each time and
// that the second and third calls take comparable time (less than a 5x difference).
#[test]
#[serial]
fn test_no_global_cache_between_documents() {
// Fixture path is relative to the directory the test is run from.
let pdf_bytes = std::fs::read("../../test_documents/pdf/fake_memo.pdf").expect("Failed to read PDF");
let extractor = PdfTextExtractor::new().expect("Failed to create extractor");
// First ("cold") extraction.
let start = Instant::now();
let text1 = extractor.extract_text(&pdf_bytes).expect("Failed to extract (cold)");
let cold = start.elapsed();
// Second extraction ("warm 1").
let start = Instant::now();
let text2 = extractor.extract_text(&pdf_bytes).expect("Failed to extract (warm1)");
let warm1 = start.elapsed();
// Third extraction ("warm 2").
let start = Instant::now();
let text3 = extractor.extract_text(&pdf_bytes).expect("Failed to extract (warm2)");
let warm2 = start.elapsed();
eprintln!("Cold: {:?}", cold);
eprintln!("Warm 1: {:?}", warm1);
eprintln!("Warm 2: {:?}", warm2);
// All three runs must produce the same text.
assert_eq!(text1, text2);
assert_eq!(text2, text3);
// Clamp to at least 1 microsecond so the divisions below cannot divide by zero.
let warm1_micros = warm1.as_micros().max(1);
let warm2_micros = warm2.as_micros().max(1);
// Ratio of the slower warm call to the faster one (integer division).
let warm_ratio = if warm1_micros > warm2_micros {
warm1_micros / warm2_micros
} else {
warm2_micros / warm1_micros
};
assert!(
warm_ratio < 5,
"Warm calls have inconsistent performance ({}x difference) - warm1: {:?}, warm2: {:?}",
warm_ratio,
warm1,
warm2
);
// The cold/warm ratio is only reported, not asserted.
let cold_warm_ratio = cold.as_micros() / warm1_micros;
eprintln!(
"Cold/Warm ratio: {}x (expected due to singleton initialization)",
cold_warm_ratio
);
}
}