mod extraction;
mod ocr;
mod pages;
use crate::Result;
use crate::core::config::{ExtractionConfig, OutputFormat};
use crate::plugins::{DocumentExtractor, Plugin};
use crate::types::Metadata;
use crate::types::internal::{ElementKind, InternalDocument, InternalElement};
use async_trait::async_trait;
#[cfg(feature = "tokio-runtime")]
use std::path::Path;
#[cfg(feature = "pdf")]
use crate::pdf::error::PdfError;
#[cfg(feature = "ocr")]
pub use ocr::{NativeTextStats, OcrFallbackDecision, evaluate_native_text_for_ocr, evaluate_per_page_ocr};
#[cfg(feature = "pdf")]
use pdfium_render::prelude::{PdfDocument, Pdfium};
use extraction::extract_all_from_document;
#[cfg(feature = "ocr")]
use ocr::extract_with_ocr;
use pages::assign_tables_and_images_to_pages;
#[cfg(all(feature = "pdf", feature = "layout-detection"))]
struct LayoutDetectionBundle {
hints: Vec<Vec<crate::pdf::structure::types::LayoutHint>>,
images: Vec<image::DynamicImage>,
results: Vec<crate::pdf::layout_runner::PageLayoutResult>,
}
#[cfg(all(feature = "pdf", feature = "layout-detection"))]
fn run_layout_detection(content: &[u8], config: &ExtractionConfig) -> Option<LayoutDetectionBundle> {
let layout_config = config.layout.as_ref()?;
let mut engine = match crate::layout::take_or_create_engine(layout_config) {
Ok(e) => e,
Err(e) => {
tracing::warn!("Layout engine init failed, continuing without: {}", e);
return None;
}
};
let result = match crate::pdf::layout_runner::detect_layout_for_document(content, &mut engine) {
Ok((results, timing, images)) => {
tracing::info!(
total_ms = timing.total_ms,
avg_inference_ms = timing.avg_inference_ms(),
page_count = results.len(),
total_detections = results.iter().map(|r| r.regions.len()).sum::<usize>(),
"Layout detection completed"
);
let hints = extraction::convert_results_to_hints(&results);
Some(LayoutDetectionBundle { hints, images, results })
}
Err(e) => {
tracing::warn!("Layout detection failed, continuing without: {}", e);
None
}
};
crate::layout::return_engine(engine);
result
}
#[cfg(all(feature = "pdf", feature = "layout-detection", feature = "ocr"))]
fn run_layout_detection_ocr_pass(
content: &[u8],
config: &ExtractionConfig,
) -> Option<Vec<crate::layout::DetectionResult>> {
let layout_config = config.layout.as_ref()?;
let mut engine = match crate::layout::take_or_create_engine(layout_config) {
Ok(e) => e,
Err(e) => {
tracing::warn!("Layout engine init failed for OCR path: {}", e);
return None;
}
};
let mut all_results = Vec::new();
let batch_size = crate::pdf::layout_runner::DEFAULT_LAYOUT_BATCH_SIZE;
let result = match crate::pdf::layout_runner::detect_layout_for_document_batched(
content,
&mut engine,
batch_size,
|batch_res, _timings, _batch_imgs| {
for res in batch_res {
let img_w = res.render_width_px as f32;
let img_h = res.render_height_px as f32;
let page_w = res.page_width_pts;
let page_h = res.page_height_pts;
let sx = if page_w > 0.0 { img_w / page_w } else { 1.0 };
let sy = if page_h > 0.0 { img_h / page_h } else { 1.0 };
let detections = res
.regions
.into_iter()
.map(|region| {
let bbox = crate::layout::BBox {
x1: region.bbox.left * sx,
y1: (page_h - region.bbox.top) * sy,
x2: region.bbox.right * sx,
y2: (page_h - region.bbox.bottom) * sy,
};
crate::layout::LayoutDetection {
class: region.class,
confidence: region.confidence,
bbox,
}
})
.collect();
all_results.push(crate::layout::DetectionResult {
page_width: res.render_width_px,
page_height: res.render_height_px,
detections,
});
}
Ok(())
},
) {
Ok(_) => Some(all_results),
Err(e) => {
tracing::warn!("Layout detection batched pass failed: {}", e);
None
}
};
crate::layout::return_engine(engine);
result
}
#[cfg(feature = "ocr")]
async fn run_ocr_with_layout(
content: &[u8],
config: &ExtractionConfig,
path: Option<&std::path::Path>,
) -> crate::Result<(
String,
Vec<crate::types::Table>,
Vec<crate::types::OcrElement>,
Option<crate::types::internal::InternalDocument>,
)> {
let default_ocr_config = crate::core::config::OcrConfig::default();
let ocr_config = config.ocr.as_ref().unwrap_or(&default_ocr_config);
#[cfg(feature = "layout-detection")]
let layout_detections = run_layout_detection_ocr_pass(content, config);
if let Some(pipeline) = ocr_config.effective_pipeline() {
let (text, _ocr_tables, ocr_elements, pipeline_doc) = ocr::run_ocr_pipeline(
Some(content),
None,
#[cfg(feature = "layout-detection")]
layout_detections.as_deref(),
config,
&pipeline,
path,
)
.await?;
return Ok((text, Vec::new(), ocr_elements, pipeline_doc));
}
#[cfg(feature = "layout-detection")]
let layout_detections = run_layout_detection_ocr_pass(content, config);
let (text, _mean_conf, ocr_tables, ocr_elements, ocr_doc) = extract_with_ocr(
Some(content),
None, #[cfg(feature = "layout-detection")]
layout_detections.as_deref(),
config,
path,
)
.await?;
Ok((text, ocr_tables, ocr_elements, ocr_doc))
}
pub struct PdfExtractor;
impl Default for PdfExtractor {
fn default() -> Self {
Self::new()
}
}
impl PdfExtractor {
pub fn new() -> Self {
Self
}
}
impl Plugin for PdfExtractor {
fn name(&self) -> &str {
"pdf-extractor"
}
fn version(&self) -> String {
env!("CARGO_PKG_VERSION").to_string()
}
fn initialize(&self) -> Result<()> {
Ok(())
}
fn shutdown(&self) -> Result<()> {
Ok(())
}
}
#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
impl DocumentExtractor for PdfExtractor {
async fn extract_bytes(
&self,
content: &[u8],
mime_type: &str,
config: &ExtractionConfig,
) -> Result<InternalDocument> {
self.extract_core(content, mime_type, config, None).await
}
#[cfg(feature = "tokio-runtime")]
async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<InternalDocument> {
#[cfg(feature = "pdf")]
crate::pdf::oxide_text::set_current_pdf_path(Some(path.to_path_buf()));
let bytes = tokio::fs::read(path).await?;
let result = self.extract_core(&bytes, mime_type, config, Some(path)).await;
#[cfg(feature = "pdf")]
crate::pdf::oxide_text::set_current_pdf_path(None);
result
}
fn supported_mime_types(&self) -> &[&str] {
&["application/pdf"]
}
}
impl PdfExtractor {
async fn extract_core(
&self,
content: &[u8],
mime_type: &str,
config: &ExtractionConfig,
path: Option<&std::path::Path>,
) -> Result<InternalDocument> {
tracing::debug!(format = "pdf", size_bytes = content.len(), "extraction starting");
let _ = &path;
#[cfg(feature = "pdf")]
let derotated = crate::pdf::text::strip_page_rotation(content);
#[cfg(feature = "pdf")]
let content = &*derotated;
#[cfg(feature = "pdf")]
#[allow(unused_variables, unused_mut)]
let (
mut pdf_metadata,
native_text,
mut tables,
page_contents,
boundaries,
pre_rendered_doc,
has_font_encoding_issues,
pdf_annotations,
) = {
#[cfg(target_arch = "wasm32")]
{
let pdfium = crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")
.map_err(|pdf_err| {
if pdf_err.to_string().contains("WASM") || pdf_err.to_string().contains("Module") {
crate::error::KreuzbergError::Parsing {
message: "PDF extraction requires proper WASM module initialization. \
Ensure your WASM environment is set up with PDFium support. \
See: https://docs.kreuzberg.dev/wasm/pdf"
.to_string(),
source: None,
}
} else {
pdf_err.into()
}
})?;
let document = load_pdf_from_byte_slice(&pdfium, content, config)?;
extract_all_from_document(&document, config, None, None, None)?
}
#[cfg(all(not(target_arch = "wasm32"), feature = "tokio-runtime"))]
{
#[cfg(feature = "layout-detection")]
let layout_bundle = run_layout_detection(content, config);
#[cfg(feature = "layout-detection")]
let (layout_hints, layout_images, layout_results) = match layout_bundle {
Some(b) => (Some(b.hints), Some(b.images), Some(b.results)),
None => (None, None, None),
};
#[cfg(not(feature = "layout-detection"))]
let layout_hints: Option<Vec<Vec<crate::pdf::structure::types::LayoutHint>>> = None;
if crate::core::batch_mode::is_batch_mode() {
let content_owned = content.to_vec();
let span = tracing::Span::current();
let config_owned = config.clone();
let oxide_path = crate::pdf::oxide_text::current_pdf_path();
let result = tokio::task::spawn_blocking(move || {
let _guard = span.entered();
crate::pdf::oxide_text::set_current_pdf_path(oxide_path);
let pdfium =
crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
let document = load_pdf_from_byte_slice(&pdfium, &content_owned, &config_owned)?;
let (
pdf_metadata,
native_text,
tables,
page_contents,
boundaries,
pre_rendered_doc,
has_font_encoding_issues,
pdf_annotations,
) = extract_all_from_document(
&document,
&config_owned,
layout_hints.as_deref(),
#[cfg(feature = "layout-detection")]
layout_images.as_deref(),
#[cfg(not(feature = "layout-detection"))]
None,
#[cfg(feature = "layout-detection")]
layout_results.as_deref(),
#[cfg(not(feature = "layout-detection"))]
None,
)
.map_err(|e| PdfError::ExtractionFailed(e.to_string()))?;
if let Some(page_cfg) = config_owned.pages.as_ref()
&& page_cfg.extract_pages
&& page_contents.is_none()
{
return Err(PdfError::ExtractionFailed(
"Page extraction was configured but no page data was extracted in batch mode"
.to_string(),
));
}
Ok::<_, crate::pdf::error::PdfError>((
pdf_metadata,
native_text,
tables,
page_contents,
boundaries,
pre_rendered_doc,
has_font_encoding_issues,
pdf_annotations,
))
})
.await
.map_err(|e| crate::error::KreuzbergError::Other(format!("PDF extraction task failed: {}", e)))?;
match result {
Ok(tuple) => tuple,
Err(e) => return Err(e.into()),
}
} else {
let pdfium =
crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
let document = load_pdf_from_byte_slice(&pdfium, content, config)?;
extract_all_from_document(
&document,
config,
layout_hints.as_deref(),
#[cfg(feature = "layout-detection")]
layout_images.as_deref(),
#[cfg(not(feature = "layout-detection"))]
None,
#[cfg(feature = "layout-detection")]
layout_results.as_deref(),
#[cfg(not(feature = "layout-detection"))]
None,
)?
}
}
#[cfg(all(not(target_arch = "wasm32"), not(feature = "tokio-runtime")))]
{
#[cfg(feature = "layout-detection")]
let layout_bundle = run_layout_detection(content, config);
#[cfg(feature = "layout-detection")]
let (layout_hints, layout_images, layout_results) = match layout_bundle {
Some(b) => (Some(b.hints), Some(b.images), Some(b.results)),
None => (None, None, None),
};
#[cfg(not(feature = "layout-detection"))]
let (layout_hints, layout_images, layout_results): (
Option<Vec<Vec<crate::pdf::structure::types::LayoutHint>>>,
Option<()>,
Option<()>,
) = (None, None, None);
let pdfium =
crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
let document = load_pdf_from_byte_slice(&pdfium, content, config)?;
extract_all_from_document(
&document,
config,
layout_hints.as_deref(),
#[cfg(feature = "layout-detection")]
layout_images.as_deref(),
#[cfg(not(feature = "layout-detection"))]
None,
#[cfg(feature = "layout-detection")]
layout_results.as_deref(),
#[cfg(not(feature = "layout-detection"))]
None,
)?
}
};
#[cfg(feature = "ocr")]
let mut ocr_tables: Vec<crate::types::Table> = Vec::new();
#[cfg(feature = "ocr")]
#[allow(unused_assignments)]
let mut _ocr_elements_from_ocr: Vec<crate::types::OcrElement> = Vec::new();
#[cfg(feature = "ocr")]
let mut ocr_internal_doc: Option<crate::types::internal::InternalDocument> = None;
#[cfg(feature = "ocr")]
let (text, used_ocr) = if config.disable_ocr {
(native_text, false)
} else if config.force_ocr {
let (ocr_text, ocr_tbls, ocr_elems, ocr_doc) = run_ocr_with_layout(content, config, path).await?;
ocr_tables = ocr_tbls;
_ocr_elements_from_ocr = ocr_elems;
ocr_internal_doc = ocr_doc;
(ocr_text, true)
} else if let Some(ref ocr_pages) = config.force_ocr_pages {
if !ocr_pages.is_empty() {
if let Some(ref bounds) = boundaries {
if !bounds.is_empty() {
let mixed =
ocr::extract_mixed_ocr_native(&native_text, bounds, ocr_pages, content, config, path)
.await?;
(mixed, true)
} else {
tracing::warn!("force_ocr_pages set but no page boundaries available; using native text");
(native_text, false)
}
} else {
tracing::warn!("force_ocr_pages set but no page boundaries available; using native text");
(native_text, false)
}
} else {
(native_text, false)
}
} else if let Some(ocr_config) = config.ocr.as_ref() {
let thresholds = ocr_config.effective_thresholds();
let decision = ocr::evaluate_per_page_ocr(
&native_text,
boundaries.as_deref(),
pdf_metadata.pdf_specific.page_count,
&thresholds,
);
if std::env::var("KREUZBERG_DEBUG_OCR").is_ok() {
eprintln!(
"[kreuzberg::pdf::ocr] fallback={} font_encoding_issues={} non_whitespace={} alnum={} meaningful_words={} \
avg_non_whitespace={:.2} avg_alnum={:.2} alnum_ratio={:.3} fragmented_word_ratio={:.3} \
avg_word_length={:.2} word_count={} consecutive_repeat_ratio={:.3}",
decision.fallback,
has_font_encoding_issues,
decision.stats.non_whitespace,
decision.stats.alnum,
decision.stats.meaningful_words,
decision.avg_non_whitespace,
decision.avg_alnum,
decision.stats.alnum_ratio,
decision.stats.fragmented_word_ratio,
decision.stats.avg_word_length,
decision.stats.word_count,
decision.stats.consecutive_repeat_ratio
);
}
let total_chars = native_text.chars().count();
let alnum_ws_chars = native_text
.chars()
.filter(|c| c.is_alphanumeric() || c.is_whitespace())
.count();
let alnum_ws_ratio = if total_chars > 0 {
alnum_ws_chars as f64 / total_chars as f64
} else {
1.0
};
let has_substantive_doc = pre_rendered_doc.is_some()
&& total_chars >= thresholds.substantive_min_chars
&& alnum_ws_ratio >= thresholds.alnum_ws_ratio_threshold;
let skip_ocr_for_non_text = pre_rendered_doc.is_some()
&& total_chars >= thresholds.non_text_min_chars
&& alnum_ws_ratio < thresholds.alnum_ws_ratio_threshold;
if skip_ocr_for_non_text {
tracing::debug!(
alnum_ws_ratio,
total_chars,
alnum_ws_chars,
"Skipping OCR: font encoding issues but content is non-textual and pre-rendered structured doc available"
);
(native_text, false)
} else if has_substantive_doc {
tracing::debug!(
total_chars,
alnum_ws_ratio,
ocr_fallback = decision.fallback,
has_font_encoding_issues,
"Skipping OCR: pre-rendered structured doc available with substantive native text"
);
(native_text, false)
} else if decision.fallback || has_font_encoding_issues {
match run_ocr_with_layout(content, config, path).await {
Ok((ocr_text, ocr_tbls, ocr_elems, ocr_doc)) => {
ocr_tables = ocr_tbls;
_ocr_elements_from_ocr = ocr_elems;
ocr_internal_doc = ocr_doc;
(ocr_text, true)
}
Err(e) => {
tracing::warn!(
error = %e,
"OCR fallback failed; using native text extraction result"
);
(native_text, false)
}
}
} else {
(native_text, false)
}
} else {
(native_text, false)
};
#[cfg(not(feature = "ocr"))]
let (text, used_ocr) = (native_text, false);
#[cfg(feature = "ocr")]
if !ocr_tables.is_empty() {
tables.extend(ocr_tables);
tables.sort_by_key(|t| t.page_number);
}
#[cfg(feature = "pdf")]
let use_structured_doc = !used_ocr && pre_rendered_doc.is_some();
tracing::debug!(
used_ocr,
has_pre_rendered = pre_rendered_doc.is_some(),
use_structured_doc,
pre_rendered_elements = pre_rendered_doc.as_ref().map(|d| d.elements.len()).unwrap_or(0),
"PDF extractor: deciding whether to use pre-rendered document"
);
#[cfg(not(feature = "pdf"))]
let use_structured_doc = false;
#[cfg(feature = "pdf")]
if let Some(ref page_cfg) = config.pages
&& page_cfg.insert_page_markers
{
let marker_placeholder = page_cfg.marker_format.replace("{page_num}", "");
if !marker_placeholder.is_empty() && !text.contains(&marker_placeholder) {
#[cfg(feature = "otel")]
tracing::warn!(
"Page markers were configured but none found in extracted content. \
This may indicate very short documents or incomplete extraction."
);
}
}
let (images, image_fallback_warning) = if config.images.as_ref().map(|c| c.extract_images).unwrap_or(false) {
match crate::pdf::images::extract_images_from_pdf(content) {
Ok(mut pdf_images) => {
#[cfg(feature = "pdf")]
let fallback_count =
crate::pdf::images::reextract_raw_images_via_pdfium(content, &mut pdf_images).unwrap_or(0);
#[cfg(not(feature = "pdf"))]
let fallback_count = 0u32;
let warning = if fallback_count > 0 {
Some(crate::types::ProcessingWarning {
source: std::borrow::Cow::Borrowed("image_extraction"),
message: std::borrow::Cow::Owned(format!(
"{fallback_count} image(s) re-extracted via pdfium bitmap fallback"
)),
})
} else {
None
};
let extracted = pdf_images
.into_iter()
.enumerate()
.map(|(idx, img)| {
let format = std::borrow::Cow::Owned(img.decoded_format.clone());
crate::types::ExtractedImage {
data: img.data,
format,
image_index: idx,
page_number: Some(img.page_number),
width: Some(img.width as u32),
height: Some(img.height as u32),
colorspace: img.color_space,
bits_per_component: img.bits_per_component.map(|b| b as u32),
is_mask: false,
description: None,
ocr_result: None,
bounding_box: None,
source_path: None,
}
})
.collect();
(Some(extracted), warning)
}
Err(_) => (Some(vec![]), None),
}
} else {
(None, None)
};
#[cfg(feature = "pdf")]
let use_structured_doc = if use_structured_doc {
if let Some(ref doc) = pre_rendered_doc {
let doc_words: usize = doc.elements.iter().map(|e| e.text.split_whitespace().count()).sum();
let native_words = text.split_whitespace().count();
if native_words > 50 && doc_words > native_words * 2 {
tracing::debug!(
doc_words,
native_words,
"Layout quality gate: document has >2x native text words, falling back"
);
false
} else {
true
}
} else {
false
}
} else {
false
};
#[cfg(feature = "pdf")]
let (text, used_structured_doc) = if use_structured_doc {
(text, true)
} else {
(text, false)
};
#[cfg(not(feature = "pdf"))]
let used_structured_doc = false;
let final_pages = assign_tables_and_images_to_pages(page_contents, &tables, images.as_deref().unwrap_or(&[]));
if let (Some(final_pgs), Some(page_structure)) = (&final_pages, &mut pdf_metadata.page_structure)
&& let Some(ref mut page_infos) = page_structure.pages
{
for page_info in page_infos.iter_mut() {
if let Some(pc) = final_pgs.iter().find(|p| p.page_number == page_info.number) {
page_info.is_blank = pc.is_blank;
}
}
}
#[cfg(feature = "pdf")]
let pre_formatted_output = if used_structured_doc && config.output_format == OutputFormat::Markdown {
tracing::trace!("PDF extractor: signaling pre-formatted structured output to pipeline");
Some("markdown".to_string())
} else {
tracing::trace!(
used_structured_doc,
output_format = ?config.output_format,
"PDF extractor: NOT signaling pre-formatted output"
);
None
};
#[cfg(not(feature = "pdf"))]
let pre_formatted_output: Option<String> = None;
#[cfg(feature = "pdf")]
tracing::debug!(
use_structured_doc,
output_format = ?config.output_format,
"final document path selection"
);
#[cfg(feature = "pdf")]
let mut doc = if used_structured_doc {
if let Some(mut pre_doc) = pre_rendered_doc {
pre_doc.mime_type = std::borrow::Cow::Owned(mime_type.to_string());
pre_doc.metadata = Metadata {
output_format: pre_formatted_output,
title: pdf_metadata.title.clone(),
subject: pdf_metadata.subject.clone(),
authors: pdf_metadata.authors.clone(),
keywords: pdf_metadata.keywords.clone(),
created_at: pdf_metadata.created_at.clone(),
modified_at: pdf_metadata.modified_at.clone(),
created_by: pdf_metadata.created_by.clone(),
pages: pdf_metadata.page_structure.clone(),
format: Some(crate::types::FormatMetadata::Pdf(pdf_metadata.pdf_specific)),
..Default::default()
};
for table in tables {
if !pre_doc
.tables
.iter()
.any(|t| t.page_number == table.page_number && t.markdown == table.markdown)
{
pre_doc.tables.push(table);
}
}
if let Some(imgs) = images {
pre_doc.images = imgs;
}
if let Some(warning) = image_fallback_warning.clone() {
pre_doc.processing_warnings.push(warning);
}
pre_doc.annotations = pdf_annotations;
pre_doc
} else {
let mut doc = InternalDocument::new("pdf");
doc.mime_type = std::borrow::Cow::Owned(mime_type.to_string());
for paragraph in text.split("\n\n") {
let trimmed = paragraph.trim();
if !trimmed.is_empty() {
doc.push_element(InternalElement::text(ElementKind::Paragraph, trimmed, 0));
}
}
doc.tables = tables;
if let Some(imgs) = images {
doc.images = imgs;
}
if let Some(warning) = image_fallback_warning.clone() {
doc.processing_warnings.push(warning);
}
doc.annotations = pdf_annotations;
doc
}
} else {
#[cfg(feature = "ocr")]
let has_ocr_doc = ocr_internal_doc.is_some();
#[cfg(feature = "ocr")]
let mut doc = if let Some(mut ocr_doc) = ocr_internal_doc.take() {
ocr_doc.mime_type = std::borrow::Cow::Owned(mime_type.to_string());
ocr_doc
} else {
let mut d = InternalDocument::new("pdf");
d.mime_type = std::borrow::Cow::Owned(mime_type.to_string());
for paragraph in text.split("\n\n") {
let trimmed = paragraph.trim();
if !trimmed.is_empty() {
d.push_element(InternalElement::text(ElementKind::Paragraph, trimmed, 0));
}
}
d
};
#[cfg(not(feature = "ocr"))]
let has_ocr_doc = false;
#[cfg(not(feature = "ocr"))]
let mut doc = {
let mut d = InternalDocument::new("pdf");
d.mime_type = std::borrow::Cow::Owned(mime_type.to_string());
for paragraph in text.split("\n\n") {
let trimmed = paragraph.trim();
if !trimmed.is_empty() {
d.push_element(InternalElement::text(ElementKind::Paragraph, trimmed, 0));
}
}
d
};
doc.metadata = Metadata {
output_format: pre_formatted_output,
title: pdf_metadata.title.clone(),
subject: pdf_metadata.subject.clone(),
authors: pdf_metadata.authors.clone(),
keywords: pdf_metadata.keywords.clone(),
created_at: pdf_metadata.created_at.clone(),
modified_at: pdf_metadata.modified_at.clone(),
created_by: pdf_metadata.created_by.clone(),
pages: pdf_metadata.page_structure.clone(),
format: Some(crate::types::FormatMetadata::Pdf(pdf_metadata.pdf_specific)),
..Default::default()
};
if has_ocr_doc {
for table in tables {
if !doc
.tables
.iter()
.any(|t| t.page_number == table.page_number && t.markdown == table.markdown)
{
let table_index = doc.push_table(table);
doc.push_element(InternalElement::text(ElementKind::Table { table_index }, "", 0));
}
}
} else {
for table in tables {
let table_index = doc.push_table(table);
doc.push_element(InternalElement::text(ElementKind::Table { table_index }, "", 0));
}
}
if let Some(imgs) = images {
doc.images = imgs;
}
if let Some(warning) = image_fallback_warning {
doc.processing_warnings.push(warning);
}
doc.annotations = pdf_annotations;
doc
};
#[cfg(not(feature = "pdf"))]
let mut doc = {
let mut doc = InternalDocument::new("pdf");
doc.mime_type = std::borrow::Cow::Owned(mime_type.to_string());
for paragraph in text.split("\n\n") {
let trimmed = paragraph.trim();
if !trimmed.is_empty() {
doc.push_element(InternalElement::text(ElementKind::Paragraph, trimmed, 0));
}
}
doc
};
{
use crate::types::annotations::PdfAnnotationType;
use crate::types::uri::{Uri, UriKind};
let uris: Vec<Uri> = doc
.annotations
.as_ref()
.map(|annotations| {
annotations
.iter()
.filter(|a| a.annotation_type == PdfAnnotationType::Link)
.filter_map(|a| {
a.content.as_ref().map(|url| {
let kind = if url.starts_with('#') {
UriKind::Anchor
} else if url.starts_with("mailto:") {
UriKind::Email
} else {
UriKind::Hyperlink
};
Uri {
url: url.clone(),
label: Some(url.clone()),
page: Some(a.page_number as u32),
kind,
}
})
})
.collect()
})
.unwrap_or_default();
for uri in uris {
doc.push_uri(uri);
}
}
#[cfg(feature = "pdf")]
{
if let Ok(lopdf_doc) = lopdf::Document::load_mem(content) {
let bookmark_uris = crate::pdf::bookmarks::extract_bookmarks(&lopdf_doc);
for uri in bookmark_uris {
doc.push_uri(uri);
}
}
}
#[cfg(all(feature = "pdf", feature = "tokio-runtime"))]
{
let (embedded_children, embedded_warnings) =
crate::pdf::embedded_files::extract_and_process_embedded_files(content, config).await;
if !embedded_children.is_empty() {
match doc.children {
Some(ref mut existing) => existing.extend(embedded_children),
None => doc.children = Some(embedded_children),
}
}
for warning in embedded_warnings {
doc.processing_warnings.push(warning);
}
}
doc.prebuilt_pages = final_pages;
#[cfg(feature = "pdf")]
tracing::debug!(
elements = doc.elements.len(),
tables = doc.tables.len(),
has_pages = doc.prebuilt_pages.is_some(),
"InternalDocument finalized"
);
Ok(doc)
}
}
#[cfg(feature = "pdf")]
fn load_pdf_from_byte_slice<'pdf>(
pdfium: &'pdf Pdfium,
content: &'pdf [u8],
config: &ExtractionConfig,
) -> std::result::Result<PdfDocument<'pdf>, PdfError> {
let passwords = [None].into_iter().chain(
config
.pdf_options
.iter()
.flat_map(|o| &o.passwords)
.flatten()
.map(String::as_str)
.map(Some),
);
for pwd in passwords {
match pdfium.load_pdf_from_byte_slice(content, pwd) {
Ok(doc) => return Ok(doc),
Err(e) => {
let err_msg = crate::pdf::error::format_pdfium_error(e);
if !err_msg.contains("password") && !err_msg.contains("Password") {
return Err(PdfError::InvalidPdf(err_msg));
}
}
};
}
Err(PdfError::PasswordRequired)
}
#[cfg(test)]
mod tests {
use super::*;
#[cfg(feature = "ocr")]
use crate::core::config::OcrQualityThresholds;
#[test]
fn test_pdf_extractor_plugin_interface() {
let extractor = PdfExtractor::new();
assert_eq!(extractor.name(), "pdf-extractor");
assert!(extractor.initialize().is_ok());
assert!(extractor.shutdown().is_ok());
}
#[test]
fn test_pdf_extractor_supported_mime_types() {
let extractor = PdfExtractor::new();
let mime_types = extractor.supported_mime_types();
assert_eq!(mime_types.len(), 1);
assert!(mime_types.contains(&"application/pdf"));
}
#[cfg(feature = "ocr")]
#[test]
fn test_should_fallback_to_ocr_for_empty_text() {
assert!(ocr::evaluate_native_text_for_ocr("", Some(1), &OcrQualityThresholds::default()).fallback);
}
#[cfg(feature = "ocr")]
#[test]
fn test_should_not_fallback_for_meaningful_text() {
let sample = "This page has searchable vector text and should avoid OCR.";
assert!(!ocr::evaluate_native_text_for_ocr(sample, Some(1), &OcrQualityThresholds::default()).fallback);
}
#[cfg(feature = "ocr")]
#[test]
fn test_should_fallback_for_punctuation_only_text() {
let sample = " . , ; : -- -- ";
assert!(ocr::evaluate_native_text_for_ocr(sample, Some(2), &OcrQualityThresholds::default()).fallback);
}
#[cfg(feature = "ocr")]
#[test]
fn test_per_page_ocr_no_boundaries_falls_back_to_whole_doc() {
let text = "This document has enough meaningful words for evaluation purposes here.";
let decision = ocr::evaluate_per_page_ocr(text, None, Some(1), &OcrQualityThresholds::default());
assert!(!decision.fallback);
}
#[cfg(feature = "ocr")]
#[test]
fn test_per_page_ocr_empty_boundaries_falls_back_to_whole_doc() {
let text = "This document has enough meaningful words for evaluation purposes here.";
let decision = ocr::evaluate_per_page_ocr(text, Some(&[]), Some(1), &OcrQualityThresholds::default());
assert!(!decision.fallback);
}
#[cfg(feature = "ocr")]
#[test]
fn test_per_page_ocr_all_pages_good() {
use crate::types::PageBoundary;
let page1 = "This first page has plenty of meaningful searchable text content here.";
let page2 = "This second page also has plenty of meaningful searchable text content.";
let text = format!("{}{}", page1, page2);
let boundaries = vec![
PageBoundary {
byte_start: 0,
byte_end: page1.len(),
page_number: 1,
},
PageBoundary {
byte_start: page1.len(),
byte_end: text.len(),
page_number: 2,
},
];
let decision = ocr::evaluate_per_page_ocr(&text, Some(&boundaries), Some(2), &OcrQualityThresholds::default());
assert!(!decision.fallback);
}
#[cfg(feature = "ocr")]
#[test]
fn test_per_page_ocr_one_bad_page_triggers_fallback() {
use crate::types::PageBoundary;
let good_page = "This page has plenty of meaningful searchable text content for extraction.";
let bad_page = " . ; ";
let text = format!("{}{}", good_page, bad_page);
let boundaries = vec![
PageBoundary {
byte_start: 0,
byte_end: good_page.len(),
page_number: 1,
},
PageBoundary {
byte_start: good_page.len(),
byte_end: text.len(),
page_number: 2,
},
];
let decision = ocr::evaluate_per_page_ocr(&text, Some(&boundaries), Some(2), &OcrQualityThresholds::default());
assert!(decision.fallback);
}
#[cfg(feature = "ocr")]
#[test]
fn test_per_page_ocr_empty_page_triggers_fallback() {
use crate::types::PageBoundary;
let good_page = "This page has plenty of meaningful searchable text content for extraction.";
let empty_page = "";
let text = format!("{}{}", good_page, empty_page);
let boundaries = vec![
PageBoundary {
byte_start: 0,
byte_end: good_page.len(),
page_number: 1,
},
PageBoundary {
byte_start: good_page.len(),
byte_end: text.len(),
page_number: 2,
},
];
let decision = ocr::evaluate_per_page_ocr(&text, Some(&boundaries), Some(2), &OcrQualityThresholds::default());
assert!(decision.fallback);
}
#[cfg(feature = "ocr")]
#[test]
fn test_per_page_ocr_preserves_document_stats_on_fallback() {
use crate::types::PageBoundary;
let good_page = "This page has plenty of meaningful searchable text content for extraction.";
let bad_page = " . ; ";
let text = format!("{}{}", good_page, bad_page);
let boundaries = vec![
PageBoundary {
byte_start: 0,
byte_end: good_page.len(),
page_number: 1,
},
PageBoundary {
byte_start: good_page.len(),
byte_end: text.len(),
page_number: 2,
},
];
let decision = ocr::evaluate_per_page_ocr(&text, Some(&boundaries), Some(2), &OcrQualityThresholds::default());
assert!(decision.fallback);
assert!(decision.stats.non_whitespace > 0);
assert!(decision.stats.meaningful_words > 0);
}
#[cfg(feature = "ocr")]
#[test]
fn test_per_page_ocr_invalid_boundaries_skipped() {
use crate::types::PageBoundary;
let text = "This page has plenty of meaningful searchable text content for extraction.";
let boundaries = vec![
PageBoundary {
byte_start: 0,
byte_end: text.len(),
page_number: 1,
},
PageBoundary {
byte_start: 999,
byte_end: 9999,
page_number: 2,
},
];
let decision = ocr::evaluate_per_page_ocr(text, Some(&boundaries), Some(1), &OcrQualityThresholds::default());
assert!(!decision.fallback);
}
#[cfg(feature = "ocr")]
#[test]
fn test_per_page_ocr_multi_page_correct_page_count() {
let text = "ab cd ef";
let decision_wrong = ocr::evaluate_native_text_for_ocr(text, None, &OcrQualityThresholds::default());
let decision_correct = ocr::evaluate_native_text_for_ocr(text, Some(20), &OcrQualityThresholds::default());
assert!(
decision_correct.avg_non_whitespace < decision_wrong.avg_non_whitespace,
"Correct page count should produce lower per-page averages"
);
}
#[tokio::test]
#[cfg(feature = "pdf")]
async fn test_pdf_batch_mode_validates_page_config_enabled() {
use crate::core::config::PageConfig;
let extractor = PdfExtractor::new();
let config = ExtractionConfig {
pages: Some(PageConfig {
extract_pages: true,
insert_page_markers: false,
marker_format: "<!-- PAGE {page_num} -->".to_string(),
}),
..Default::default()
};
let pdf_path =
std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/pdf/google_doc_document.pdf");
if let Ok(content) = std::fs::read(pdf_path) {
let result = extractor.extract_bytes(&content, "application/pdf", &config).await;
assert!(
result.is_ok(),
"Failed to extract PDF with page config: {:?}",
result.err()
);
let extraction_result = result.unwrap();
let extraction_result = crate::extraction::derive::derive_extraction_result(
extraction_result,
true,
crate::core::config::OutputFormat::Plain,
);
assert!(
!extraction_result.content.is_empty(),
"Content should be extracted from PDF"
);
}
}
#[tokio::test]
#[cfg(feature = "pdf")]
async fn test_pdf_batch_mode_validates_page_config_disabled() {
let extractor = PdfExtractor::new();
let config = ExtractionConfig::default();
let pdf_path =
std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/pdf/google_doc_document.pdf");
if let Ok(content) = std::fs::read(pdf_path) {
let result = extractor.extract_bytes(&content, "application/pdf", &config).await;
assert!(
result.is_ok(),
"Failed to extract PDF without page config: {:?}",
result.err()
);
let extraction_result = result.unwrap();
let extraction_result = crate::extraction::derive::derive_extraction_result(
extraction_result,
true,
crate::core::config::OutputFormat::Plain,
);
assert!(
extraction_result.pages.is_none(),
"Pages should not be extracted when pages config is None"
);
}
}
#[tokio::test]
#[cfg(feature = "pdf")]
async fn test_pdf_page_marker_validation() {
use crate::core::config::PageConfig;
let extractor = PdfExtractor::new();
let config = ExtractionConfig {
pages: Some(PageConfig {
extract_pages: true,
insert_page_markers: true,
marker_format: "\n\n<!-- PAGE {page_num} -->\n\n".to_string(),
}),
..Default::default()
};
let pdf_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/pdf/multi_page.pdf");
if let Ok(content) = std::fs::read(pdf_path) {
let result = extractor.extract_bytes(&content, "application/pdf", &config).await;
assert!(
result.is_ok(),
"Failed to extract PDF with page markers: {:?}",
result.err()
);
let extraction_result = result.unwrap();
let extraction_result = crate::extraction::derive::derive_extraction_result(
extraction_result,
true,
crate::core::config::OutputFormat::Plain,
);
let marker_placeholder = "<!-- PAGE ";
if extraction_result.content.len() > 100 {
assert!(
extraction_result.content.contains(marker_placeholder),
"Page markers should be inserted when configured and document has multiple pages"
);
}
}
}
#[tokio::test]
#[cfg(all(feature = "pdf", feature = "ocr"))]
async fn test_pdf_force_ocr_without_ocr_config() {
use crate::core::config::ExtractionConfig;
let extractor = PdfExtractor::new();
let config = ExtractionConfig {
force_ocr: true,
ocr: None,
..Default::default()
};
let pdf_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/pdf/multi_page.pdf");
if let Ok(content) = std::fs::read(pdf_path) {
let result = extractor.extract_bytes(&content, "application/pdf", &config).await;
if let Err(e) = result {
assert!(
!e.to_string().contains("OCR config required for force_ocr"),
"Should not require manual OCR config when force_ocr is true"
);
}
}
}
#[test]
#[cfg(feature = "pdf")]
fn test_pdf_extractor_without_feature_pdf() {
let extractor = PdfExtractor::new();
assert_eq!(extractor.name(), "pdf-extractor");
}
}