use crate::Result;
use crate::core::config::{ExtractionConfig, OutputFormat};
use crate::types::{PageBoundary, PageContent, PdfAnnotation};
#[cfg(feature = "pdf")]
use crate::types::Table;
#[cfg(feature = "pdf")]
use pdfium_render::prelude::*;
#[cfg(feature = "pdf")]
/// Everything produced by a single pass of [`extract_all_from_document`].
///
/// The tuple slots are positional; the per-slot comments below describe what
/// `extract_all_from_document` places in each one.
pub(crate) type PdfExtractionPhaseResult = (
    // Metadata returned by the text/metadata extraction step.
    crate::pdf::metadata::PdfExtractionMetadata,
    // Native text of the document.
    String,
    // Heuristically detected tables.
    Vec<Table>,
    // Optional per-page content.
    Option<Vec<PageContent>>,
    // Optional page boundaries.
    Option<Vec<PageBoundary>>,
    // Pre-rendered structured document, when structure rendering ran and yielded elements.
    Option<crate::types::internal::InternalDocument>,
    // Font-encoding-issues flag reported by structure rendering (`false` when it did not run or failed).
    bool,
    // Annotations, when annotation extraction was requested and found at least one.
    Option<Vec<PdfAnnotation>>,
);
#[cfg(feature = "pdf")]
/// Runs the native extraction phase over an already-loaded PDF document.
///
/// Steps, in order:
/// 1. Extract native text, page boundaries, per-page content and metadata.
/// 2. Detect tables heuristically via `extract_tables_from_document`.
/// 3. When the output format is Markdown, Djot or HTML and `force_ocr` is
///    off, try to render a structured document. A failure or an empty result
///    is logged and yields `None` so callers can fall back to plain text.
/// 4. Optionally collect annotations (only when `pdf_options.extract_annotations`
///    is set); an empty annotation list is reported as `None`.
///
/// The `layout_*` arguments are forwarded to structure extraction; without the
/// `layout-detection` feature they are placeholders and `None` is forwarded.
///
/// # Errors
///
/// Propagates errors from text/metadata extraction and from table extraction.
/// Structure-rendering errors are *not* propagated.
pub(crate) fn extract_all_from_document(
    document: &PdfDocument,
    config: &ExtractionConfig,
    layout_hints: Option<&[Vec<crate::pdf::structure::types::LayoutHint>]>,
    #[cfg(feature = "layout-detection")] layout_images: Option<&[image::DynamicImage]>,
    #[cfg(not(feature = "layout-detection"))] _layout_images: Option<()>,
    #[cfg(feature = "layout-detection")] layout_results: Option<&[crate::pdf::layout_runner::PageLayoutResult]>,
    #[cfg(not(feature = "layout-detection"))] _layout_results: Option<()>,
) -> Result<PdfExtractionPhaseResult> {
    // `element_count` and `has_text_layer` are declared empty here and filled
    // in just before returning, once their values are known.
    let extraction_span = tracing::debug_span!(
        "extract_pdf",
        page_count = document.pages().len(),
        element_count = tracing::field::Empty,
        has_text_layer = tracing::field::Empty,
    )
    .entered();

    #[cfg(feature = "layout-detection")]
    let has_layout = config.layout.is_some();
    #[cfg(not(feature = "layout-detection"))]
    let has_layout = false;
    tracing::debug!(
        output_format = ?config.output_format,
        force_ocr = config.force_ocr,
        has_layout,
        "PDF extraction starting"
    );

    let (native_text, boundaries, page_contents, pdf_metadata) =
        crate::pdf::text::extract_text_and_metadata_from_pdf_document(document, Some(config))?;

    // Borrow the optional PDF options once; `Option<&_>` is `Copy`, so it can
    // be consulted repeatedly below.
    let pdf_options = config.pdf_options.as_ref();
    let allow_single_column = pdf_options.is_some_and(|opts| opts.allow_single_column_tables);
    let tables = extract_tables_from_document(document, &pdf_metadata, allow_single_column)?;

    let needs_structured = matches!(
        config.output_format,
        OutputFormat::Markdown | OutputFormat::Djot | OutputFormat::Html
    );
    tracing::debug!(
        output_format = ?config.output_format,
        needs_structured,
        force_ocr = config.force_ocr,
        "PDF structure path: evaluating whether to render structured document"
    );

    // Both the rendered document and the encoding-issue flag come out of the
    // same decision, so they are produced together as one tuple.
    let (pre_rendered_doc, has_font_encoding_issues) = if needs_structured && !config.force_ocr {
        // Cluster count defaults to 4 when no hierarchy options are configured.
        let cluster_count = pdf_options
            .and_then(|opts| opts.hierarchy.as_ref())
            .map_or(4, |hierarchy| hierarchy.k_clusters);
        let (top_margin, bottom_margin) = match pdf_options {
            Some(opts) => (opts.top_margin_fraction, opts.bottom_margin_fraction),
            None => (None, None),
        };
        // Without a content filter: strip repeating text, drop headers and footers.
        let (strip_repeating_text, include_headers, include_footers) = match config.content_filter.as_ref() {
            Some(filter) => (
                filter.strip_repeating_text,
                filter.include_headers,
                filter.include_footers,
            ),
            None => (true, false, false),
        };
        tracing::debug!(
            k_clusters = cluster_count,
            "PDF structure path: calling extract_document_structure"
        );
        let structure = crate::pdf::structure::extract_document_structure(
            document,
            cluster_count,
            &tables,
            top_margin,
            bottom_margin,
            layout_hints,
            #[cfg(feature = "layout-detection")]
            layout_images,
            #[cfg(not(feature = "layout-detection"))]
            None,
            #[cfg(feature = "layout-detection")]
            layout_results,
            #[cfg(not(feature = "layout-detection"))]
            None,
            allow_single_column,
            #[cfg(feature = "layout-detection")]
            config.layout.as_ref().map(|l| l.table_model).unwrap_or_default(),
            #[cfg(not(feature = "layout-detection"))]
            None,
            strip_repeating_text,
            include_headers,
            include_footers,
        );
        match structure {
            Ok((doc, encoding_issues)) if !doc.elements.is_empty() => {
                tracing::debug!(
                    element_count = doc.elements.len(),
                    has_headings = doc
                        .elements
                        .iter()
                        .any(|e| matches!(e.kind, crate::types::internal::ElementKind::Heading { .. })),
                    "PDF structure path: render succeeded with content"
                );
                (Some(doc), encoding_issues)
            }
            // Rendering worked but yielded nothing: keep the encoding flag, drop the document.
            Ok((_, encoding_issues)) => {
                tracing::warn!("Structure rendering produced empty output, will fall back to plain text");
                (None, encoding_issues)
            }
            // Rendering failed outright: no document and no encoding information.
            Err(err) => {
                tracing::warn!("Structure rendering failed: {:?}, will fall back to plain text", err);
                (None, false)
            }
        }
    } else {
        (None, false)
    };

    let element_count = pre_rendered_doc.as_ref().map_or(0, |doc| doc.elements.len());
    tracing::debug!(
        has_pre_rendered = pre_rendered_doc.is_some(),
        elements = element_count,
        "structure extraction complete"
    );

    // Annotation extraction is opt-in; an empty result collapses to `None`.
    let annotations = pdf_options
        .is_some_and(|opts| opts.extract_annotations)
        .then(|| crate::pdf::annotations::extract_annotations_from_document(document))
        .filter(|found| !found.is_empty());

    let has_text = !native_text.trim().is_empty();
    extraction_span.record("element_count", element_count);
    extraction_span.record("has_text_layer", has_text);

    Ok((
        pdf_metadata,
        native_text,
        tables,
        page_contents,
        boundaries,
        pre_rendered_doc,
        has_font_encoding_issues,
        annotations,
    ))
}
#[cfg(all(feature = "pdf", feature = "layout-detection"))]
/// Translates per-page layout-detection results into per-page layout hints.
///
/// The output has one inner `Vec` per entry of `results`, in the same order,
/// and one hint per detected region. Each hint carries the region's confidence
/// and bounding-box edges unchanged; only the class is mapped. Layout classes
/// without an explicit mapping become `LayoutHintClass::Other`.
pub(crate) fn convert_results_to_hints(
    results: &[crate::pdf::layout_runner::PageLayoutResult],
) -> Vec<Vec<crate::pdf::structure::types::LayoutHint>> {
    use crate::layout::LayoutClass;
    use crate::pdf::structure::types::{LayoutHint, LayoutHintClass};

    let mut hints_per_page = Vec::with_capacity(results.len());
    for (page_idx, page) in results.iter().enumerate() {
        let mut hints: Vec<LayoutHint> = Vec::new();
        for region in page.regions.iter() {
            let class = match region.class {
                LayoutClass::Title => LayoutHintClass::Title,
                LayoutClass::SectionHeader => LayoutHintClass::SectionHeader,
                LayoutClass::Code => LayoutHintClass::Code,
                LayoutClass::Formula => LayoutHintClass::Formula,
                LayoutClass::ListItem => LayoutHintClass::ListItem,
                LayoutClass::Caption => LayoutHintClass::Caption,
                LayoutClass::Footnote => LayoutHintClass::Footnote,
                LayoutClass::PageHeader => LayoutHintClass::PageHeader,
                LayoutClass::PageFooter => LayoutHintClass::PageFooter,
                LayoutClass::Table => LayoutHintClass::Table,
                LayoutClass::Picture => LayoutHintClass::Picture,
                LayoutClass::Text => LayoutHintClass::Text,
                // Anything not listed above is passed along as a generic hint.
                _ => LayoutHintClass::Other,
            };
            hints.push(LayoutHint {
                class,
                confidence: region.confidence,
                left: region.bbox.left,
                bottom: region.bbox.bottom,
                right: region.bbox.right,
                top: region.bbox.top,
            });
        }
        // Diagnostic only: how many table regions this page contributed.
        tracing::trace!(
            page = page_idx,
            table_hints = hints
                .iter()
                .filter(|h| matches!(h.class, LayoutHintClass::Table))
                .count(),
            "Layout hints for page"
        );
        hints_per_page.push(hints);
    }
    hints_per_page
}
#[cfg(feature = "pdf")]
/// Cheap pre-check for table-like layout based on word left edges.
///
/// Words are grouped greedily by their `left` coordinate: a word joins the
/// first existing group whose anchor (the `left` of the word that created the
/// group) is within `BUCKET_TOLERANCE`, otherwise it starts a new group.
/// Returns `true` when at least three groups contain three or more words each.
/// Inputs with fewer than six words are rejected outright.
fn has_column_alignment(words: &[crate::pdf::table_reconstruct::HocrWord]) -> bool {
    const BUCKET_TOLERANCE: u32 = 15;
    const MIN_WORDS: usize = 6;
    const MIN_WORDS_PER_COLUMN: usize = 3;
    const MIN_COLUMNS: usize = 3;

    if words.len() < MIN_WORDS {
        return false;
    }

    // Each entry is (anchor x, number of words assigned to that anchor).
    let mut columns: Vec<(u32, usize)> = Vec::new();
    for word in words {
        let left = word.left;
        match columns
            .iter_mut()
            .find(|(anchor, _)| left.abs_diff(*anchor) <= BUCKET_TOLERANCE)
        {
            Some((_, members)) => *members += 1,
            None => columns.push((left, 1)),
        }
    }

    let populated_columns = columns
        .iter()
        .filter(|(_, members)| *members >= MIN_WORDS_PER_COLUMN)
        .count();
    populated_columns >= MIN_COLUMNS
}
#[cfg(feature = "pdf")]
/// Heuristically reconstructs at most one table per page from word positions.
///
/// A page is skipped when it has fewer than six words, when its words do not
/// pass [`has_column_alignment`], when reconstruction yields no cells, when
/// post-processing rejects the grid, or when a grid of three rows or fewer
/// would span more than half of the page height (treated as a false positive).
///
/// The reported bounding box is the extent of *all* words on the page, with
/// the vertical axis flipped against the page height.
/// NOTE(review): the flip suggests word coordinates are top-left based and the
/// box is bottom-left based — confirm against `extract_words_from_page`.
///
/// `_metadata` is currently unused.
///
/// # Errors
///
/// Propagates any error from `extract_words_from_page`, which aborts the
/// whole scan rather than skipping the offending page.
fn extract_tables_from_document(
    document: &PdfDocument,
    _metadata: &crate::pdf::metadata::PdfExtractionMetadata,
    allow_single_column: bool,
) -> Result<Vec<Table>> {
    use crate::pdf::table::extract_words_from_page;
    use crate::pdf::table_reconstruct::{post_process_table, reconstruct_table, table_to_markdown};

    let mut found = Vec::new();
    for (page_index, page) in document.pages().iter().enumerate() {
        let words = extract_words_from_page(&page, 0.0)?;
        if words.len() < 6 || !has_column_alignment(&words) {
            continue;
        }

        // Reconstruction parameters: column threshold 50, row threshold ratio 0.5.
        let grid = reconstruct_table(&words, 50, 0.5);
        if grid.first().map_or(true, |first_row| first_row.is_empty()) {
            continue;
        }
        let Some(cells) = post_process_table(grid, false, allow_single_column) else {
            continue;
        };
        let markdown = table_to_markdown(&cells);

        let page_height = page.height().value as f64;
        // Single pass over the words to find the extent of everything on the page.
        let (img_left, img_top, img_right, img_bottom) = words.iter().fold(
            (f64::INFINITY, f64::INFINITY, f64::NEG_INFINITY, f64::NEG_INFINITY),
            |(left, top, right, bottom), w| {
                (
                    left.min(w.left as f64),
                    top.min(w.top as f64),
                    right.max((w.left + w.width) as f64),
                    bottom.max((w.top + w.height) as f64),
                )
            },
        );
        // Flip the vertical axis: the lower edge becomes y0, the upper edge y1.
        let bounding_box = img_left.is_finite().then(|| crate::types::BoundingBox {
            x0: img_left,
            y0: page_height - img_bottom,
            x1: img_right,
            y1: page_height - img_top,
        });

        if let Some(bb) = bounding_box.as_ref() {
            let bbox_height = (bb.y1 - bb.y0).abs();
            let few_rows = cells.len() <= 3;
            if few_rows && page_height > 0.0 && bbox_height / page_height > 0.5 {
                tracing::trace!(
                    page = page_index,
                    rows = cells.len(),
                    bbox_height,
                    page_height,
                    "heuristic table with <=3 rows spans >50% of page — skipping false positive"
                );
                continue;
            }
        }

        found.push(Table {
            cells,
            markdown,
            // Page numbers are reported 1-based.
            page_number: page_index + 1,
            bounding_box,
        });
    }
    Ok(found)
}
#[cfg(test)]
mod tests {
#[test]
fn test_bounding_box_coordinate_conversion() {
    use crate::types::BoundingBox;

    // Same vertical flip that `extract_tables_from_document` applies:
    // y0 = page_height - bottom, y1 = page_height - top.
    let page_height: f64 = 800.0;
    let (img_left, img_top, img_right, img_bottom): (f64, f64, f64, f64) = (50.0, 100.0, 300.0, 150.0);

    let bbox = BoundingBox {
        x0: img_left,
        y0: page_height - img_bottom,
        x1: img_right,
        y1: page_height - img_top,
    };

    assert_eq!([bbox.x0, bbox.y0, bbox.x1, bbox.y1], [50.0, 650.0, 300.0, 700.0]);
    // After the flip the upper edge must sit above the lower edge.
    assert!(bbox.y1 > bbox.y0);
}
#[test]
fn test_bounding_box_coordinate_conversion_different_scales() {
    use crate::types::BoundingBox;

    // A taller page and a larger rectangle than the basic conversion test.
    let page_height: f64 = 1000.0;
    let (img_left, img_top, img_right, img_bottom): (f64, f64, f64, f64) = (100.0, 50.0, 600.0, 400.0);

    let bbox = BoundingBox {
        x0: img_left,
        y0: page_height - img_bottom,
        x1: img_right,
        y1: page_height - img_top,
    };

    assert_eq!([bbox.x0, bbox.y0, bbox.x1, bbox.y1], [100.0, 600.0, 600.0, 950.0]);
    // The flip must keep the rectangle's height (400 - 50).
    assert_eq!(bbox.y1 - bbox.y0, 350.0);
}
#[test]
fn test_bounding_box_coordinate_conversion_preserves_width() {
    use crate::types::BoundingBox;

    let page_height: f64 = 595.0;
    let (img_left, img_right): (f64, f64) = (72.0, 522.0);
    let (img_top, img_bottom): (f64, f64) = (36.0, 300.0);

    let bbox = BoundingBox {
        x0: img_left,
        y0: page_height - img_bottom,
        x1: img_right,
        y1: page_height - img_top,
    };

    // Only the vertical axis is flipped, so the horizontal extent is untouched.
    let actual_width = bbox.x1 - bbox.x0;
    assert_eq!(actual_width, img_right - img_left);
    assert_eq!(actual_width, 450.0);
}
#[test]
fn test_bounding_box_serialization_round_trip() {
    use crate::types::BoundingBox;

    let original = BoundingBox {
        x0: 10.5,
        y0: 20.25,
        x1: 100.75,
        y1: 200.5,
    };

    // Serialize to JSON and parse it straight back.
    let encoded = serde_json::to_string(&original).unwrap();
    let decoded: BoundingBox = serde_json::from_str(&encoded).unwrap();

    assert_eq!(original, decoded);
    // Check every field individually as well as the whole-struct equality above.
    assert_eq!(
        [decoded.x0, decoded.y0, decoded.x1, decoded.y1],
        [10.5, 20.25, 100.75, 200.5]
    );
}
#[test]
#[cfg(feature = "pdf")]
fn test_has_column_alignment_table_layout() {
    use crate::pdf::table_reconstruct::HocrWord;

    // Builder for a 12-unit-tall word with 95% confidence.
    let word = |text: &str, left, top, width| HocrWord {
        text: text.into(),
        left,
        top,
        width,
        height: 12,
        confidence: 95.0,
    };

    // Three rows whose words start at x = 50, 200 and 400: three columns of three words each.
    let words = vec![
        word("Name", 50, 100, 60),
        word("Age", 200, 100, 40),
        word("City", 400, 100, 50),
        word("Alice", 50, 120, 60),
        word("30", 200, 120, 30),
        word("NYC", 400, 120, 40),
        word("Bob", 50, 140, 50),
        word("25", 200, 140, 30),
        word("LA", 400, 140, 30),
    ];

    assert!(super::has_column_alignment(&words));
}
#[test]
#[cfg(feature = "pdf")]
fn test_has_column_alignment_rejects_two_column_layout() {
    use crate::pdf::table_reconstruct::HocrWord;

    // Builder for a 60x12 word with 95% confidence.
    let word = |text: &str, left, top| HocrWord {
        text: text.into(),
        left,
        top,
        width: 60,
        height: 12,
        confidence: 95.0,
    };

    // Only two distinct left edges (50 and 300): one short of the three columns required.
    let words = vec![
        word("Left", 50, 100),
        word("Right", 300, 100),
        word("More", 50, 120),
        word("Text", 300, 120),
        word("Here", 50, 140),
        word("Also", 300, 140),
    ];

    assert!(!super::has_column_alignment(&words));
}
#[test]
#[cfg(feature = "pdf")]
fn test_has_column_alignment_body_text() {
    use crate::pdf::table_reconstruct::HocrWord;

    // Builder for a word on the single line at top = 100.
    let word = |text: &str, left, width| HocrWord {
        text: text.into(),
        left,
        top: 100,
        width,
        height: 12,
        confidence: 95.0,
    };

    // One line of running text: every word starts at a different x, so no
    // left-edge group ever reaches three members.
    let words = vec![
        word("This", 50, 40),
        word("is", 100, 20),
        word("some", 130, 45),
        word("body", 185, 45),
        word("text", 240, 40),
        word("here", 290, 40),
    ];

    assert!(!super::has_column_alignment(&words));
}
#[test]
#[cfg(feature = "pdf")]
fn test_has_column_alignment_too_few_words() {
    use crate::pdf::table_reconstruct::HocrWord;

    // Builder for a 60x12 word on the line at top = 100.
    let word = |text: &str, left| HocrWord {
        text: text.into(),
        left,
        top: 100,
        width: 60,
        height: 12,
        confidence: 95.0,
    };

    // Two words is below the six-word minimum, so the check bails out early.
    let words = vec![word("Hello", 50), word("World", 300)];

    assert!(!super::has_column_alignment(&words));
}
}