use crate::Result;
use crate::core::config::{ExtractionConfig, OutputFormat};
use crate::types::{PageBoundary, PageContent, PdfAnnotation};
#[cfg(feature = "pdf")]
use crate::types::Table;
#[cfg(feature = "pdf")]
use pdfium_render::prelude::*;
/// Positional bundle of everything `extract_all_from_document` hands back.
///
/// Elements, in order:
/// 1. extraction metadata returned alongside the native text,
/// 2. the natively extracted text,
/// 3. tables detected by `extract_tables_from_document`,
/// 4. per-page content, when the text extraction step produced it,
/// 5. page boundaries, when the text extraction step produced them,
/// 6. pre-rendered structured markdown; `None` when the output format is not
///    Markdown/Djot/Html, when `force_ocr` is set, or when rendering failed or
///    produced only whitespace,
/// 7. `true` when the sampled characters suggest font/Unicode mapping problems,
/// 8. annotations; `None` unless annotation extraction is enabled in the PDF
///    options and at least one annotation was found.
#[cfg(feature = "pdf")]
pub(crate) type PdfExtractionPhaseResult = (
crate::pdf::metadata::PdfExtractionMetadata,
String,
Vec<Table>,
Option<Vec<PageContent>>,
Option<Vec<PageBoundary>>,
Option<String>, bool, Option<Vec<PdfAnnotation>>, );
/// Runs every pdfium-backed extraction step against an already-opened document.
///
/// The steps run in this order:
/// 1. native text, page boundaries, per-page content and metadata,
/// 2. table detection,
/// 3. structured markdown rendering (only for Markdown/Djot/Html output and
///    only when `force_ocr` is off),
/// 4. a font-encoding health check,
/// 5. annotation extraction (only when enabled in the PDF options).
///
/// # Errors
///
/// Propagates failures from text/metadata extraction and from table
/// extraction. A markdown rendering failure is not an error: it is logged and
/// the pre-rendered markdown slot is left as `None`.
#[cfg(feature = "pdf")]
pub(crate) fn extract_all_from_document(
    document: &PdfDocument,
    config: &ExtractionConfig,
) -> Result<PdfExtractionPhaseResult> {
    let (native_text, boundaries, page_contents, pdf_metadata) =
        crate::pdf::text::extract_text_and_metadata_from_pdf_document(document, Some(config))?;
    let tables = extract_tables_from_document(document, &pdf_metadata)?;

    // `Option<&_>` is `Copy`, so this one borrow serves every lookup below.
    let pdf_options = config.pdf_options.as_ref();

    let structured_output_requested = matches!(
        config.output_format,
        OutputFormat::Markdown | OutputFormat::Djot | OutputFormat::Html
    );

    let pre_rendered_markdown = if !structured_output_requested || config.force_ocr {
        None
    } else {
        // Fall back to 4 clusters when no hierarchy settings are configured.
        let cluster_count = pdf_options
            .and_then(|opts| opts.hierarchy.as_ref())
            .map_or(4, |hierarchy| hierarchy.k_clusters);
        let (top_margin, bottom_margin) = match pdf_options {
            Some(opts) => (opts.top_margin_fraction, opts.bottom_margin_fraction),
            None => (None, None),
        };
        // A marker format is only forwarded when page markers are switched on.
        let page_marker_format = match config.pages.as_ref() {
            Some(pages) if pages.insert_page_markers => Some(pages.marker_format.as_str()),
            _ => None,
        };

        let rendered = crate::pdf::markdown::render_document_as_markdown_with_tables(
            document,
            cluster_count,
            &tables,
            top_margin,
            bottom_margin,
            page_marker_format,
        );
        match rendered {
            Err(e) => {
                tracing::warn!("Markdown rendering failed: {:?}, will fall back to plain text", e);
                None
            }
            Ok(md) if md.trim().is_empty() => {
                tracing::warn!("Markdown rendering produced empty output, will fall back to plain text");
                None
            }
            Ok(md) => Some(md),
        }
    };

    let has_font_encoding_issues = sample_unicode_map_errors(document);

    let annotations = match pdf_options {
        Some(opts) if opts.extract_annotations => {
            let found = crate::pdf::annotations::extract_annotations_from_document(document);
            // An empty result is reported as "no annotations" rather than `Some(vec![])`.
            (!found.is_empty()).then_some(found)
        }
        _ => None,
    };

    Ok((
        pdf_metadata,
        native_text,
        tables,
        page_contents,
        boundaries,
        pre_rendered_markdown,
        has_font_encoding_issues,
        annotations,
    ))
}
/// Reports whether any page looks like it has broken font-to-Unicode mapping.
///
/// Each page is probed at an even stride so that at most
/// `MAX_SAMPLES_PER_PAGE` non-generated characters are inspected. A page is
/// flagged when at least five characters were inspected and more than
/// `ERROR_RATIO_THRESHOLD` of them report a Unicode map error. Pages whose
/// text cannot be loaded, or that contain no characters, are skipped.
///
/// Returns `true` as soon as one page is flagged, `false` otherwise.
#[cfg(feature = "pdf")]
fn sample_unicode_map_errors(document: &PdfDocument) -> bool {
    const MAX_SAMPLES_PER_PAGE: usize = 50;
    const ERROR_RATIO_THRESHOLD: f32 = 0.3;
    const MIN_SAMPLES_FOR_VERDICT: usize = 5;

    document.pages().iter().any(|page| {
        let Ok(text) = page.text() else {
            return false;
        };
        let glyphs = text.chars();
        let total = glyphs.len();
        if total == 0 {
            return false;
        }

        let stride = (total / MAX_SAMPLES_PER_PAGE).max(1);
        let (inspected, unmapped) = (0..total)
            .step_by(stride)
            // Characters that fail to load are ignored, not counted.
            .filter_map(|index| glyphs.get(index).ok())
            // Generated characters are not part of the sample.
            .filter(|glyph| !glyph.is_generated().unwrap_or(false))
            .take(MAX_SAMPLES_PER_PAGE)
            .fold((0usize, 0usize), |(seen, bad), glyph| {
                let is_bad = glyph.has_unicode_map_error().unwrap_or(false);
                (seen + 1, bad + usize::from(is_bad))
            });

        inspected >= MIN_SAMPLES_FOR_VERDICT
            && (unmapped as f32 / inspected as f32) > ERROR_RATIO_THRESHOLD
    })
}
/// Cheap pre-filter: do the words' left edges line up into at least three columns?
///
/// Left edges are grouped greedily: a word joins the first existing group whose
/// anchor (the left edge of the word that opened the group) is within
/// `BUCKET_TOLERANCE` of its own left edge, otherwise it opens a new group.
/// The layout counts as column-aligned when at least three groups hold three
/// or more words each. Inputs with fewer than six words are always rejected.
#[cfg(all(feature = "pdf", feature = "ocr"))]
fn has_column_alignment(words: &[crate::ocr::table::HocrWord]) -> bool {
    const MIN_WORDS: usize = 6;
    const BUCKET_TOLERANCE: u32 = 15;
    const MIN_WORDS_PER_COLUMN: usize = 3;
    const MIN_COLUMNS: usize = 3;

    if words.len() < MIN_WORDS {
        return false;
    }

    // (anchor left edge, number of words assigned to that anchor)
    let mut columns: Vec<(u32, usize)> = Vec::new();
    for word in words {
        let left_edge = word.left;
        let existing = columns
            .iter()
            .position(|&(anchor, _)| left_edge.abs_diff(anchor) <= BUCKET_TOLERANCE);
        match existing {
            Some(slot) => columns[slot].1 += 1,
            None => columns.push((left_edge, 1)),
        }
    }

    let populated_columns = columns
        .into_iter()
        .filter(|&(_, hits)| hits >= MIN_WORDS_PER_COLUMN)
        .count();
    populated_columns >= MIN_COLUMNS
}
/// Detects at most one table per page from the page's native word positions.
///
/// For every page the words are extracted, pages that are too sparse or show
/// no column alignment are skipped, and the remaining words are run through
/// table reconstruction and post-processing. Each surviving table is recorded
/// with its markdown rendering, 1-based page number and a bounding box.
///
/// The bounding box is the envelope of all words on the page, with the
/// vertical axis flipped against the page height (`y = page_height - word_y`).
/// NOTE(review): this presumably converts top-down word coordinates into
/// bottom-up PDF coordinates — confirm against `extract_words_from_page`.
///
/// `_metadata` is accepted for signature parity and is not used.
///
/// # Errors
///
/// Returns the error from `extract_words_from_page` if word extraction fails
/// on any page; no tables are returned in that case.
#[cfg(all(feature = "pdf", feature = "ocr"))]
fn extract_tables_from_document(
    document: &PdfDocument,
    _metadata: &crate::pdf::metadata::PdfExtractionMetadata,
) -> Result<Vec<Table>> {
    use crate::ocr::table::{post_process_table, reconstruct_table, table_to_markdown};
    use crate::pdf::table::extract_words_from_page;

    const MIN_WORDS_PER_PAGE: usize = 6;

    let mut all_tables = Vec::new();

    for (page_index, page) in document.pages().iter().enumerate() {
        // NOTE(review): the meaning of the `0.0` argument is not visible here — confirm.
        let words = extract_words_from_page(&page, 0.0)?;
        if words.len() < MIN_WORDS_PER_PAGE || !has_column_alignment(&words) {
            continue;
        }

        // Arguments: column threshold = 50, row threshold ratio = 0.5.
        let raw_cells = reconstruct_table(&words, 50, 0.5);
        if raw_cells.is_empty() || raw_cells[0].is_empty() {
            continue;
        }
        let Some(cells) = post_process_table(raw_cells) else {
            continue;
        };
        let markdown = table_to_markdown(&cells);

        let page_height = page.height().value as f64;

        // Envelope of every word on the page, gathered in a single pass.
        let (img_left, img_top, img_right, img_bottom) = words.iter().fold(
            (f64::INFINITY, f64::INFINITY, f64::NEG_INFINITY, f64::NEG_INFINITY),
            |(left, top, right, bottom), w| {
                (
                    left.min(w.left as f64),
                    top.min(w.top as f64),
                    right.max((w.left + w.width) as f64),
                    bottom.max((w.top + w.height) as f64),
                )
            },
        );

        let bounding_box = img_left.is_finite().then(|| crate::types::BoundingBox {
            x0: img_left,
            y0: page_height - img_bottom,
            x1: img_right,
            y1: page_height - img_top,
        });

        all_tables.push(Table {
            cells,
            markdown,
            page_number: page_index + 1,
            bounding_box,
        });
    }

    Ok(all_tables)
}
/// Table detection stub used when the `pdf` feature is on but `ocr` is off.
///
/// Both arguments are ignored and the result is always an empty list, so
/// callers can invoke table extraction unconditionally.
#[cfg(all(feature = "pdf", not(feature = "ocr")))]
fn extract_tables_from_document(
    _document: &PdfDocument,
    _metadata: &crate::pdf::metadata::PdfExtractionMetadata,
) -> Result<Vec<crate::types::Table>> {
    Ok(Vec::new())
}
#[cfg(test)]
mod tests {
    use crate::types::BoundingBox;

    /// Builds a bounding box by flipping top-down `top`/`bottom` values against
    /// `page_height`, using the same arithmetic as the table extraction code.
    ///
    /// The coordinate tests below exercise this formula only; they do not call
    /// the extraction functions themselves.
    fn flip_into_page_space(page_height: f64, left: f64, top: f64, right: f64, bottom: f64) -> BoundingBox {
        BoundingBox {
            x0: left,
            y0: page_height - bottom,
            x1: right,
            y1: page_height - top,
        }
    }

    /// Builds a word with the fixed height (12) and confidence (95.0) that
    /// every fixture in this module uses; only text, left, top and width vary.
    #[cfg(all(feature = "pdf", feature = "ocr"))]
    macro_rules! word {
        ($text:expr, $left:expr, $top:expr, $width:expr) => {
            crate::ocr::table::HocrWord {
                text: $text.into(),
                left: $left,
                top: $top,
                width: $width,
                height: 12,
                confidence: 95.0,
            }
        };
    }

    #[test]
    fn test_bounding_box_coordinate_conversion() {
        let bbox = flip_into_page_space(800.0, 50.0, 100.0, 300.0, 150.0);

        assert_eq!(bbox.x0, 50.0);
        assert_eq!(bbox.y0, 650.0);
        assert_eq!(bbox.x1, 300.0);
        assert_eq!(bbox.y1, 700.0);
        // After the flip the upper edge must sit above the lower edge.
        assert!(bbox.y1 > bbox.y0);
    }

    #[test]
    fn test_bounding_box_coordinate_conversion_different_scales() {
        let bbox = flip_into_page_space(1000.0, 100.0, 50.0, 600.0, 400.0);

        assert_eq!(bbox.x0, 100.0);
        assert_eq!(bbox.y0, 600.0);
        assert_eq!(bbox.x1, 600.0);
        assert_eq!(bbox.y1, 950.0);
        // Height is preserved by the flip: 400 - 50 = 350.
        assert_eq!(bbox.y1 - bbox.y0, 350.0);
    }

    #[test]
    fn test_bounding_box_coordinate_conversion_preserves_width() {
        let (left, right) = (72.0_f64, 522.0_f64);
        let bbox = flip_into_page_space(595.0, left, 36.0, right, 300.0);

        let actual_width = bbox.x1 - bbox.x0;
        assert_eq!(actual_width, right - left);
        assert_eq!(actual_width, 450.0);
    }

    #[test]
    fn test_bounding_box_serialization_round_trip() {
        let original = BoundingBox {
            x0: 10.5,
            y0: 20.25,
            x1: 100.75,
            y1: 200.5,
        };

        let json = serde_json::to_string(&original).unwrap();
        let deserialized: BoundingBox = serde_json::from_str(&json).unwrap();

        assert_eq!(original, deserialized);
        assert_eq!(deserialized.x0, 10.5);
        assert_eq!(deserialized.y0, 20.25);
        assert_eq!(deserialized.x1, 100.75);
        assert_eq!(deserialized.y1, 200.5);
    }

    #[test]
    #[cfg(all(feature = "pdf", feature = "ocr"))]
    fn test_has_column_alignment_table_layout() {
        // Three rows, each with a word starting at x = 50, 200 and 400.
        let words = vec![
            word!("Name", 50, 100, 60),
            word!("Age", 200, 100, 40),
            word!("City", 400, 100, 50),
            word!("Alice", 50, 120, 60),
            word!("30", 200, 120, 30),
            word!("NYC", 400, 120, 40),
            word!("Bob", 50, 140, 50),
            word!("25", 200, 140, 30),
            word!("LA", 400, 140, 30),
        ];
        assert!(super::has_column_alignment(&words));
    }

    #[test]
    #[cfg(all(feature = "pdf", feature = "ocr"))]
    fn test_has_column_alignment_rejects_two_column_layout() {
        // Only two distinct left edges (50 and 300): one short of the three required.
        let words = vec![
            word!("Left", 50, 100, 60),
            word!("Right", 300, 100, 60),
            word!("More", 50, 120, 60),
            word!("Text", 300, 120, 60),
            word!("Here", 50, 140, 60),
            word!("Also", 300, 140, 60),
        ];
        assert!(!super::has_column_alignment(&words));
    }

    #[test]
    #[cfg(all(feature = "pdf", feature = "ocr"))]
    fn test_has_column_alignment_body_text() {
        // A single line of running text: no left edge recurs three times.
        let words = vec![
            word!("This", 50, 100, 40),
            word!("is", 100, 100, 20),
            word!("some", 130, 100, 45),
            word!("body", 185, 100, 45),
            word!("text", 240, 100, 40),
            word!("here", 290, 100, 40),
        ];
        assert!(!super::has_column_alignment(&words));
    }

    #[test]
    #[cfg(all(feature = "pdf", feature = "ocr"))]
    fn test_has_column_alignment_too_few_words() {
        // Fewer than six words is rejected before any grouping happens.
        let words = vec![word!("Hello", 50, 100, 60), word!("World", 300, 100, 60)];
        assert!(!super::has_column_alignment(&words));
    }
}