use crate::Result;
use crate::core::config::{ExtractionConfig, OutputFormat};
use crate::types::{PageBoundary, PageContent};
#[cfg(feature = "pdf")]
use crate::types::Table;
#[cfg(feature = "pdf")]
use pdfium_render::prelude::*;
/// Bundle of everything produced by `extract_all_from_document` in one pass
/// over a PDF document.
///
/// The element order matches the tuple constructed at the end of that
/// function; see the per-element comments below.
#[cfg(feature = "pdf")]
pub(crate) type PdfExtractionPhaseResult = (
// 0: metadata returned alongside the text by
//    `crate::pdf::text::extract_text_and_metadata_from_pdf_document`.
crate::pdf::metadata::PdfExtractionMetadata,
// 1: the natively extracted text (`native_text`).
String,
// 2: tables found by `extract_tables_from_document` (always empty when the
//    `ocr` feature is disabled).
Vec<Table>,
// 3: optional per-page contents (`page_contents`).
Option<Vec<PageContent>>,
// 4: optional page boundaries (`boundaries`).
Option<Vec<PageBoundary>>,
// 5: pre-rendered Markdown; `None` unless Markdown output was requested,
//    OCR was not forced, and rendering succeeded with non-blank output.
Option<String>, );
/// Runs the native extraction phase over an already-opened PDF document.
///
/// Steps, in order:
/// 1. Extract text, page boundaries, per-page contents and metadata via
///    `crate::pdf::text::extract_text_and_metadata_from_pdf_document`.
/// 2. Extract tables via `extract_tables_from_document`.
/// 3. If the configured output format is Markdown and OCR is not forced,
///    try to pre-render the document as Markdown (tables included).
///
/// # Errors
///
/// Propagates any error from text/metadata extraction or table extraction.
/// A Markdown rendering failure (or blank output) is *not* an error: it is
/// logged at `warn` level and the Markdown slot of the result is `None`.
#[cfg(feature = "pdf")]
pub(crate) fn extract_all_from_document(
    document: &PdfDocument,
    config: &ExtractionConfig,
) -> Result<PdfExtractionPhaseResult> {
    let (native_text, boundaries, page_contents, pdf_metadata) =
        crate::pdf::text::extract_text_and_metadata_from_pdf_document(document, Some(config))?;
    let tables = extract_tables_from_document(document, &pdf_metadata)?;

    // Markdown is only pre-rendered from the native text layer; a forced OCR
    // run skips it entirely.
    let markdown_requested = config.output_format == OutputFormat::Markdown && !config.force_ocr;

    let pre_rendered_markdown = if markdown_requested {
        // Cluster count comes from the hierarchy options when present,
        // otherwise falls back to 4.
        let hierarchy = config
            .pdf_options
            .as_ref()
            .and_then(|opts| opts.hierarchy.as_ref());
        let cluster_count = match hierarchy {
            Some(h) => h.k_clusters,
            None => 4,
        };

        let rendered = crate::pdf::markdown::render_document_as_markdown_with_tables(
            document,
            cluster_count,
            &tables,
        );
        match rendered {
            Err(e) => {
                tracing::warn!("Markdown rendering failed: {:?}, will fall back to plain text", e);
                None
            }
            Ok(md) => {
                if md.trim().is_empty() {
                    tracing::warn!("Markdown rendering produced empty output, will fall back to plain text");
                    None
                } else {
                    Some(md)
                }
            }
        }
    } else {
        None
    };

    Ok((
        pdf_metadata,
        native_text,
        tables,
        page_contents,
        boundaries,
        pre_rendered_markdown,
    ))
}
/// Attempts to reconstruct one table per page from the page's word layout.
///
/// For every page of `document`:
/// - words are pulled with `extract_words_from_page`; pages without words are
///   skipped;
/// - `reconstruct_table` groups the words into a grid of cells;
/// - grids with fewer than two rows, or with any row shorter than two cells,
///   are discarded;
/// - the surviving grid is rendered with `table_to_markdown` and stored with
///   its 1-based page number and a bounding box.
///
/// `_metadata` is accepted for signature parity with the non-OCR variant and
/// is not read.
///
/// # Errors
///
/// Returns the first error raised by `extract_words_from_page`; no later page
/// is processed in that case.
#[cfg(all(feature = "pdf", feature = "ocr"))]
fn extract_tables_from_document(
    document: &PdfDocument,
    _metadata: &crate::pdf::metadata::PdfExtractionMetadata,
) -> Result<Vec<Table>> {
    use crate::ocr::table::{reconstruct_table, table_to_markdown};
    use crate::pdf::table::extract_words_from_page;

    // Tuning knobs handed to `reconstruct_table`, plus the minimum grid shape
    // that is accepted as a table.
    let column_gap = 50;
    let row_gap_ratio = 0.5;
    let required_rows = 2;
    let required_cols = 2;

    let mut found = Vec::new();

    for (zero_based_page, page) in document.pages().iter().enumerate() {
        // NOTE(review): the 0.0 argument is presumably a minimum-confidence
        // cutoff - confirm against `extract_words_from_page`.
        let words = extract_words_from_page(&page, 0.0)?;
        if words.is_empty() {
            continue;
        }

        let grid = reconstruct_table(&words, column_gap, row_gap_ratio);
        let too_small =
            grid.len() < required_rows || grid.iter().any(|row| row.len() < required_cols);
        if too_small {
            continue;
        }

        let markdown = table_to_markdown(&grid);

        // Extent of *all* words on the page (not only those that landed in
        // the grid), gathered in a single pass.
        let page_height = page.height().value as f64;
        let (min_left, min_top, max_right, max_bottom) = words.iter().fold(
            (
                f64::INFINITY,
                f64::INFINITY,
                f64::NEG_INFINITY,
                f64::NEG_INFINITY,
            ),
            |(left, top, right, bottom), w| {
                (
                    left.min(w.left as f64),
                    top.min(w.top as f64),
                    right.max((w.left + w.width) as f64),
                    bottom.max((w.top + w.height) as f64),
                )
            },
        );

        // Word coordinates grow downwards from the top edge; the stored box
        // is flipped about the page height so that y1 > y0.
        let bounding_box = min_left.is_finite().then(|| crate::types::BoundingBox {
            x0: min_left,
            y0: page_height - max_bottom,
            x1: max_right,
            y1: page_height - min_top,
        });

        found.push(Table {
            cells: grid,
            markdown,
            page_number: zero_based_page + 1,
            bounding_box,
        });
    }

    Ok(found)
}
/// Fallback used when the `pdf` feature is enabled without `ocr`.
///
/// The table reconstruction helpers live under `crate::ocr::table`, so this
/// build performs no table detection: both arguments are ignored and an
/// empty list is always returned.
#[cfg(all(feature = "pdf", not(feature = "ocr")))]
fn extract_tables_from_document(
    _document: &PdfDocument,
    _metadata: &crate::pdf::metadata::PdfExtractionMetadata,
) -> Result<Vec<crate::types::Table>> {
    Ok(Vec::new())
}
#[cfg(test)]
mod tests {
    use crate::types::BoundingBox;

    /// Builds a bounding box from top-down coordinates by flipping the Y axis
    /// about `page_height`: the bottom edge becomes `y0` and the top edge
    /// becomes `y1`, while X values pass through unchanged.
    fn flipped_bbox(page_height: f64, left: f64, top: f64, right: f64, bottom: f64) -> BoundingBox {
        BoundingBox {
            x0: left,
            y0: page_height - bottom,
            x1: right,
            y1: page_height - top,
        }
    }

    /// An 800-unit-high page: a box spanning y = 100..150 from the top maps
    /// to y = 650..700 from the bottom.
    #[test]
    fn test_bounding_box_coordinate_conversion() {
        let bbox = flipped_bbox(800.0, 50.0, 100.0, 300.0, 150.0);

        assert_eq!(bbox.x0, 50.0);
        assert_eq!(bbox.y0, 650.0);
        assert_eq!(bbox.x1, 300.0);
        assert_eq!(bbox.y1, 700.0);
        // After the flip the upper edge must have the larger Y value.
        assert!(bbox.y1 > bbox.y0);
    }

    /// Same conversion on a 1000-unit page; the box height (350) survives
    /// the flip.
    #[test]
    fn test_bounding_box_coordinate_conversion_different_scales() {
        let bbox = flipped_bbox(1000.0, 100.0, 50.0, 600.0, 400.0);

        assert_eq!(bbox.x0, 100.0);
        assert_eq!(bbox.y0, 600.0);
        assert_eq!(bbox.x1, 600.0);
        assert_eq!(bbox.y1, 950.0);
        assert_eq!(bbox.y1 - bbox.y0, 350.0);
    }

    /// The flip only touches Y, so the horizontal extent is unchanged.
    #[test]
    fn test_bounding_box_coordinate_conversion_preserves_width() {
        let (left, right) = (72.0_f64, 522.0_f64);
        let bbox = flipped_bbox(595.0, left, 36.0, right, 300.0);

        let expected_width = right - left;
        let actual_width = bbox.x1 - bbox.x0;
        assert_eq!(actual_width, expected_width);
        assert_eq!(actual_width, 450.0);
    }

    /// A bounding box serialised to JSON and parsed back compares equal,
    /// field by field.
    #[test]
    fn test_bounding_box_serialization_round_trip() {
        let original = BoundingBox {
            x0: 10.5,
            y0: 20.25,
            x1: 100.75,
            y1: 200.5,
        };

        let json = serde_json::to_string(&original).unwrap();
        let deserialized: BoundingBox = serde_json::from_str(&json).unwrap();

        assert_eq!(original, deserialized);
        assert_eq!(deserialized.x0, 10.5);
        assert_eq!(deserialized.y0, 20.25);
        assert_eq!(deserialized.x1, 100.75);
        assert_eq!(deserialized.y1, 200.5);
    }
}