use crate::Result;
use crate::core::config::ExtractionConfig;
use crate::types::{PageBoundary, PageContent};
#[cfg(feature = "pdf")]
use crate::types::Table;
#[cfg(feature = "pdf")]
use pdfium_render::prelude::*;
/// Bundle of everything produced by a single PDF extraction pass.
///
/// This is the success payload of `extract_all_from_document`. The slots are
/// positional, so the order below is part of the contract with callers.
#[cfg(feature = "pdf")]
pub(crate) type PdfExtractionPhaseResult = (
// Metadata gathered while extracting the document text.
crate::pdf::metadata::PdfExtractionMetadata,
// Native text extracted from the document.
String,
// Tables found in the document (empty when table extraction is compiled out).
Vec<Table>,
// Per-page content, when the text extraction step produced it.
Option<Vec<PageContent>>,
// Page boundaries, when the text extraction step produced them.
Option<Vec<PageBoundary>>,
);
/// Runs the full extraction phase over an already-opened PDF document.
///
/// Text, page boundaries, per-page content and metadata are obtained from
/// `crate::pdf::text::extract_text_and_metadata_from_pdf_document`; tables are
/// then collected with `extract_tables_from_document`. The pieces are returned
/// in the slot order defined by `PdfExtractionPhaseResult`.
///
/// # Errors
///
/// Propagates any error returned by the text/metadata extraction step or by
/// the table extraction step.
#[cfg(feature = "pdf")]
pub(crate) fn extract_all_from_document(
    document: &PdfDocument,
    config: &ExtractionConfig,
) -> Result<PdfExtractionPhaseResult> {
    let text_phase =
        crate::pdf::text::extract_text_and_metadata_from_pdf_document(document, Some(config))?;
    let (text, page_boundaries, pages, metadata) = text_phase;

    // Table extraction only borrows the metadata, so it can still be moved
    // into the result tuple afterwards.
    let tables = extract_tables_from_document(document, &metadata)?;

    Ok((metadata, text, tables, pages, page_boundaries))
}
/// Collects at most one reconstructed table per page of `document`.
///
/// For every page, the words are pulled out with `extract_words_from_page`
/// and handed to `reconstruct_table`. Pages that yield no words, or whose
/// reconstruction comes back empty, contribute nothing. Each resulting
/// [`Table`] carries its cells, a markdown rendering of those cells, and a
/// 1-based page number.
///
/// The `_metadata` argument is currently unused.
///
/// # Errors
///
/// Propagates any error returned by `extract_words_from_page`.
#[cfg(all(feature = "pdf", feature = "ocr"))]
fn extract_tables_from_document(
    document: &PdfDocument,
    _metadata: &crate::pdf::metadata::PdfExtractionMetadata,
) -> Result<Vec<Table>> {
    use crate::ocr::table::{reconstruct_table, table_to_markdown};
    use crate::pdf::table::extract_words_from_page;

    // Tuning values passed to `reconstruct_table`; identical for every page.
    let column_threshold = 50;
    let row_threshold_ratio = 0.5;

    let mut tables = Vec::new();
    for (index, page) in document.pages().iter().enumerate() {
        // NOTE(review): the meaning of the `0.0` argument is defined by
        // `extract_words_from_page` — confirm against that function.
        let page_words = extract_words_from_page(&page, 0.0)?;
        if page_words.is_empty() {
            continue;
        }

        let cells = reconstruct_table(&page_words, column_threshold, row_threshold_ratio);
        if cells.is_empty() {
            continue;
        }

        // Render the markdown before `cells` is moved into the table.
        let markdown = table_to_markdown(&cells);
        tables.push(Table {
            cells,
            markdown,
            // `enumerate` counts from zero; page numbers are reported 1-based.
            page_number: index + 1,
        });
    }

    Ok(tables)
}
/// Stand-in used when the `pdf` feature is enabled without `ocr`.
///
/// Both arguments are ignored and the result is always an empty list of
/// tables, so callers can invoke this function regardless of whether the
/// `ocr` feature is compiled in.
#[cfg(all(feature = "pdf", not(feature = "ocr")))]
fn extract_tables_from_document(
    _document: &PdfDocument,
    _metadata: &crate::pdf::metadata::PdfExtractionMetadata,
) -> Result<Vec<crate::types::Table>> {
    Ok(Vec::new())
}