kreuzberg 4.3.0

High-performance document intelligence library for Rust. Extract text, metadata, and structured data from PDFs, Office documents, images, and 75+ formats with async/sync APIs.
Documentation
//! Core PDF extraction functionality.
//!
//! Handles document loading, text extraction, metadata parsing, and table detection.

use crate::Result;
use crate::core::config::ExtractionConfig;
use crate::types::{PageBoundary, PageContent};

#[cfg(feature = "pdf")]
use crate::types::Table;
#[cfg(feature = "pdf")]
use pdfium_render::prelude::*;

#[cfg(feature = "pdf")]
pub(crate) type PdfExtractionPhaseResult = (
    crate::pdf::metadata::PdfExtractionMetadata,
    String,
    Vec<Table>,
    Option<Vec<PageContent>>,
    Option<Vec<PageBoundary>>,
);

/// Extract text, metadata, and tables from a PDF document using a single shared instance.
///
/// This method consolidates all PDF extraction phases (text, metadata, tables) into a single
/// operation using a single PdfDocument instance. This avoids redundant document parsing
/// and pdfium initialization overhead.
///
/// # Performance
///
/// By reusing a single document instance across all extraction phases, we eliminate:
/// - Duplicate document parsing overhead (25-40ms saved)
/// - Redundant pdfium bindings initialization
/// - Multiple page tree traversals
///
/// Expected improvement: 20-30% faster PDF processing.
///
/// # Returns
///
/// A tuple containing:
/// - PDF metadata (title, authors, dates, page structure, etc.)
/// - Native extracted text (or empty if using OCR)
/// - Extracted tables (if OCR feature enabled)
/// - Per-page content (if page extraction configured)
/// - Page boundaries for per-page OCR evaluation
#[cfg(feature = "pdf")]
pub(crate) fn extract_all_from_document(
    document: &PdfDocument,
    config: &ExtractionConfig,
) -> Result<PdfExtractionPhaseResult> {
    let (native_text, boundaries, page_contents, pdf_metadata) =
        crate::pdf::text::extract_text_and_metadata_from_pdf_document(document, Some(config))?;

    let tables = extract_tables_from_document(document, &pdf_metadata)?;

    Ok((pdf_metadata, native_text, tables, page_contents, boundaries))
}

/// Extract tables from PDF document using native text positions.
///
/// This function converts PDF character positions to HocrWord format,
/// then uses the existing table reconstruction logic to detect tables.
///
/// Uses the shared PdfDocument reference (wrapped in Arc<RwLock<>> for thread-safety).
#[cfg(all(feature = "pdf", feature = "ocr"))]
fn extract_tables_from_document(
    document: &PdfDocument,
    _metadata: &crate::pdf::metadata::PdfExtractionMetadata,
) -> Result<Vec<Table>> {
    use crate::ocr::table::{reconstruct_table, table_to_markdown};
    use crate::pdf::table::extract_words_from_page;

    let mut all_tables = Vec::new();

    for (page_index, page) in document.pages().iter().enumerate() {
        let words = extract_words_from_page(&page, 0.0)?;

        if words.is_empty() {
            continue;
        }

        let column_threshold = 50;
        let row_threshold_ratio = 0.5;

        let table_cells = reconstruct_table(&words, column_threshold, row_threshold_ratio);

        if !table_cells.is_empty() {
            let markdown = table_to_markdown(&table_cells);

            all_tables.push(Table {
                cells: table_cells,
                markdown,
                page_number: page_index + 1,
            });
        }
    }

    Ok(all_tables)
}

/// Fallback for when OCR feature is not enabled - returns empty tables.
#[cfg(all(feature = "pdf", not(feature = "ocr")))]
fn extract_tables_from_document(
    _document: &PdfDocument,
    _metadata: &crate::pdf::metadata::PdfExtractionMetadata,
) -> Result<Vec<crate::types::Table>> {
    Ok(vec![])
}