kreuzberg 4.4.6

//! PDF text extraction module.
//!
//! This module provides functions to extract text content from PDF files using the pdfium-render library.

use super::bindings::{PdfiumHandle, bind_pdfium};
use super::error::{PdfError, Result};
use crate::core::config::PageConfig;
use crate::pdf::metadata::PdfExtractionMetadata;
use crate::types::{PageBoundary, PageContent};
use memchr::memmem;
use pdfium_render::prelude::*;
use std::borrow::Cow;

/// Result type for PDF text extraction with optional page tracking.
type PdfTextExtractionResult = (String, Option<Vec<PageBoundary>>, Option<Vec<PageContent>>);

pub struct PdfTextExtractor<'a> {
    pdfium: PdfiumHandle<'a>,
}

impl PdfTextExtractor<'static> {
    pub fn new() -> Result<Self> {
        let pdfium = bind_pdfium(PdfError::TextExtractionFailed, "text extraction")?;
        Ok(PdfTextExtractor { pdfium })
    }
}

impl PdfTextExtractor<'_> {
    pub fn extract_text(&self, pdf_bytes: &[u8]) -> Result<String> {
        self.extract_text_with_password(pdf_bytes, None)
    }

    pub fn extract_text_with_password(&self, pdf_bytes: &[u8], password: Option<&str>) -> Result<String> {
        let document = self.pdfium.load_pdf_from_byte_slice(pdf_bytes, password).map_err(|e| {
            let err_msg = super::error::format_pdfium_error(e);
            if (err_msg.contains("password") || err_msg.contains("Password")) && password.is_some() {
                PdfError::InvalidPassword
            } else if err_msg.contains("password") || err_msg.contains("Password") {
                PdfError::PasswordRequired
            } else {
                PdfError::InvalidPdf(err_msg)
            }
        })?;

        let (content, _, _) = extract_text_from_pdf_document(&document, None, None)?;
        Ok(content)
    }

    pub fn extract_text_with_passwords(&self, pdf_bytes: &[u8], passwords: &[&str]) -> Result<String> {
        let mut last_error = None;

        for password in passwords {
            match self.extract_text_with_password(pdf_bytes, Some(password)) {
                Ok(text) => return Ok(text),
                Err(e) => {
                    last_error = Some(e);
                    continue;
                }
            }
        }

        if let Some(err) = last_error {
            return Err(err);
        }

        self.extract_text(pdf_bytes)
    }

    pub fn get_page_count(&self, pdf_bytes: &[u8]) -> Result<usize> {
        let document = self.pdfium.load_pdf_from_byte_slice(pdf_bytes, None).map_err(|e| {
            let err_msg = super::error::format_pdfium_error(e);
            if err_msg.contains("password") || err_msg.contains("Password") {
                PdfError::PasswordRequired
            } else {
                PdfError::InvalidPdf(err_msg)
            }
        })?;

        Ok(document.pages().len() as usize)
    }
}

pub fn extract_text_from_pdf(pdf_bytes: &[u8]) -> Result<String> {
    let extractor = PdfTextExtractor::new()?;
    extractor.extract_text(pdf_bytes)
}

pub fn extract_text_from_pdf_with_password(pdf_bytes: &[u8], password: &str) -> Result<String> {
    let extractor = PdfTextExtractor::new()?;
    extractor.extract_text_with_password(pdf_bytes, Some(password))
}

pub fn extract_text_from_pdf_with_passwords(pdf_bytes: &[u8], passwords: &[&str]) -> Result<String> {
    let extractor = PdfTextExtractor::new()?;
    extractor.extract_text_with_passwords(pdf_bytes, passwords)
}

/// Result type for unified PDF text and metadata extraction.
///
/// Contains text, optional page boundaries, optional per-page content, and metadata.
pub type PdfUnifiedExtractionResult = (
    String,
    Option<Vec<PageBoundary>>,
    Option<Vec<PageContent>>,
    PdfExtractionMetadata,
);

/// Extract text and metadata from PDF document in a single pass.
///
/// This is an optimized function that extracts both text and metadata in one pass
/// through the document, avoiding redundant document parsing. It combines the
/// functionality of `extract_text_from_pdf_document` and
/// `extract_metadata_from_document` into a single unified operation.
///
/// # Arguments
///
/// * `document` - The PDF document to extract from
/// * `extraction_config` - Optional extraction configuration for hierarchy and page tracking
///
/// # Returns
///
/// A tuple containing:
/// - The extracted text content (String)
/// - Optional page boundaries when page tracking is enabled (Vec<PageBoundary>)
/// - Optional per-page content when extract_pages is enabled (Vec<PageContent>)
/// - Complete extraction metadata (PdfExtractionMetadata)
///
/// # Performance
///
/// This function is optimized for single-pass extraction. It performs all document
/// scanning in one iteration, avoiding redundant pdfium operations compared to
/// calling text and metadata extraction separately.
pub fn extract_text_and_metadata_from_pdf_document(
    document: &PdfDocument<'_>,
    extraction_config: Option<&crate::core::config::ExtractionConfig>,
) -> Result<PdfUnifiedExtractionResult> {
    let page_config = extraction_config.and_then(|c| c.pages.as_ref());
    let (text, boundaries, page_contents) = extract_text_from_pdf_document(document, page_config, extraction_config)?;

    let metadata = crate::pdf::metadata::extract_metadata_from_document_impl(document, boundaries.as_deref(), &text)?;

    Ok((text, boundaries, page_contents, metadata))
}

/// Extract text from PDF document with optional page boundary tracking.
///
/// # Arguments
///
/// * `document` - The PDF document to extract text from
/// * `page_config` - Optional page configuration for boundary tracking and page markers
/// * `extraction_config` - Optional extraction configuration for hierarchy detection
///
/// # Returns
///
/// A tuple containing:
/// - The extracted text content (String)
/// - Optional page boundaries when page tracking is enabled (Vec<PageBoundary>)
/// - Optional per-page content when extract_pages is enabled (Vec<PageContent>)
///
/// # Implementation Details
///
/// Uses lazy page-by-page iteration to reduce memory footprint. Pages are processed
/// one at a time and released after extraction, rather than accumulating all pages
/// in memory. This approach saves 40-50MB for large documents while improving
/// performance by 15-25% through reduced upfront work.
///
/// When page_config is None, uses fast path with minimal overhead.
/// When page_config is Some, tracks byte offsets using .len() for O(1) performance (UTF-8 valid boundaries).
pub fn extract_text_from_pdf_document(
    document: &PdfDocument<'_>,
    page_config: Option<&PageConfig>,
    extraction_config: Option<&crate::core::config::ExtractionConfig>,
) -> Result<PdfTextExtractionResult> {
    if let Some(config) = page_config {
        extract_text_lazy_with_tracking(document, config, extraction_config)
    } else {
        extract_text_lazy_fast_path(document)
    }
}

/// Strip `/Rotate` entries from PDF bytes to work around a pdfium bug where
/// `FPDFText_CountChars` returns 0 for pages with 90° or 270° rotation.
///
/// PDF `/Rotate` is a display hint — the actual text coordinates in content streams
/// are in the original (unrotated) space. Pdfium's text extraction layer fails to
/// account for this, so we remove the rotation before loading the document.
///
/// Uses direct byte-level patching to handle incremental PDF updates correctly
/// (where lopdf may miss updated objects). Replaces `/Rotate <value>` with spaces
/// to preserve file offsets and cross-reference table validity.
///
/// Returns `Cow::Borrowed` if no `/Rotate` entries were patched,
/// or `Cow::Owned` with patched bytes if any `/Rotate` entries were blanked.
pub(crate) fn strip_page_rotation(pdf_bytes: &[u8]) -> Cow<'_, [u8]> {
    // Quick scan: if /Rotate doesn't appear at all, skip allocation.
    if !has_rotate_marker(pdf_bytes) {
        return Cow::Borrowed(pdf_bytes);
    }

    let mut patched = pdf_bytes.to_vec();
    let mut modified = false;
    let mut pos = 0;

    while pos + 7 < patched.len() {
        let Some(offset) = find_rotate_offset(&patched, pos) else {
            break;
        };

        let key_end = offset + 7;

        // Skip whitespace between key and value
        let mut val_start = key_end;
        while val_start < patched.len() && patched[val_start].is_ascii_whitespace() {
            val_start += 1;
        }

        // Read the integer value
        let mut val_end = val_start;
        if val_end < patched.len() && patched[val_end] == b'-' {
            val_end += 1;
        }
        while val_end < patched.len() && patched[val_end].is_ascii_digit() {
            val_end += 1;
        }

        // Only blank if we found a numeric value after /Rotate
        if val_end > val_start {
            for byte in &mut patched[offset..val_end] {
                *byte = b' ';
            }
            modified = true;
        }

        pos = if val_end > offset + 7 { val_end } else { offset + 7 };
    }

    if modified {
        Cow::Owned(patched)
    } else {
        Cow::Borrowed(pdf_bytes)
    }
}

/// Check if the raw PDF bytes contain any `/Rotate` marker.
fn has_rotate_marker(bytes: &[u8]) -> bool {
    memmem::find(bytes, b"/Rotate").is_some()
}

/// Find the next `/Rotate` offset starting from `start`.
fn find_rotate_offset(bytes: &[u8], start: usize) -> Option<usize> {
    memmem::find(&bytes[start..], b"/Rotate").map(|p| start + p)
}

/// Fast path for text extraction without page tracking.
///
/// Processes pages one-by-one lazily, building content incrementally with
/// pre-allocated capacity to minimize reallocation overhead. This combines
/// memory efficiency of lazy iteration with the allocation optimization
/// of pre-sizing.
///
/// # Performance Optimization
///
/// Pre-allocates buffer capacity by sampling the first 5 pages' text length
/// and extrapolating for the full document. This reduces String reallocation
/// calls from O(n) to O(log n) while maintaining low peak memory usage.
/// For large documents, this can reduce allocation overhead by 40-50%.
fn extract_text_lazy_fast_path(document: &PdfDocument<'_>) -> Result<PdfTextExtractionResult> {
    let page_count = document.pages().len() as usize;
    let mut content = String::new();
    let mut total_sample_size = 0usize;
    let mut sample_count = 0;

    for (page_idx, page) in document.pages().iter().enumerate() {
        let text = page
            .text()
            .map_err(|e| PdfError::TextExtractionFailed(format!("Page text extraction failed: {}", e)))?;

        let page_text = text.all();
        let page_size = page_text.len();

        if page_idx > 0 {
            content.push_str("\n\n");
        }

        content.push_str(&page_text);

        if page_idx < 5 {
            total_sample_size += page_size;
            sample_count += 1;
        }

        if page_idx == 4 && sample_count > 0 && page_count > 5 {
            let avg_page_size = total_sample_size / sample_count;
            let estimated_remaining = avg_page_size * (page_count - 5);
            content.reserve(estimated_remaining + (estimated_remaining / 10));
        }
    }

    Ok((content, None, None))
}

/// Lazy extraction with page boundary and content tracking.
///
/// Processes pages one-by-one, tracking byte boundaries and optionally
/// collecting per-page content. Pre-allocates buffer capacity using an
/// adaptive strategy to minimize reallocations while maintaining low peak
/// memory usage.
///
/// When hierarchy extraction is enabled, extracts text hierarchy (H1-H6 levels)
/// from font size clustering and assigns semantic heading levels to text blocks.
///
/// # Performance Optimization
///
/// Uses a two-phase approach: sample first 5 pages to estimate average
/// page size, then reserve capacity for remaining pages. This reduces
/// allocations from O(n) to O(log n) while keeping memory efficient.
fn extract_text_lazy_with_tracking(
    document: &PdfDocument<'_>,
    config: &PageConfig,
    extraction_config: Option<&crate::core::config::ExtractionConfig>,
) -> Result<PdfTextExtractionResult> {
    let mut content = String::new();
    let page_count = document.pages().len() as usize;
    let mut boundaries = Vec::with_capacity(page_count);
    let mut page_contents = if config.extract_pages {
        Some(Vec::with_capacity(page_count))
    } else {
        None
    };

    // Check if hierarchy extraction is enabled
    let should_extract_hierarchy = extraction_config
        .and_then(|cfg| cfg.pdf_options.as_ref())
        .and_then(|pdf_cfg| pdf_cfg.hierarchy.as_ref())
        .map(|h_cfg| h_cfg.enabled)
        .unwrap_or(false);

    let hierarchy_config = extraction_config
        .and_then(|cfg| cfg.pdf_options.as_ref())
        .and_then(|pdf_cfg| pdf_cfg.hierarchy.as_ref())
        .cloned();

    let mut total_sample_size = 0usize;
    let mut sample_count = 0;

    for (page_idx, page) in document.pages().iter().enumerate() {
        let page_number = page_idx + 1;

        let text = page
            .text()
            .map_err(|e| PdfError::TextExtractionFailed(format!("Page text extraction failed: {}", e)))?;

        let page_text_ref = text.all();
        let page_size = page_text_ref.len();

        if page_idx < 5 {
            total_sample_size += page_size;
            sample_count += 1;
        }

        // Insert page marker before the page content (for ALL pages including page 1)
        if config.insert_page_markers {
            let marker = config.marker_format.replace("{page_num}", &page_number.to_string());
            content.push_str(&marker);
        } else if page_idx > 0 {
            // Only add separator between pages when markers are disabled
            content.push_str("\n\n");
        }

        let byte_start = content.len();
        content.push_str(&page_text_ref);
        let byte_end = content.len();

        boundaries.push(PageBoundary {
            byte_start,
            byte_end,
            page_number,
        });

        if let Some(ref mut pages) = page_contents {
            // Extract hierarchy if enabled
            let hierarchy = if should_extract_hierarchy {
                extract_page_hierarchy(&page, hierarchy_config.as_ref())?
            } else {
                None
            };

            let is_blank = Some(crate::extraction::blank_detection::is_page_text_blank(&page_text_ref));
            pages.push(PageContent {
                page_number,
                content: page_text_ref.to_owned(),
                tables: Vec::new(),
                images: Vec::new(),
                hierarchy,
                is_blank,
            });
        }

        if page_idx == 4 && page_count > 5 && sample_count > 0 {
            let avg_page_size = total_sample_size / sample_count;
            let estimated_remaining = avg_page_size * (page_count - 5);
            let separator_overhead = (page_count - 5) * 3;
            content.reserve(estimated_remaining + separator_overhead + (estimated_remaining / 10));
        }
    }

    Ok((content, Some(boundaries), page_contents))
}

/// Extract text hierarchy from a single PDF page.
///
/// Uses font size clustering to identify heading levels (H1-H6) and assigns
/// hierarchy levels to text blocks based on their font sizes.
///
/// # Arguments
///
/// * `page` - The PDF page to extract hierarchy from
/// * `hierarchy_config` - Configuration for hierarchy extraction
///
/// # Returns
///
/// Optional PageHierarchy containing hierarchical blocks with heading levels
fn extract_page_hierarchy(
    page: &pdfium_render::prelude::PdfPage,
    hierarchy_config: Option<&crate::core::config::HierarchyConfig>,
) -> Result<Option<crate::types::PageHierarchy>> {
    use crate::pdf::hierarchy::{
        HierarchyLevel, assign_hierarchy_levels, cluster_font_sizes, extract_chars_with_fonts, merge_chars_into_blocks,
    };
    use crate::types::HierarchicalBlock;

    // Check if config is present and hierarchy is enabled
    let config = match hierarchy_config {
        Some(cfg) if cfg.enabled => cfg,
        _ => return Ok(None),
    };

    // Extract characters with font information
    let char_data = extract_chars_with_fonts(page)?;

    if char_data.is_empty() {
        return Ok(None);
    }

    // Merge characters into text blocks
    let text_blocks = merge_chars_into_blocks(char_data);

    if text_blocks.is_empty() {
        return Ok(None);
    }

    // Cluster by font sizes
    let k_clusters = config.k_clusters.min(text_blocks.len());
    let clusters = cluster_font_sizes(&text_blocks, k_clusters)?;

    if clusters.is_empty() {
        return Ok(None);
    }

    // Assign hierarchy levels using KMeans-based clustering
    let kmeans_result = crate::pdf::hierarchy::KMeansResult {
        labels: text_blocks
            .iter()
            .map(|block| {
                // Find which cluster this block belongs to
                let mut min_dist = f32::INFINITY;
                let mut best_cluster = 0u32;
                for (idx, cluster) in clusters.iter().enumerate() {
                    let dist = (block.font_size - cluster.centroid).abs();
                    if dist < min_dist {
                        min_dist = dist;
                        best_cluster = idx as u32;
                    }
                }
                best_cluster
            })
            .collect(),
    };

    let hierarchy_blocks = assign_hierarchy_levels(&text_blocks, &kmeans_result);

    // Convert to output format
    let blocks: Vec<HierarchicalBlock> = hierarchy_blocks
        .into_iter()
        .map(|hb| HierarchicalBlock {
            text: hb.text,
            font_size: hb.font_size,
            level: match hb.hierarchy_level {
                HierarchyLevel::H1 => "h1".to_string(),
                HierarchyLevel::H2 => "h2".to_string(),
                HierarchyLevel::H3 => "h3".to_string(),
                HierarchyLevel::H4 => "h4".to_string(),
                HierarchyLevel::H5 => "h5".to_string(),
                HierarchyLevel::H6 => "h6".to_string(),
                HierarchyLevel::Body => "body".to_string(),
            },
            bbox: if config.include_bbox {
                Some((hb.bbox.left, hb.bbox.top, hb.bbox.right, hb.bbox.bottom))
            } else {
                None
            },
        })
        .collect();

    let block_count = blocks.len();

    Ok(Some(crate::types::PageHierarchy { block_count, blocks }))
}

#[cfg(test)]
mod tests {
    use super::*;
    use serial_test::serial;

    #[test]
    #[serial]
    fn test_extractor_creation() {
        let result = PdfTextExtractor::new();
        assert!(result.is_ok());
    }

    #[test]
    #[serial]
    fn test_extract_empty_pdf() {
        let extractor = PdfTextExtractor::new().unwrap();
        let result = extractor.extract_text(b"");
        assert!(result.is_err());
    }

    #[test]
    #[serial]
    fn test_extract_invalid_pdf() {
        let extractor = PdfTextExtractor::new().unwrap();
        let result = extractor.extract_text(b"not a pdf");
        assert!(result.is_err());
        assert!(matches!(result.unwrap_err(), PdfError::InvalidPdf(_)));
    }

    #[test]
    #[serial]
    fn test_password_required_detection() {
        let extractor = PdfTextExtractor::new().unwrap();
        let encrypted_pdf = b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n";
        let result = extractor.extract_text(encrypted_pdf);

        if let Err(err) = result {
            assert!(matches!(err, PdfError::PasswordRequired | PdfError::InvalidPdf(_)));
        }
    }

    #[test]
    #[serial]
    fn test_extract_text_with_passwords_empty_list() {
        let extractor = PdfTextExtractor::new().unwrap();
        let result = extractor.extract_text_with_passwords(b"not a pdf", &[]);
        assert!(result.is_err());
    }

    #[test]
    fn test_strip_page_rotation_no_rotate() {
        let pdf = b"%PDF-1.4\n1 0 obj\n<< /Type /Page >>\nendobj";
        let result = strip_page_rotation(pdf);
        assert!(matches!(result, Cow::Borrowed(_)));
    }

    #[test]
    fn test_strip_page_rotation_90() {
        let pdf = b"%PDF-1.4\n1 0 obj\n<< /Type /Page /Rotate 90 >>\nendobj";
        let result = strip_page_rotation(pdf);
        assert!(matches!(result, Cow::Owned(_)));
        assert!(!has_rotate_marker(&result));
    }

    #[test]
    fn test_strip_page_rotation_270() {
        let pdf = b"%PDF-1.4\n1 0 obj\n<< /Type /Page /Rotate 270 >>\nendobj";
        let result = strip_page_rotation(pdf);
        assert!(matches!(result, Cow::Owned(_)));
        assert!(!has_rotate_marker(&result));
    }

    #[test]
    fn test_strip_page_rotation_multiple() {
        let pdf = b"%PDF-1.4\n1 0 obj\n<< /Rotate 90 >>\n2 0 obj\n<< /Rotate 180 >>\nendobj";
        let result = strip_page_rotation(pdf);
        assert!(matches!(result, Cow::Owned(_)));
        assert!(!has_rotate_marker(&result));
    }
}

#[cfg(test)]
mod cache_regression_tests {
    use super::*;
    use serial_test::serial;
    use std::time::Instant;

    /// Test that multiple extractions of the same document produce consistent results.
    ///
    /// Note: The Pdfium library uses a singleton pattern for initialization. The first
    /// call to bind_pdfium() initializes the library (expensive), while subsequent
    /// calls reuse the cached instance (fast). This is correct behavior, not a bug.
    ///
    /// This test verifies that:
    /// 1. Multiple extractions produce identical text content
    /// 2. The singleton pattern provides consistent extraction behavior
    #[test]
    #[serial]
    fn test_no_global_cache_between_documents() {
        let pdf_bytes = std::fs::read("../../test_documents/pdf/fake_memo.pdf").expect("Failed to read PDF");

        let extractor = PdfTextExtractor::new().expect("Failed to create extractor");

        let start = Instant::now();
        let text1 = extractor.extract_text(&pdf_bytes).expect("Failed to extract (cold)");
        let cold = start.elapsed();

        let start = Instant::now();
        let text2 = extractor.extract_text(&pdf_bytes).expect("Failed to extract (warm1)");
        let warm1 = start.elapsed();

        let start = Instant::now();
        let text3 = extractor.extract_text(&pdf_bytes).expect("Failed to extract (warm2)");
        let warm2 = start.elapsed();

        eprintln!("Cold:   {:?}", cold);
        eprintln!("Warm 1: {:?}", warm1);
        eprintln!("Warm 2: {:?}", warm2);

        // All extractions must produce identical content
        assert_eq!(text1, text2);
        assert_eq!(text2, text3);

        // Warm calls may be faster due to the Pdfium singleton pattern - this is expected.
        // The singleton initializes Pdfium once and reuses it for subsequent calls.
        // What we DO want to verify is that warm1 and warm2 have similar performance,
        // which indicates consistent behavior after initialization.
        let warm1_micros = warm1.as_micros().max(1);
        let warm2_micros = warm2.as_micros().max(1);
        let warm_ratio = if warm1_micros > warm2_micros {
            warm1_micros / warm2_micros
        } else {
            warm2_micros / warm1_micros
        };

        // After initialization, subsequent calls should have similar performance (within 5x)
        assert!(
            warm_ratio < 5,
            "Warm calls have inconsistent performance ({}x difference) - warm1: {:?}, warm2: {:?}",
            warm_ratio,
            warm1,
            warm2
        );

        // Log the cold/warm ratio for informational purposes
        let cold_warm_ratio = cold.as_micros() / warm1_micros;
        eprintln!(
            "Cold/Warm ratio: {}x (expected due to singleton initialization)",
            cold_warm_ratio
        );
    }
}