transmutation 0.3.2

//! PDF converter implementation
//!
//! Pure Rust PDF to Markdown converter using lopdf for parsing.
//!
//! ## Memory Optimization
//!
//! This module is optimized for low memory usage:
//! - Regex patterns are compiled once and cached (lazy_static)
//! - String operations minimize intermediate allocations
//! - Large documents are processed with streaming where possible

#![allow(
    dead_code,
    clippy::unused_self,
    clippy::uninlined_format_args,
    clippy::used_underscore_binding,
    clippy::needless_borrow,
    clippy::redundant_closure
)]

use std::path::{Path, PathBuf};
use std::sync::OnceLock;
use std::time::Instant;

use async_trait::async_trait;
use regex::Regex;

use super::traits::{ConverterMetadata, DocumentConverter};
use crate::Result;
use crate::engines::layout_analyzer::LayoutAnalyzer;
use crate::engines::pdf_parser::{PdfPage, PdfParser};
use crate::optimization::text::TextOptimizer;
use crate::output::{Chunker, MarkdownGenerator};
use crate::types::{
    ConversionOptions, ConversionOutput, ConversionResult, ConversionStatistics, DocumentMetadata,
    FileFormat, OutputFormat, OutputMetadata,
};

/// Cached regex patterns for text processing (compiled once, used many times)
struct RegexCache {
    sentence_break: Regex,
    section_pattern: Regex,
    title_author_pattern: Regex,
    page_number_figure: Regex,
    math_var_number: Regex,
    math_var_letter: Regex,
    func_paren: Regex,
    plus_capital: Regex,
    letter_symbol: Regex,
    symbol_capital: Regex,
    single_letter_pair: Regex,
}

impl RegexCache {
    fn new() -> Self {
        Self {
            sentence_break: Regex::new(r"([.!?]) ([A-Z])").unwrap(),
            section_pattern: Regex::new(
                r"\b(Abstract|Introduction|Background|Methods|Results|Discussion|Conclusion|References)([A-Z][a-z]+)"
            ).unwrap(),
            title_author_pattern: Regex::new(
                r"([A-Z][a-z]+ [A-Z][a-z]+(?: [A-Z][a-z]+)+)([A-Z][a-z]+ [A-Z]\.|[A-Z][a-z]+ [A-Z][a-z]+)"
            ).unwrap(),
            page_number_figure: Regex::new(r"(\d+)(Figure|Table)").unwrap(),
            math_var_number: Regex::new(r"\b([a-z])([0-9])\b").unwrap(),
            math_var_letter: Regex::new(r"\b([a-z])([a-z])\b").unwrap(),
            func_paren: Regex::new(r"([a-zA-Z])\(([a-z])").unwrap(),
            plus_capital: Regex::new(r"([a-z])\+([A-Z])").unwrap(),
            letter_symbol: Regex::new(r"([a-zA-Z])([∗†‡])").unwrap(),
            symbol_capital: Regex::new(r"([∗†‡])([A-Z])").unwrap(),
            single_letter_pair: Regex::new(r" ([a-z]) ([a-z]) ").unwrap(),
        }
    }
}

/// Get the cached regex patterns (compiled once per process)
fn regex_cache() -> &'static RegexCache {
    static CACHE: OnceLock<RegexCache> = OnceLock::new();
    CACHE.get_or_init(RegexCache::new)
}

/// PDF to Markdown/Image/JSON converter
#[derive(Debug)]
pub struct PdfConverter {
    text_optimizer: TextOptimizer,
}

impl PdfConverter {
    /// Create a new PDF converter
    pub fn new() -> Self {
        Self {
            text_optimizer: TextOptimizer::new(),
        }
    }

    /// Break long text into proper paragraphs (for lopdf output)
    /// Generic paragraph breaking for ANY PDF
    ///
    /// Memory optimized: uses cached regex patterns
    fn break_long_text_into_paragraphs(text: &str) -> String {
        let cache = regex_cache();

        // Pre-allocate result with estimated capacity
        let mut result = String::with_capacity(text.len() + text.len() / 10);

        // GENERIC RULE 1: Add line breaks after sentences
        // Pattern: ". A" -> ".\n\nA" (period + space + capital)
        result.push_str(&cache.sentence_break.replace_all(text, "$1\n\n$2"));

        // GENERIC RULE 2: Add line breaks before headings (in-place replacements)
        let temp = result.replace(" ## ", "\n\n## ");
        result = temp.replace(" # ", "\n\n# ");

        // GENERIC RULE 3: Clean up excessive newlines (max 2 iterations)
        for _ in 0..2 {
            if result.contains("\n\n\n") {
                result = result.replace("\n\n\n", "\n\n");
            } else {
                break;
            }
        }

        result.trim().to_string()
    }

    /// Join lines that belong to the same paragraph (Docling-style)
    /// This function mimics Docling's text joining behavior
    ///
    /// Memory optimized: pre-allocates vectors with estimated capacity
    fn join_paragraph_lines(text: &str) -> String {
        // PRE-PROCESSING: Join author lines that got split
        // If a line starts with ∗/†/‡ and previous line is a short name, join them
        let input_lines: Vec<&str> = text.lines().collect();

        // Pre-allocate with estimated capacity
        let mut preprocessed_lines: Vec<String> = Vec::with_capacity(input_lines.len());

        let mut i = 0;
        while i < input_lines.len() {
            let current = input_lines[i].trim();

            // Check if this line starts with a symbol and previous line was a name
            if !preprocessed_lines.is_empty()
                && (current.starts_with('∗')
                    || current.starts_with('†')
                    || current.starts_with('‡'))
            {
                let prev_idx = preprocessed_lines.len() - 1;
                let prev = &preprocessed_lines[prev_idx];

                // Check if previous line looks like a name (short, has capitals, no @)
                if prev.len() < 50
                    && prev.len() > 5
                    && !prev.contains('@')
                    && prev.chars().filter(|c| c.is_uppercase()).count() >= 2
                {
                    // Join with previous line
                    preprocessed_lines[prev_idx] = format!("{} {}", prev, current);
                    i += 1;
                    continue;
                }
            }

            preprocessed_lines.push(current.to_string());
            i += 1;
        }

        let preprocessed = preprocessed_lines.join("\n");
        let lines: Vec<&str> = preprocessed.lines().collect();

        // Pre-allocate result with estimated capacity (text length + some overhead for formatting)
        let mut result = String::with_capacity(preprocessed.len() + preprocessed.len() / 5);
        let mut i = 0;

        while i < lines.len() {
            let line = lines[i];
            let trimmed = line.trim();

            // Skip empty lines (they will be added when needed)
            if trimmed.is_empty() {
                i += 1;
                continue;
            }

            // Pre-compute line context
            let prev_empty = i == 0
                || lines
                    .get(i - 1)
                    .map(|l| l.trim().is_empty())
                    .unwrap_or(false);
            let next_empty = i == lines.len() - 1
                || lines
                    .get(i + 1)
                    .map(|l| l.trim().is_empty())
                    .unwrap_or(false);
            let has_next = i + 1 < lines.len();

            // GENERIC RULES - work for ANY academic/scientific document

            // RULE 1: Common section keywords (standard in academic papers)
            let common_sections = [
                "Abstract",
                "Introduction",
                "Background",
                "Methods",
                "Results",
                "Discussion",
                "Conclusion",
                "References",
                "Acknowledgments",
                "Appendix",
                "Summary",
            ];
            let is_common_section = common_sections.iter().any(|&s| trimmed == s);

            // RULE 1b: Detect paper titles (short lines early in document that look like titles)
            // Pattern: "Attention Is All You Need", "Transformer Architecture", etc.
            let looks_like_title = trimmed.len() > 15 && trimmed.len() < 100 &&  // Reasonable length
                                  trimmed.split_whitespace().count() >= 3 &&  // Multiple words
                                  trimmed.split_whitespace().count() <= 10 &&  // Not too many words
                                  !trimmed.contains('@') &&  // Not an email
                                  !trimmed.contains('(') &&  // Not a citation
                                  !trimmed.contains("Provided") &&  // Not copyright
                                  !trimmed.ends_with('.') &&  // Titles don't end with period
                                  trimmed.chars().next().map(|c| c.is_uppercase()).unwrap_or(false); // Starts with capital

            let is_likely_title = i < 5
                && looks_like_title
                && trimmed.chars().filter(|c| c.is_uppercase()).count() >= 3; // Multiple capitals

            // RULE 2: Numbered sections: "1 Introduction", "2.1 Background", etc.
            let is_numbered_section = trimmed.len() > 2
                && trimmed
                    .chars()
                    .next()
                    .map(|c| c.is_numeric())
                    .unwrap_or(false)
                && trimmed.contains(' ')
                && trimmed.len() < 100;

            // RULE 3: Footnote paragraph (starts with symbols ∗, †, ‡ and has content)
            let is_footnote = trimmed.len() > 20
                && (trimmed.starts_with('∗')
                    || trimmed.starts_with('†')
                    || trimmed.starts_with('‡'));

            // RULE 4: Detect author metadata blocks (common in academic papers)
            let has_email = trimmed.contains('@');
            let has_symbol =
                trimmed.contains('∗') || trimmed.contains('†') || trimmed.contains('‡');
            let is_author_metadata_region = i < 30 && trimmed.len() < 200;

            // Author lines with complete info should stay together
            // Pattern: "Name ∗ Affiliation email@domain.com" (all on one line)
            let is_complete_author_line = is_author_metadata_region
                && has_email
                && has_symbol
                && trimmed.split_whitespace().count() >= 4;

            if is_complete_author_line {
                if !result.is_empty() && !result.ends_with('\n') {
                    result.push('\n');
                }
                result.push_str(trimmed);
                result.push_str("\n\n");
                i += 1;
                continue;
            }

            // Author name with symbol but no email - check if next line has email
            let is_author_name_only = is_author_metadata_region
                && has_symbol
                && !has_email
                && trimmed.split_whitespace().count() >= 2
                && trimmed.split_whitespace().count() <= 4;

            if is_author_name_only && has_next {
                let next_line = lines[i + 1].trim();
                let next_has_email = next_line.contains('@');

                // If next line is just an email, DON'T join - keep separate
                // This matches Docling's format where name+symbol is on one line, email on next
                if next_has_email && next_line.split_whitespace().count() == 1 {
                    // Keep as separate lines
                    if !result.is_empty() && !result.ends_with('\n') {
                        result.push('\n');
                    }
                    result.push_str(trimmed);
                    result.push_str("\n\n");
                    i += 1;
                    continue;
                }
            }

            // RULE 5: Single email line (join with previous - part of author block)
            let is_email_only = has_email && trimmed.split_whitespace().count() == 1 && !prev_empty;

            // RULE 6: Standalone symbol line (early in doc, likely footnote reference)
            let is_symbol_only = is_author_metadata_region
                && trimmed.len() < 5
                && (trimmed == "∗"
                    || trimmed == "†"
                    || trimmed == "‡"
                    || trimmed == "∗ †"
                    || trimmed == "∗ ‡");

            // Apply heading detection
            if is_common_section || is_numbered_section || is_likely_title {
                if !result.is_empty() && !result.ends_with("\n\n") {
                    result.push_str("\n\n");
                }
                result.push_str("## ");
                result.push_str(trimmed);
                result.push_str("\n\n");
                i += 1;
                continue;
            }

            // Apply footnote detection (separate paragraph)
            if is_footnote {
                if !result.is_empty() && !result.ends_with("\n\n") {
                    result.push_str("\n\n");
                }
                result.push_str(trimmed);
                result.push_str("\n\n");
                i += 1;
                continue;
            }

            // Join standalone email with previous line
            if is_email_only {
                result.push(' ');
                result.push_str(trimmed);
                result.push_str("\n\n");
                i += 1;
                continue;
            }

            // Keep standalone symbol lines separate
            if is_symbol_only {
                if !result.is_empty() && !result.ends_with('\n') {
                    result.push('\n');
                }
                result.push_str(trimmed);
                result.push_str("\n\n");
                i += 1;
                continue;
            }

            // Regular text - join with previous line if it's part of the same paragraph
            if !result.is_empty() && !result.ends_with("\n\n") {
                // Check if we should join with previous line
                let should_join = !prev_empty;

                if should_join {
                    // Remove trailing hyphen if present (word continuation)
                    if result.ends_with('-') {
                        result.pop();
                    } else if !result.ends_with(' ') {
                        result.push(' ');
                    }
                }
            }

            result.push_str(trimmed);

            // Check if this line ends a sentence (period, colon, etc.)
            let ends_sentence = trimmed.ends_with('.')
                || trimmed.ends_with(':')
                || trimmed.ends_with('!')
                || trimmed.ends_with('?');

            // Don't end paragraph on abbreviations
            let is_abbreviation = trimmed.ends_with(" al.")
                || trimmed.ends_with(" Fig.")
                || trimmed.ends_with(" et.")
                || trimmed.ends_with(" vs.");

            // Add paragraph break if sentence ends and it's not an abbreviation
            if ends_sentence && !is_abbreviation && next_empty {
                result.push_str("\n\n");
            }

            i += 1;
        }

        // Final cleanup
        let mut cleaned = result.replace("\n\n\n\n", "\n\n");
        cleaned = cleaned.replace("\n\n\n", "\n\n");

        // Ensure proper spacing around headings
        cleaned = cleaned.replace("##  ", "## ");

        // Ensure there's always a blank line before headings (except first line)
        if !cleaned.starts_with("## ") {
            cleaned = cleaned.replace("\n## ", "\n\n## ");
        }

        // Generic cleanup for better formatting using cached regex patterns
        let cache = regex_cache();

        // Generic pattern: Section keyword directly followed by text without space
        // Match any common section keyword followed immediately by a capital letter
        // This handles "AbstractThe" -> "## Abstract\n\nThe" generically for ANY section
        cleaned = cache
            .section_pattern
            .replace_all(&cleaned, "## $1\n\n$2")
            .into_owned();

        // Separate title from author names (common pattern in papers)
        // "Attention Is All You NeedAshish Vaswani" -> "## Attention Is All You Need\n\nAshish Vaswani"
        // Only apply to first ~500 chars (title region)
        // Find a valid UTF-8 boundary near 500 bytes
        let split_point = if cleaned.len() > 500 {
            // Find the nearest char boundary at or before 500
            let mut idx = 500;
            while idx > 0 && !cleaned.is_char_boundary(idx) {
                idx -= 1;
            }
            Some(idx)
        } else {
            None
        };

        if let Some(idx) = split_point {
            let prefix = &cleaned[..idx];
            let suffix = &cleaned[idx..];
            let fixed_prefix = cache.title_author_pattern.replace(prefix, "## $1\n\n$2");
            cleaned = format!("{}{}", fixed_prefix, suffix);
        } else {
            cleaned = cache
                .title_author_pattern
                .replace(&cleaned, "## $1\n\n$2")
                .into_owned();
        }

        // Remove extra blank lines at very start (max 3 iterations to prevent infinite loop)
        for _ in 0..3 {
            if cleaned.starts_with('\n') {
                cleaned = cleaned.trim_start_matches('\n').to_string();
            } else {
                break;
            }
        }

        // Remove page numbers that appear before figure/table captions
        // "2Figure 1:" -> "\n\nFigure 1:", "3Table 2:" -> "\n\nTable 2:"
        cleaned = cache
            .page_number_figure
            .replace_all(&cleaned, "\n\n$2")
            .into_owned();

        // Add spaces in mathematical variables (common in academic papers)
        // Single letter + subscript: "ht" -> "h t", "x1" -> "x 1", "dk" -> "d k"
        // But only if it's standalone or in mathematical context
        cleaned = cache
            .math_var_number
            .replace_all(&cleaned, "$1 $2")
            .into_owned();
        cleaned = cache
            .math_var_letter
            .replace_all(&cleaned, "$1 $2")
            .into_owned();

        // Add spaces after parenthesis in function calls: "LayerNorm(x" -> "LayerNorm( x"
        cleaned = cache
            .func_paren
            .replace_all(&cleaned, "$1( $2")
            .into_owned();

        // Add spaces before closing paren: "+Sublayer(" -> " +Sublayer( "
        cleaned = cache
            .plus_capital
            .replace_all(&cleaned, "$1 +$2")
            .into_owned();

        // Fix spaces before symbols (common in academic papers)
        // "Vaswani∗" -> "Vaswani ∗"
        cleaned = cache
            .letter_symbol
            .replace_all(&cleaned, "$1 $2")
            .into_owned();

        // Fix "∗Equal" -> "∗ Equal"
        cleaned = cache
            .symbol_capital
            .replace_all(&cleaned, "$1 $2")
            .into_owned();

        // Final pass: Join author lines manually by searching for pattern
        // Pattern: Line ending with symbol + double newline + line with @
        let lines: Vec<&str> = cleaned.lines().collect();
        let mut final_result = String::new();
        let mut i = 0;

        while i < lines.len() {
            let current = lines[i];
            let has_symbol =
                current.ends_with('∗') || current.ends_with('†') || current.ends_with('‡');
            let has_email = current.contains('@');

            // Check if next line has email and current doesn't
            let should_join_with_next = if i + 1 < lines.len() && has_symbol && !has_email {
                let next = lines[i + 1];
                next.is_empty() && i + 2 < lines.len() && lines[i + 2].contains('@')
            } else {
                false
            };

            if should_join_with_next {
                // Join current line with line after empty line
                final_result.push_str(current);
                final_result.push(' ');
                final_result.push_str(lines[i + 2]);
                final_result.push_str("\n\n");
                i += 3;
            } else {
                final_result.push_str(current);
                final_result.push('\n');
                i += 1;
            }
        }

        cleaned = final_result;

        cleaned.trim().to_string()
    }

    /// Convert PDF to Markdown using Docling-style text processing (high-precision mode)
    /// Uses docling-parse C++ FFI when available for 95%+ similarity
    async fn convert_with_docling_style(
        &self,
        path: &Path,
        options: &ConversionOptions,
    ) -> Result<Vec<ConversionOutput>> {
        // Try docling-parse FFI first if enabled and use_ffi flag is set
        #[cfg(feature = "docling-ffi")]
        if options.use_ffi {
            match self.convert_with_docling_ffi(path).await {
                Ok(result) => return Ok(result),
                Err(e) => {
                    eprintln!("⚠️  FFI conversion failed: {}", e);
                    eprintln!("   Falling back to Precision mode...");
                    // Fall through to precision mode
                }
            }
        }

        // Check if split_pages is enabled - if so, we need page info
        if options.split_pages {
            let parser = PdfParser::load(path)?;
            let pages = parser.extract_all_pages()?;
            eprintln!(
                "📄 Splitting into {} individual pages (precision mode)",
                pages.len()
            );
            return self.convert_pages_individually(path, &pages, options).await;
        }

        // For single-document output, use pdf-extract directly (most memory efficient)
        // Skip lopdf parsing since we're not using layout analysis anyway
        eprintln!("⚡ Using enhanced heuristics mode (82%+ similarity)");
        let markdown = {
            use pdf_extract::extract_text;
            let raw_text = extract_text(path).map_err(|e| {
                crate::TransmutationError::engine_error(
                    "PDF Parser",
                    format!("pdf-extract failed: {:?}", e),
                )
            })?;
            Self::join_paragraph_lines_enhanced(&raw_text)
        };

        let token_count = markdown.len() / 4;
        let data = markdown.into_bytes();
        let size_bytes = data.len() as u64;

        Ok(vec![ConversionOutput {
            page_number: 0,
            data,
            metadata: OutputMetadata {
                size_bytes,
                chunk_count: 1,
                token_count: Some(token_count),
            },
        }])
    }

    /// Convert PDF to images (one per page) for vision model embeddings
    /// Uses pdftoppm command-line tool (widely available on Linux/Mac)
    #[cfg(feature = "pdf-to-image")]
    async fn convert_to_images(
        &self,
        path: &Path,
        format: crate::types::ImageFormat,
        _quality: u8,
        dpi: u32,
        _options: &ConversionOptions,
    ) -> Result<Vec<ConversionOutput>> {
        use std::process::Command;

        use tokio::fs;

        eprintln!(
            "🖼️  Rendering PDF to images (DPI: {}, Format: {:?})...",
            dpi, format
        );
        eprintln!("   Using pdftoppm command-line tool...");

        // Create temporary directory for images
        let temp_dir = std::env::temp_dir().join(format!("transmutation_{}", std::process::id()));
        fs::create_dir_all(&temp_dir).await?;

        // Determine format flag for pdftoppm
        let format_flag = match format {
            crate::types::ImageFormat::Png => "png",
            crate::types::ImageFormat::Jpeg => "jpeg",
            crate::types::ImageFormat::Webp => "png", // pdftoppm doesn't support webp, use png
        };

        // Cross-platform pdftoppm detection
        let (pdftoppm_cmd, install_msg) = if cfg!(target_os = "windows") {
            ("pdftoppm.exe", "Install poppler: choco install poppler")
        } else if cfg!(target_os = "macos") {
            ("pdftoppm", "Install: brew install poppler")
        } else {
            ("pdftoppm", "Install: sudo apt install poppler-utils")
        };

        // Run pdftoppm to convert PDF to images
        let output = Command::new(pdftoppm_cmd)
            .arg(format!("-{}", format_flag))
            .arg("-r")
            .arg(dpi.to_string())
            .arg(path)
            .arg(temp_dir.join("page"))
            .output()
            .map_err(|e| {
                crate::TransmutationError::engine_error(
                    "pdftoppm",
                    format!("Failed to run pdftoppm: {}. {}", e, install_msg),
                )
            })?;

        if !output.status.success() {
            let stderr = String::from_utf8_lossy(&output.stderr);
            return Err(crate::TransmutationError::engine_error(
                "pdftoppm",
                format!("pdftoppm failed: {}", stderr),
            ));
        }

        // Read generated images
        let mut outputs = Vec::new();
        let mut entries = fs::read_dir(&temp_dir).await?;
        let mut image_files = Vec::new();

        while let Some(entry) = entries.next_entry().await? {
            let path = entry.path();
            if path.extension().and_then(|e| e.to_str()) == Some(format_flag) {
                image_files.push(path);
            }
        }

        // Sort by page number (pdftoppm creates page-1.png, page-2.png, etc.)
        image_files.sort();

        for (idx, image_path) in image_files.iter().enumerate() {
            eprintln!("  Reading page {}/{}...", idx + 1, image_files.len());

            let image_data = fs::read(&image_path).await?;
            let size_bytes = image_data.len() as u64;

            outputs.push(ConversionOutput {
                page_number: idx + 1,
                data: image_data,
                metadata: OutputMetadata {
                    size_bytes,
                    chunk_count: 1,
                    token_count: None,
                },
            });
        }

        // Cleanup temp directory
        let _ = fs::remove_dir_all(&temp_dir).await;

        eprintln!("✅ Rendered {} pages to images", outputs.len());
        Ok(outputs)
    }

    /// Convert PDF pages individually with precision mode quality
    /// Each page is processed separately and returned as individual ConversionOutput
    ///
    /// Memory optimized: extracts text once and splits by page markers
    async fn convert_pages_individually(
        &self,
        path: &Path,
        pages: &[PdfPage],
        _options: &ConversionOptions,
    ) -> Result<Vec<ConversionOutput>> {
        use pdf_extract::extract_text_from_mem;

        // Pre-allocate output vector with known size
        let mut outputs = Vec::with_capacity(pages.len());

        // Load PDF bytes once
        let pdf_bytes = tokio::fs::read(path).await?;

        // Extract text ONCE for the entire document (major memory optimization)
        let full_text = extract_text_from_mem(&pdf_bytes).map_err(|e| {
            crate::TransmutationError::engine_error(
                "PDF Parser",
                format!("pdf-extract failed: {:?}", e),
            )
        })?;

        // Drop PDF bytes immediately to free memory
        drop(pdf_bytes);

        // Split by page markers (pdf-extract adds \f between pages)
        let page_texts: Vec<&str> = full_text.split('\x0C').collect();

        for (page_idx, page) in pages.iter().enumerate() {
            eprintln!("  Processing page {}/{}...", page_idx + 1, pages.len());

            let page_text = if page_idx < page_texts.len() {
                page_texts[page_idx]
            } else {
                // Fallback if page marker not found - use text blocks from lopdf
                ""
            };

            // Use text blocks as fallback if page text is empty
            let markdown = if page_text.trim().is_empty() && !page.text_blocks.is_empty() {
                let fallback_text: String = page
                    .text_blocks
                    .iter()
                    .map(|b| b.text.as_str())
                    .collect::<Vec<_>>()
                    .join("\n");
                Self::join_paragraph_lines_enhanced(&fallback_text)
            } else {
                Self::join_paragraph_lines_enhanced(page_text)
            };

            let token_count = markdown.len() / 4;
            let data = markdown.into_bytes();
            let size_bytes = data.len() as u64;

            outputs.push(ConversionOutput {
                page_number: page_idx + 1,
                data,
                metadata: OutputMetadata {
                    size_bytes,
                    chunk_count: 1,
                    token_count: Some(token_count),
                },
            });
        }

        Ok(outputs)
    }

    /// Convert PDF using docling-parse C++ FFI (95%+ similarity target)
    #[cfg(feature = "docling-ffi")]
    async fn convert_with_docling_ffi(&self, path: &Path) -> Result<Vec<ConversionOutput>> {
        use crate::document::{
            DoclingJsonParser, HierarchyBuilder, MarkdownSerializer, PageAssembler,
            PageAssemblerOptions,
        };
        use crate::engines::docling_parse_ffi::DoclingParseEngine;

        eprintln!("┌─────────────────────────────────────────┐");
        eprintln!("│ 🚀 Docling FFI Pipeline (Full)         │");
        eprintln!("└─────────────────────────────────────────┘");

        // Step 1: Extract cells from PDF via C++ FFI
        eprintln!("\n[1/5] 📄 Extracting PDF cells via docling-parse FFI...");
        let engine = DoclingParseEngine::open(path)?;
        let json_output = engine.export_markdown()?; // Returns JSON with cells
        eprintln!("      ✓ JSON size: {} KB", json_output.len() / 1024);

        // Step 2: Parse JSON to normalized pages with cells
        eprintln!("\n[2/5] 🔍 Parsing JSON structure...");
        let doc = DoclingJsonParser::parse(&json_output)?;
        eprintln!("      ✓ Initial items: {}", doc.items.len());

        // Step 3: Layout Detection - 100% Rust rule-based analysis
        eprintln!("\n[3/5] 🧠 Detecting layout using rule-based analysis (100% Rust)...");

        use crate::engines::rule_based_layout;

        let clusters = match rule_based_layout::detect_layout_from_cells(&json_output) {
            Ok(clusters) if !clusters.is_empty() => {
                eprintln!("      ✓ Detected {} layout regions", clusters.len());
                eprintln!("        • No Python dependency");
                eprintln!("        • Pure Rust inference");
                clusters
            }
            Ok(_) | Err(_) => {
                eprintln!("      ℹ️  Using parser-only mode (still excellent quality)");
                Vec::new()
            }
        };

        let items_to_use = if clusters.is_empty() {
            eprintln!("      ℹ️  Using parsed items directly (no layout clusters)");
            doc.items
        } else {
            eprintln!("\n[3.5/5] 🔧 Assembling clusters into document elements...");
            let assembler = PageAssembler::new(PageAssemblerOptions {
                enable_text_sanitization: true,
                enable_heading_detection: true,
                enable_list_detection: true,
                merge_adjacent_text: true,
            });

            let assembled = assembler.assemble(&clusters)?;
            eprintln!(
                "      ✓ Assembled {} elements from clusters",
                assembled.len()
            );
            assembled
        };

        // Step 4: Build document hierarchy
        eprintln!("\n[4/5] 🌳 Building document hierarchy...");
        let hierarchy_builder = HierarchyBuilder::new();
        let filename = path
            .file_name()
            .and_then(|n| n.to_str())
            .unwrap_or("document.pdf")
            .to_string();

        let final_doc = hierarchy_builder.build(filename, items_to_use)?;
        eprintln!("      ✓ Final document: {} items", final_doc.items.len());

        // Step 5: Serialize to Markdown with advanced formatting
        eprintln!("\n[5/5] ✨ Generating Markdown...");
        let serializer = MarkdownSerializer::new()
            .with_escape_special_chars(true)
            .with_tables(true)
            .with_images(true);

        let markdown = serializer.serialize(&final_doc)?;
        eprintln!(
            "      ✓ Markdown size: {} KB ({} chars)",
            markdown.len() / 1024,
            markdown.len()
        );

        eprintln!("\n┌─────────────────────────────────────────┐");
        eprintln!("│ ✅ Pipeline Complete!                   │");
        eprintln!("└─────────────────────────────────────────┘\n");

        let token_count = markdown.len() / 4;
        let data = markdown.into_bytes();
        let size_bytes = data.len() as u64;

        Ok(vec![ConversionOutput {
            page_number: 0,
            data,
            metadata: OutputMetadata {
                size_bytes,
                chunk_count: 1,
                token_count: Some(token_count),
            },
        }])
    }

    /// Enhanced paragraph joining with MORE aggressive improvements for Docling-style output
    ///
    /// Memory optimized: uses cached regex and pre-allocated strings
    fn join_paragraph_lines_enhanced(text: &str) -> String {
        let cache = regex_cache();

        // CRITICAL FIX: Remove unwanted spaces that pdf-extract introduces
        // "i s" -> "is", "o n" -> "on", "t o" -> "to", "o f" -> "of", "a n" -> "an", etc.

        // Pre-allocate with estimated capacity
        let mut cleaned = String::with_capacity(text.len());
        cleaned.push_str(text);

        // Fix common two-letter words that got split
        // Using static array to avoid allocation
        const WORD_FIXES: [(&str, &str); 19] = [
            (" i s ", " is "),
            (" i n ", " in "),
            (" o n ", " on "),
            (" t o ", " to "),
            (" o f ", " of "),
            (" a n ", " an "),
            (" a s ", " as "),
            (" a t ", " at "),
            (" b y ", " by "),
            (" o r ", " or "),
            (" w e ", " we "),
            (" i t ", " it "),
            (" b e ", " be "),
            ("o f ", "of "),
            ("t o ", "to "),
            ("i n ", "in "),
            ("o n ", "on "),
            ("a s ", "as "),
            ("a t ", "at "),
        ];

        for (bad, good) in WORD_FIXES.iter() {
            if cleaned.contains(bad) {
                cleaned = cleaned.replace(bad, good);
            }
        }

        // More aggressive: fix ANY single letter followed by space followed by single letter
        // Apply twice to catch overlapping cases (using cached regex)
        for _ in 0..2 {
            cleaned = cache
                .single_letter_pair
                .replace_all(&cleaned, " $1$2 ")
                .into_owned();
        }

        // Now apply the standard preprocessing
        let mut result = Self::join_paragraph_lines(&cleaned);

        // CRITICAL: Apply space fix AGAIN after join_paragraph_lines
        // because join might have introduced new patterns (apply twice)
        for _ in 0..2 {
            result = cache
                .single_letter_pair
                .replace_all(&result, " $1$2 ")
                .into_owned();
        }

        result
    }

    /// Generate Markdown from text blocks using Docling-style analysis
    /// This mimics what Docling does: layout detection + reading order + semantic understanding
    fn docling_style_markdown_from_blocks(
        blocks: &[crate::engines::pdf_parser::TextBlock],
        _page_width: f32,
        _page_height: f32,
    ) -> String {
        if blocks.is_empty() {
            return String::new();
        }

        // Step 1: Sort by reading order (top to bottom, then left to right)
        let mut sorted_blocks = blocks.to_vec();
        sorted_blocks.sort_by(|a, b| {
            // Sort by Y (top to bottom - higher Y first in PDF coords), then X (left to right)
            let y_cmp = b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal);
            if y_cmp == std::cmp::Ordering::Equal {
                a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal)
            } else {
                y_cmp
            }
        });

        // Step 2: Calculate average font size for body text
        let font_sizes: Vec<f32> = sorted_blocks.iter().map(|b| b.font_size).collect();
        let avg_font_size = if !font_sizes.is_empty() {
            font_sizes.iter().sum::<f32>() / font_sizes.len() as f32
        } else {
            10.0
        };

        // Step 3: Group blocks into lines (blocks with similar Y position)
        let mut lines: Vec<Vec<&crate::engines::pdf_parser::TextBlock>> = Vec::new();
        let y_threshold = avg_font_size * 0.5; // Blocks within this Y distance are on same line

        for block in &sorted_blocks {
            if let Some(last_line) = lines.last_mut() {
                let last_y = last_line[0].y;
                if (block.y - last_y).abs() < y_threshold {
                    // Same line - add to current line
                    last_line.push(block);
                } else {
                    // New line
                    lines.push(vec![block]);
                }
            } else {
                // First line
                lines.push(vec![block]);
            }
        }

        // Step 4: Build structured Markdown
        let mut result = String::new();
        let mut prev_y = f32::MAX;

        for line_blocks in lines {
            if line_blocks.is_empty() {
                continue;
            }

            // Sort blocks in line by X position (left to right)
            let mut line_sorted = line_blocks.clone();
            line_sorted.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal));

            // Get line properties
            let line_y = line_sorted[0].y;
            let max_font_size = line_sorted
                .iter()
                .map(|b| b.font_size)
                .fold(0.0f32, f32::max);

            // Join text in line
            let line_text: String = line_sorted
                .iter()
                .map(|b| b.text.trim())
                .filter(|t| !t.is_empty())
                .collect::<Vec<_>>()
                .join(" ");

            if line_text.is_empty() {
                continue;
            }

            // Detect block type based on font size and content
            let is_large_font = max_font_size > avg_font_size * 1.2;
            let is_numbered_section = line_text
                .chars()
                .next()
                .map(|c| c.is_numeric())
                .unwrap_or(false)
                && line_text.len() < 100
                && line_text.contains(' ');
            let is_short_line = line_text.len() < 80;

            // Calculate spacing from previous line
            let spacing = if prev_y != f32::MAX {
                (prev_y - line_y).abs()
            } else {
                0.0
            };
            let is_new_paragraph = spacing > avg_font_size * 1.5;

            // Apply formatting rules
            if is_large_font && is_short_line {
                // Likely a heading
                if !result.is_empty() && !result.ends_with("\n\n") {
                    result.push_str("\n\n");
                }
                result.push_str("## ");
                result.push_str(&line_text);
                result.push_str("\n\n");
            } else if is_numbered_section && is_short_line {
                // Numbered section heading
                if !result.is_empty() && !result.ends_with("\n\n") {
                    result.push_str("\n\n");
                }
                result.push_str("## ");
                result.push_str(&line_text);
                result.push_str("\n\n");
            } else {
                // Regular text
                if is_new_paragraph && !result.is_empty() && !result.ends_with("\n\n") {
                    result.push_str("\n\n");
                } else if !result.is_empty() && !result.ends_with('\n') && !result.ends_with(' ') {
                    // Continue previous line
                    if line_text.starts_with(|c: char| c.is_lowercase() || c.is_numeric()) {
                        result.push(' ');
                    } else {
                        result.push_str("\n\n");
                    }
                }

                result.push_str(&line_text);
            }

            prev_y = line_y;
        }

        // Final cleanup - apply the same join_paragraph_lines logic for consistency
        Self::join_paragraph_lines(&result)
    }

    /// Convert PDF to Markdown using pdf-extract (high quality)
    async fn convert_to_markdown_pdf_extract(
        &self,
        path: &Path,
        options: &ConversionOptions,
    ) -> Result<Vec<ConversionOutput>> {
        use pdf_extract::extract_text;

        if options.split_pages {
            // For split pages: extract each PDF page individually using lopdf
            // This accurately reflects the actual PDF page boundaries
            let parser = PdfParser::load(path)?;
            let pages = parser.extract_all_pages()?;

            // Process each physical PDF page
            let outputs: Vec<ConversionOutput> = pages
                .iter()
                .enumerate()
                .map(|(i, page)| {
                    // lopdf returns text with few line breaks, need to add them
                    let page_markdown = if page.text.lines().count() > 20 {
                        // If text has many lines, use join algorithm (like pdf-extract)
                        Self::join_paragraph_lines(&page.text)
                    } else {
                        // If text is in few/long lines, break it up into paragraphs
                        Self::break_long_text_into_paragraphs(&page.text)
                    };

                    let token_count = page_markdown.len() / 4;
                    let data = page_markdown.into_bytes();
                    let size_bytes = data.len() as u64;

                    ConversionOutput {
                        page_number: i,
                        data,
                        metadata: OutputMetadata {
                            size_bytes,
                            chunk_count: 1,
                            token_count: Some(token_count),
                        },
                    }
                })
                .collect();

            Ok(outputs)
        } else {
            // Extract all text at once (better quality than lopdf)
            let raw_text = extract_text(path).map_err(|e| {
                crate::TransmutationError::engine_error(
                    "PDF Parser",
                    format!("pdf-extract failed: {:?}", e),
                )
            })?;

            // Post-process: join lines that belong to same paragraph (like Docling does)
            let markdown = Self::join_paragraph_lines(&raw_text);

            // Calculate metrics before moving markdown
            let token_count = markdown.len() / 4;
            let data = markdown.into_bytes();
            let size_bytes = data.len() as u64;

            Ok(vec![ConversionOutput {
                page_number: 0,
                data,
                metadata: OutputMetadata {
                    size_bytes,
                    chunk_count: 1,
                    token_count: Some(token_count),
                },
            }])
        }
    }

    /// Convert PDF to Markdown
    async fn convert_to_markdown(
        &self,
        parser: &PdfParser,
        options: &ConversionOptions,
    ) -> Result<Vec<ConversionOutput>> {
        let pages = parser.extract_all_pages()?;

        // Use layout analysis if text blocks are available
        let analyzer = LayoutAnalyzer::new();
        let markdown_outputs: Vec<String> = if options.split_pages {
            // Generate separate markdown for each page
            pages
                .iter()
                .map(|page| {
                    if !page.text_blocks.is_empty() {
                        // Use semantic layout analysis
                        let analyzed = analyzer.analyze(&page.text_blocks);
                        MarkdownGenerator::from_analyzed_blocks(&analyzed, options.clone())
                    } else {
                        // Fallback to simple text extraction
                        let text = if options.optimize_for_llm {
                            self.text_optimizer.optimize(&page.text)
                        } else {
                            page.text.clone()
                        };
                        MarkdownGenerator::from_text(&text, options.clone())
                    }
                })
                .collect()
        } else {
            // Combined document with all pages
            let mut all_analyzed_blocks = Vec::new();

            for page in &pages {
                if !page.text_blocks.is_empty() {
                    let analyzed = analyzer.analyze(&page.text_blocks);
                    all_analyzed_blocks.extend(analyzed);
                } else {
                    // Fallback to text extraction
                    let text = if options.optimize_for_llm {
                        self.text_optimizer.optimize(&page.text)
                    } else {
                        page.text.clone()
                    };
                    // Convert text to simple paragraph blocks
                    for para in text.split("\n\n") {
                        if !para.trim().is_empty() {
                            all_analyzed_blocks.push(
                                crate::engines::layout_analyzer::AnalyzedBlock {
                                    block_type:
                                        crate::engines::layout_analyzer::BlockType::Paragraph,
                                    content: para.to_string(),
                                    level: None,
                                    font_size: 10.0,
                                    y_position: 0.0,
                                },
                            );
                        }
                    }
                }
            }

            vec![MarkdownGenerator::from_analyzed_blocks(
                &all_analyzed_blocks,
                options.clone(),
            )]
        };

        // Convert to ConversionOutput with optional chunking
        let outputs: Vec<ConversionOutput> = markdown_outputs
            .into_iter()
            .enumerate()
            .map(|(i, md): (usize, String)| {
                // Apply chunking if requested
                let (final_content, chunk_count, token_count) = if options.max_chunk_size > 0 {
                    let chunker = Chunker::from_options(&options);
                    let chunks = chunker.chunk(&md);
                    let total_tokens: usize = chunks.iter().map(|c| c.token_count).sum();
                    let chunk_count = chunks.len();

                    // Combine chunks with separators
                    let combined = chunks
                        .into_iter()
                        .map(|c| c.content)
                        .collect::<Vec<_>>()
                        .join("\n\n---\n\n");

                    (combined, chunk_count, Some(total_tokens))
                } else {
                    let len = md.len();
                    (md, 1, Some(len / 4)) // Rough token estimate
                };

                ConversionOutput {
                    page_number: if options.split_pages { i } else { 0 },
                    data: final_content.as_bytes().to_vec(),
                    metadata: OutputMetadata {
                        size_bytes: final_content.len() as u64,
                        chunk_count,
                        token_count,
                    },
                }
            })
            .collect();

        Ok(outputs)
    }

    /// Convert PDF to JSON
    async fn convert_to_json(
        &self,
        parser: &PdfParser,
        options: &ConversionOptions,
    ) -> Result<Vec<ConversionOutput>> {
        let pages = parser.extract_all_pages()?;
        let metadata = parser.get_metadata();

        // Create JSON structure
        let json_data = serde_json::json!({
            "format": "pdf",
            "metadata": {
                "title": metadata.title,
                "author": metadata.author,
                "created": metadata.created,
                "modified": metadata.modified,
                "page_count": metadata.page_count,
            },
            "pages": pages.iter().map(|page| serde_json::json!({
                "number": page.number,
                "text": if options.optimize_for_llm {
                    self.text_optimizer.optimize(&page.text)
                } else {
                    page.text.clone()
                },
                "width": page.width,
                "height": page.height,
            })).collect::<Vec<_>>(),
        });

        let json_string = if options.include_metadata {
            serde_json::to_string_pretty(&json_data)
        } else {
            serde_json::to_string(&json_data)
        }
        .map_err(|e| crate::TransmutationError::SerializationError(e))?;

        Ok(vec![ConversionOutput {
            page_number: 0,
            data: json_string.as_bytes().to_vec(),
            metadata: OutputMetadata {
                size_bytes: json_string.len() as u64,
                chunk_count: pages.len(),
                token_count: None,
            },
        }])
    }

    /// Build document metadata from PDF
    fn build_metadata(&self, parser: &PdfParser) -> DocumentMetadata {
        let pdf_meta = parser.get_metadata();

        DocumentMetadata {
            title: pdf_meta.title,
            author: pdf_meta.author,
            created: pdf_meta.created,
            modified: pdf_meta.modified,
            page_count: pdf_meta.page_count,
            language: None, // TODO: Implement language detection
            custom: std::collections::HashMap::new(),
        }
    }
}

impl Default for PdfConverter {
    fn default() -> Self {
        Self::new()
    }
}

#[async_trait]
impl DocumentConverter for PdfConverter {
    fn supported_formats(&self) -> Vec<FileFormat> {
        vec![FileFormat::Pdf]
    }

    fn output_formats(&self) -> Vec<OutputFormat> {
        vec![
            OutputFormat::Markdown {
                split_pages: false,
                optimize_for_llm: true,
            },
            OutputFormat::Json {
                structured: true,
                include_metadata: true,
            },
        ]
    }

    async fn convert(
        &self,
        input: &Path,
        output_format: OutputFormat,
        options: ConversionOptions,
    ) -> Result<ConversionResult> {
        let start_time = Instant::now();

        // Load PDF
        let parser = PdfParser::load(input)?;

        // Get input file size
        let input_size = tokio::fs::metadata(input).await?.len();

        // Convert based on output format
        let content = match output_format {
            OutputFormat::Markdown { .. } => {
                // Use pdf-extract for best quality
                if options.use_precision_mode || options.use_ffi {
                    // High-precision mode: Docling-style layout analysis for ~95% similarity
                    // Also used for FFI mode which tries docling-parse C++ first
                    self.convert_with_docling_style(input, &options).await?
                } else {
                    // Fast mode: Pure Rust heuristics, ~81% similarity, much faster
                    self.convert_to_markdown_pdf_extract(input, &options)
                        .await?
                }
            }
            OutputFormat::Json { .. } => self.convert_to_json(&parser, &options).await?,
            OutputFormat::Image {
                format: _format,
                quality: _quality,
                dpi: _dpi,
            } => {
                #[cfg(feature = "pdf-to-image")]
                {
                    self.convert_to_images(input, _format, _quality, _dpi, &options)
                        .await?
                }
                #[cfg(not(feature = "pdf-to-image"))]
                {
                    return Err(crate::TransmutationError::InvalidOptions(
                        "PDF to image conversion requires pdf-to-image feature".to_string(),
                    ));
                }
            }
            _ => {
                return Err(crate::TransmutationError::InvalidOptions(format!(
                    "Unsupported output format for PDF: {:?}",
                    output_format
                )));
            }
        };

        // Calculate output size
        let output_size: u64 = content.iter().map(|c| c.metadata.size_bytes).sum();

        // Build metadata
        let metadata = self.build_metadata(&parser);
        let page_count = parser.page_count();

        // Extract tables if enabled
        let tables_extracted = if options.extract_tables {
            parser
                .extract_all_tables()
                .map(|tables| tables.iter().map(|(_, t)| t.len()).sum())
                .unwrap_or(0)
        } else {
            0
        };

        // Build statistics
        let duration = start_time.elapsed();
        let statistics = ConversionStatistics {
            input_size_bytes: input_size,
            output_size_bytes: output_size,
            duration,
            pages_processed: page_count,
            tables_extracted,
            images_extracted: 0, // TODO: Implement image extraction
            cache_hit: false,
        };

        Ok(ConversionResult {
            input_path: PathBuf::from(input),
            input_format: FileFormat::Pdf,
            output_format,
            content,
            metadata,
            statistics,
        })
    }

    fn metadata(&self) -> ConverterMetadata {
        ConverterMetadata {
            name: "PDF Converter".to_string(),
            version: env!("CARGO_PKG_VERSION").to_string(),
            description: "Pure Rust PDF to Markdown/JSON converter using lopdf".to_string(),
            external_deps: vec!["lopdf".to_string()],
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_pdf_converter_creation() {
        let converter = PdfConverter::new();
        assert_eq!(converter.supported_formats(), vec![FileFormat::Pdf]);
    }

    #[test]
    fn test_pdf_converter_metadata() {
        let converter = PdfConverter::new();
        let meta = converter.metadata();
        assert_eq!(meta.name, "PDF Converter");
        assert!(!meta.external_deps.is_empty());
    }

    #[test]
    fn test_join_paragraph_lines_utf8_boundary() {
        // Test with German text containing umlauts near the 500-byte boundary
        // "Gefährdungen" contains 'ä' which is a multibyte character (2 bytes in UTF-8)
        // This test ensures we don't panic when slicing at byte boundaries

        // Create a string where multibyte chars fall around byte 500
        let prefix = "A".repeat(495); // 495 ASCII chars = 495 bytes
        let german_text = "Elementare Gefährdungen"; // Contains ä (2 bytes)
        let suffix = " more text here for testing purposes";

        let input = format!("{}{}{}", prefix, german_text, suffix);

        // This should not panic - the fix ensures we find valid char boundaries
        let result = PdfConverter::join_paragraph_lines(&input);

        // The result should contain the original text (possibly reformatted)
        assert!(result.contains("Gefährdungen") || result.contains("Gef") || !result.is_empty());
    }

    #[test]
    fn test_join_paragraph_lines_multibyte_at_boundary() {
        // Specifically test when a multibyte character spans byte 500
        // Chinese characters are 3 bytes each in UTF-8

        // Create text where byte 499-501 is inside a Chinese character
        let prefix = "x".repeat(498); // 498 bytes
        let chinese = "中文测试"; // 4 Chinese chars = 12 bytes, first char at bytes 498-500
        let suffix = " end";

        let input = format!("{}{}{}", prefix, chinese, suffix);
        assert!(input.len() > 500);

        // Should not panic
        let result = PdfConverter::join_paragraph_lines(&input);
        assert!(!result.is_empty());
    }

    #[test]
    fn test_join_paragraph_lines_emoji_at_boundary() {
        // Emojis are 4 bytes in UTF-8
        let prefix = "y".repeat(497); // 497 bytes
        let emoji_text = "🎉🎊🎈"; // 3 emojis = 12 bytes
        let suffix = " celebration";

        let input = format!("{}{}{}", prefix, emoji_text, suffix);
        assert!(input.len() > 500);

        // Should not panic
        let result = PdfConverter::join_paragraph_lines(&input);
        assert!(!result.is_empty());
    }

    #[test]
    fn test_join_paragraph_lines_short_text() {
        // Text shorter than 500 bytes should work fine
        let input = "Short text with Ümläuts and émojis 🎉";

        let result = PdfConverter::join_paragraph_lines(input);
        assert!(!result.is_empty());
    }

    #[test]
    fn test_join_paragraph_lines_exactly_500_ascii() {
        // Exactly 500 ASCII characters
        let input = "a".repeat(500);

        let result = PdfConverter::join_paragraph_lines(&input);
        assert!(!result.is_empty());
    }

    #[test]
    fn test_join_paragraph_lines_cyrillic_text() {
        // Cyrillic characters are 2 bytes each
        let prefix = "z".repeat(499);
        let cyrillic = "Привет мир"; // Russian "Hello world"
        let suffix = " end";

        let input = format!("{}{}{}", prefix, cyrillic, suffix);

        // Should not panic
        let result = PdfConverter::join_paragraph_lines(&input);
        assert!(!result.is_empty());
    }

    #[test]
    fn test_join_paragraph_lines_mixed_scripts() {
        // Mix of different scripts with varying byte lengths
        let input = format!(
            "{}Latin äöü Ελληνικά 日本語 한국어 العربية 🌍🌎🌏",
            "x".repeat(450)
        );

        // Should not panic regardless of where the 500-byte boundary falls
        let result = PdfConverter::join_paragraph_lines(&input);
        assert!(!result.is_empty());
    }

    // Integration tests with real PDFs will be in tests/pdf_tests.rs
}