// reasonkit-mem 0.1.7
//
// High-performance vector database & RAG memory layer - hybrid search,
// embeddings, RAPTOR trees, BM25 fusion, and semantic retrieval for AI systems.
//! PDF Document Extraction
//!
//! This module provides high-fidelity PDF document extraction with
//! layout preservation for RAG pipelines.
//!
//! # Features
//! - Text extraction with layout preservation
//! - Table structure detection
//! - Multi-column support
//! - Metadata extraction
//!
//! Enable with: `cargo build --features pdf-parsing`

#![cfg(feature = "pdf-parsing")]

use anyhow::Result;
use serde::{Deserialize, Serialize};

// Re-exports
pub use lopdf;
pub use pdf_extract;

/// Configuration for PDF extraction
/// Configuration for PDF extraction
///
/// Use [`PdfConfig::default`] for sensible defaults (layout preservation
/// and metadata extraction on, OCR off, unlimited pages).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PdfConfig {
    /// Preserve layout structure
    pub preserve_layout: bool,
    /// Extract metadata (title, author, ...) from the document Info dictionary
    pub extract_metadata: bool,
    /// OCR for scanned pages (if available)
    // NOTE(review): no OCR code path is visible in this module — confirm
    // whether this flag is consumed elsewhere.
    pub enable_ocr: bool,
    /// Maximum pages to extract (0 = unlimited)
    pub max_pages: usize,
}

impl Default for PdfConfig {
    fn default() -> Self {
        Self {
            preserve_layout: true,
            extract_metadata: true,
            enable_ocr: false,
            max_pages: 0,
        }
    }
}

/// Extracted PDF document
/// Extracted PDF document
///
/// The result of [`PdfExtractor::extract_bytes`] /
/// [`PdfExtractor::extract_file`]: metadata plus the text content,
/// both as a whole and split per page.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedDocument {
    /// Document metadata (Info dictionary fields and page count)
    pub metadata: DocumentMetadata,
    /// Extracted pages, in document order
    pub pages: Vec<ExtractedPage>,
    /// Full text content of the whole document
    pub full_text: String,
}

/// Document metadata
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct DocumentMetadata {
    pub title: Option<String>,
    pub author: Option<String>,
    pub subject: Option<String>,
    pub keywords: Vec<String>,
    pub creator: Option<String>,
    pub producer: Option<String>,
    pub creation_date: Option<String>,
    pub modification_date: Option<String>,
    pub page_count: usize,
}

/// Extracted page content
/// Extracted page content
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedPage {
    /// Page number (1-indexed)
    pub page_number: usize,
    /// Page text content
    pub text: String,
    /// Detected sections
    // NOTE(review): `PdfExtractor::split_into_pages` currently always
    // leaves this empty — section detection is not implemented here.
    pub sections: Vec<Section>,
    /// Detected tables
    // NOTE(review): likewise always empty in this module — table detection
    // would require layout analysis.
    pub tables: Vec<Table>,
}

/// A section of the document
/// A section of the document
///
/// Represents a heading-delimited region of page text.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Section {
    /// Section heading (if detected)
    pub heading: Option<String>,
    /// Section content
    pub content: String,
    /// Nesting level (0 = top level)
    pub level: usize,
}

/// A detected table
/// A detected table
///
/// Can be rendered to Markdown via [`Table::to_markdown`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Table {
    /// Table caption (if any)
    pub caption: Option<String>,
    /// Header row (may be empty if no header was detected)
    pub headers: Vec<String>,
    /// Data rows; each inner `Vec` is one row of cell values
    pub rows: Vec<Vec<String>>,
}

impl Table {
    /// Render the table as a Markdown (GFM pipe-style) table.
    ///
    /// The optional caption is emitted in bold above the table. Cell and
    /// header text containing `|` is escaped as `\|`, and embedded
    /// newlines are replaced with spaces, so that cell content cannot
    /// break the table's column structure (the previous version emitted
    /// such characters verbatim, corrupting the output).
    pub fn to_markdown(&self) -> String {
        // Sanitize one cell so it is safe inside a pipe table.
        fn cell(s: &str) -> String {
            s.replace('|', "\\|").replace('\n', " ")
        }

        let mut md = String::new();

        if let Some(caption) = &self.caption {
            md.push_str(&format!("**{}**\n\n", caption));
        }

        if !self.headers.is_empty() {
            let header_cells: Vec<String> = self.headers.iter().map(|h| cell(h)).collect();
            md.push_str("| ");
            md.push_str(&header_cells.join(" | "));
            md.push_str(" |\n");

            // Separator row: one `---` per header column.
            md.push_str("| ");
            md.push_str(&vec!["---"; self.headers.len()].join(" | "));
            md.push_str(" |\n");
        }

        for row in &self.rows {
            let cells: Vec<String> = row.iter().map(|c| cell(c)).collect();
            md.push_str("| ");
            md.push_str(&cells.join(" | "));
            md.push_str(" |\n");
        }

        md
    }
}

/// PDF extractor
/// PDF extractor
///
/// Wraps a [`PdfConfig`] and performs text + metadata extraction; see
/// [`PdfExtractor::extract_file`] and [`PdfExtractor::extract_bytes`].
pub struct PdfExtractor {
    // Extraction configuration supplied at construction time.
    config: PdfConfig,
}

impl PdfExtractor {
    /// Create a new extractor with the given configuration
    pub fn new(config: PdfConfig) -> Self {
        Self { config }
    }

    /// Extract text from a PDF file
    pub fn extract_file(&self, path: &std::path::Path) -> Result<ExtractedDocument> {
        let bytes = std::fs::read(path)?;
        self.extract_bytes(&bytes)
    }

    /// Extract text from PDF bytes
    pub fn extract_bytes(&self, bytes: &[u8]) -> Result<ExtractedDocument> {
        // Use pdf-extract for text extraction
        let text = pdf_extract::extract_text_from_mem(bytes)?;

        // Use lopdf for metadata
        let doc = lopdf::Document::load_mem(bytes)?;
        let metadata = self.extract_metadata(&doc);

        // Split into pages (simplified - real implementation would use page boundaries)
        let pages = self.split_into_pages(&text, metadata.page_count);

        Ok(ExtractedDocument {
            metadata,
            pages,
            full_text: text,
        })
    }

    fn extract_metadata(&self, doc: &lopdf::Document) -> DocumentMetadata {
        let mut metadata = DocumentMetadata::default();
        metadata.page_count = doc.get_pages().len();

        // Extract info dictionary if available
        if let Ok(info) = doc.trailer.get(b"Info") {
            if let Ok(info_ref) = info.as_reference() {
                if let Ok(info_dict) = doc.get_dictionary(info_ref) {
                    metadata.title = self.get_string(info_dict, b"Title");
                    metadata.author = self.get_string(info_dict, b"Author");
                    metadata.subject = self.get_string(info_dict, b"Subject");
                    metadata.creator = self.get_string(info_dict, b"Creator");
                    metadata.producer = self.get_string(info_dict, b"Producer");
                }
            }
        }

        metadata
    }

    fn get_string(&self, dict: &lopdf::Dictionary, key: &[u8]) -> Option<String> {
        dict.get(key)
            .ok()
            .and_then(|v| v.as_str().ok())
            .and_then(|s| String::from_utf8(s.to_vec()).ok())
    }

    fn split_into_pages(&self, text: &str, page_count: usize) -> Vec<ExtractedPage> {
        // Simple heuristic: split by form feed or estimate based on page count
        let chunks: Vec<&str> = if text.contains('\x0C') {
            text.split('\x0C').collect()
        } else if page_count > 0 {
            let chars_per_page = text.len() / page_count;
            text.as_bytes()
                .chunks(chars_per_page)
                .map(|c| std::str::from_utf8(c).unwrap_or(""))
                .collect()
        } else {
            vec![text]
        };

        chunks
            .into_iter()
            .enumerate()
            .map(|(i, content)| ExtractedPage {
                page_number: i + 1,
                text: content.to_string(),
                sections: Vec::new(), // Would require more sophisticated parsing
                tables: Vec::new(),   // Would require layout analysis
            })
            .collect()
    }
}

impl Default for PdfExtractor {
    fn default() -> Self {
        Self::new(PdfConfig::default())
    }
}

/// Chunk a document for RAG ingestion
pub fn chunk_for_rag(
    doc: &ExtractedDocument,
    chunk_size: usize,
    overlap: usize,
) -> Vec<DocumentChunk> {
    let mut chunks = Vec::new();
    let text = &doc.full_text;
    let mut start = 0;

    while start < text.len() {
        let end = (start + chunk_size).min(text.len());

        // Try to break at a sentence boundary
        let actual_end = if end < text.len() {
            text[start..end]
                .rfind(|c| c == '.' || c == '!' || c == '?')
                .map(|i| start + i + 1)
                .unwrap_or(end)
        } else {
            end
        };

        chunks.push(DocumentChunk {
            content: text[start..actual_end].to_string(),
            page_start: 1, // Would need proper page tracking
            page_end: 1,
            chunk_index: chunks.len(),
            metadata: doc.metadata.clone(),
        });

        start = if overlap > 0 && actual_end > overlap {
            actual_end - overlap
        } else {
            actual_end
        };
    }

    chunks
}

/// A document chunk for RAG
/// A document chunk for RAG
///
/// Produced by [`chunk_for_rag`]; carries the chunk text together with
/// the source document's metadata for downstream indexing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentChunk {
    /// Chunk text content
    pub content: String,
    /// First source page covered by this chunk (1-indexed)
    // NOTE(review): `chunk_for_rag` currently always sets page_start and
    // page_end to 1 — page tracking is not implemented.
    pub page_start: usize,
    /// Last source page covered by this chunk (1-indexed)
    pub page_end: usize,
    /// Zero-based index of this chunk within the document
    pub chunk_index: usize,
    /// Metadata of the source document
    pub metadata: DocumentMetadata,
}

#[cfg(test)]
mod tests {
    use super::*;

    /// The default config enables layout preservation and metadata extraction.
    #[test]
    fn test_config_default() {
        let cfg = PdfConfig::default();
        assert!(cfg.preserve_layout);
        assert!(cfg.extract_metadata);
    }

    /// Markdown rendering includes the caption, header cells, and data rows.
    #[test]
    fn test_table_to_markdown() {
        let table = Table {
            caption: Some(String::from("Test Table")),
            headers: vec![String::from("Col1"), String::from("Col2")],
            rows: vec![vec![String::from("A"), String::from("B")]],
        };

        let rendered = table.to_markdown();
        for needle in ["Test Table", "Col1", "| A | B |"] {
            assert!(rendered.contains(needle), "missing {needle:?}");
        }
    }
}