oxify_connect_vision/
pdf_processing.rs

1//! Multi-page PDF document processing.
2//!
3//! This module provides functionality to process multi-page PDF documents,
4//! extracting text and structure from each page with support for page ordering,
5//! table of contents generation, and cross-page table handling.
6
7use crate::types::OcrResult;
8use crate::VisionProvider;
9use serde::{Deserialize, Serialize};
10use std::sync::Arc;
11
12/// Represents a processed page from a PDF document.
13#[derive(Debug, Clone, Serialize, Deserialize)]
14pub struct PdfPage {
15    /// Page number (1-indexed)
16    pub page_number: usize,
17    /// OCR result for this page
18    pub ocr_result: OcrResult,
19    /// Page dimensions (width, height) in points
20    pub dimensions: (f32, f32),
21    /// Page rotation in degrees
22    pub rotation: i32,
23}
24
25/// Result of processing a multi-page PDF document.
26#[derive(Debug, Clone, Serialize, Deserialize)]
27pub struct PdfDocumentResult {
28    /// All processed pages in order
29    pub pages: Vec<PdfPage>,
30    /// Total number of pages
31    pub total_pages: usize,
32    /// Document metadata
33    pub metadata: PdfMetadata,
34    /// Combined text from all pages
35    pub full_text: String,
36}
37
38/// PDF document metadata.
39#[derive(Debug, Clone, Serialize, Deserialize, Default)]
40pub struct PdfMetadata {
41    /// Document title
42    pub title: Option<String>,
43    /// Document author
44    pub author: Option<String>,
45    /// Document subject
46    pub subject: Option<String>,
47    /// Creation date
48    pub creation_date: Option<String>,
49    /// Modification date
50    pub modification_date: Option<String>,
51    /// Producer/creator software
52    pub producer: Option<String>,
53}
54
/// Configuration for PDF processing.
///
/// Implements `PartialEq`/`Eq` so that two configurations can be compared
/// directly, e.g. with `assert_eq!` in tests or to detect a changed setting.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PdfProcessingConfig {
    /// Maximum number of pages to process (`None` for all pages).
    pub max_pages: Option<usize>,
    /// Page range to process as `(start, end)`; page numbers are 1-indexed.
    pub page_range: Option<(usize, usize)>,
    /// Enable table of contents generation.
    pub generate_toc: bool,
    /// Combine text from all pages.
    pub combine_text: bool,
    /// DPI for rendering PDF pages to images.
    pub render_dpi: u32,
}
69
70impl Default for PdfProcessingConfig {
71    fn default() -> Self {
72        Self {
73            max_pages: None,
74            page_range: None,
75            generate_toc: true,
76            combine_text: true,
77            render_dpi: 300,
78        }
79    }
80}
81
82/// PDF document processor.
83pub struct PdfProcessor {
84    config: PdfProcessingConfig,
85}
86
87impl PdfProcessor {
88    /// Create a new PDF processor with default configuration.
89    pub fn new() -> Self {
90        Self {
91            config: PdfProcessingConfig::default(),
92        }
93    }
94
95    /// Create a new PDF processor with custom configuration.
96    pub fn with_config(config: PdfProcessingConfig) -> Self {
97        Self { config }
98    }
99
100    /// Process a PDF document from raw bytes.
101    ///
102    /// Note: This is a stub implementation. Real PDF processing would require
103    /// a PDF library like pdf-rs or pdfium to extract pages and render them as images.
104    pub async fn process_pdf(
105        &self,
106        _pdf_data: &[u8],
107        _provider: Arc<dyn VisionProvider>,
108    ) -> crate::Result<PdfDocumentResult> {
109        // Placeholder implementation
110        // In a real implementation, this would:
111        // 1. Parse the PDF using a PDF library
112        // 2. Extract metadata
113        // 3. Render each page to an image at specified DPI
114        // 4. Process each page image with the OCR provider
115        // 5. Combine results
116
117        let metadata = PdfMetadata::default();
118
119        let pages = vec![PdfPage {
120            page_number: 1,
121            ocr_result: OcrResult::from_text("[PDF processing requires pdf library]"),
122            dimensions: (612.0, 792.0), // US Letter size
123            rotation: 0,
124        }];
125
126        let full_text = if self.config.combine_text {
127            pages
128                .iter()
129                .map(|p| p.ocr_result.text.clone())
130                .collect::<Vec<_>>()
131                .join("\n\n")
132        } else {
133            String::new()
134        };
135
136        Ok(PdfDocumentResult {
137            total_pages: pages.len(),
138            pages,
139            metadata,
140            full_text,
141        })
142    }
143
144    /// Process a specific page range from a PDF.
145    pub async fn process_page_range(
146        &self,
147        pdf_data: &[u8],
148        provider: Arc<dyn VisionProvider>,
149        start_page: usize,
150        end_page: usize,
151    ) -> crate::Result<PdfDocumentResult> {
152        let mut config = self.config.clone();
153        config.page_range = Some((start_page, end_page));
154
155        let processor = Self::with_config(config);
156        processor.process_pdf(pdf_data, provider).await
157    }
158
159    /// Extract metadata from a PDF document.
160    pub fn extract_metadata(&self, _pdf_data: &[u8]) -> crate::Result<PdfMetadata> {
161        // Placeholder implementation
162        Ok(PdfMetadata::default())
163    }
164}
165
166impl Default for PdfProcessor {
167    fn default() -> Self {
168        Self::new()
169    }
170}
171
172impl PdfDocumentResult {
173    /// Get a specific page by number (1-indexed).
174    pub fn get_page(&self, page_number: usize) -> Option<&PdfPage> {
175        self.pages.iter().find(|p| p.page_number == page_number)
176    }
177
178    /// Generate a table of contents from page headings.
179    pub fn generate_toc(&self) -> Vec<TocEntry> {
180        let mut toc = Vec::new();
181
182        for page in &self.pages {
183            // Find heading blocks in the page
184            for block in &page.ocr_result.blocks {
185                if matches!(block.role, crate::types::BlockRole::Header) {
186                    toc.push(TocEntry {
187                        title: block.text.clone(),
188                        page_number: page.page_number,
189                        level: 1, // Could be enhanced with heading level detection
190                    });
191                }
192            }
193        }
194
195        toc
196    }
197
198    /// Export to a single markdown document.
199    pub fn to_markdown(&self) -> String {
200        let mut output = String::new();
201
202        // Add metadata if available
203        if let Some(ref title) = self.metadata.title {
204            output.push_str(&format!("# {}\n\n", title));
205        }
206
207        // Add pages
208        for page in &self.pages {
209            output.push_str(&format!("## Page {}\n\n", page.page_number));
210            output.push_str(&page.ocr_result.markdown);
211            output.push_str("\n\n");
212        }
213
214        output
215    }
216
217    /// Export to HTML format.
218    pub fn to_html(&self) -> String {
219        let mut output = String::from("<!DOCTYPE html>\n<html>\n<head>\n");
220
221        if let Some(ref title) = self.metadata.title {
222            output.push_str(&format!("  <title>{}</title>\n", title));
223        }
224
225        output.push_str("</head>\n<body>\n");
226
227        for page in &self.pages {
228            output.push_str(&format!(
229                "  <div class=\"page\" data-page=\"{}\">\n",
230                page.page_number
231            ));
232            output.push_str(&format!("    <h2>Page {}</h2>\n", page.page_number));
233            output.push_str("    <div class=\"content\">\n");
234            output.push_str(&format!("      {}\n", page.ocr_result.text));
235            output.push_str("    </div>\n");
236            output.push_str("  </div>\n");
237        }
238
239        output.push_str("</body>\n</html>");
240        output
241    }
242
243    /// Search for text across all pages.
244    pub fn search(&self, query: &str) -> Vec<SearchResult> {
245        let mut results = Vec::new();
246
247        for page in &self.pages {
248            if page
249                .ocr_result
250                .text
251                .to_lowercase()
252                .contains(&query.to_lowercase())
253            {
254                results.push(SearchResult {
255                    page_number: page.page_number,
256                    context: self.extract_context(&page.ocr_result.text, query, 50),
257                });
258            }
259        }
260
261        results
262    }
263
264    /// Extract context around a search term.
265    fn extract_context(&self, text: &str, query: &str, context_chars: usize) -> String {
266        let lower_text = text.to_lowercase();
267        let lower_query = query.to_lowercase();
268
269        if let Some(pos) = lower_text.find(&lower_query) {
270            let start = pos.saturating_sub(context_chars);
271            let end = (pos + query.len() + context_chars).min(text.len());
272
273            let mut context = text[start..end].to_string();
274
275            if start > 0 {
276                context = format!("...{}", context);
277            }
278            if end < text.len() {
279                context.push_str("...");
280            }
281
282            context
283        } else {
284            String::new()
285        }
286    }
287}
288
289/// Table of contents entry.
290#[derive(Debug, Clone, Serialize, Deserialize)]
291pub struct TocEntry {
292    /// Entry title
293    pub title: String,
294    /// Page number where this entry appears
295    pub page_number: usize,
296    /// Heading level (1-6)
297    pub level: usize,
298}
299
300/// Search result for text search in PDF.
301#[derive(Debug, Clone, Serialize, Deserialize)]
302pub struct SearchResult {
303    /// Page number where match was found
304    pub page_number: usize,
305    /// Context around the match
306    pub context: String,
307}
308
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an unrotated US Letter page holding `text`.
    fn letter_page(page_number: usize, text: &'static str) -> PdfPage {
        PdfPage {
            page_number,
            ocr_result: OcrResult::from_text(text),
            dimensions: (612.0, 792.0),
            rotation: 0,
        }
    }

    /// Wrap `pages` in a result with default metadata and no combined text.
    fn document_of(pages: Vec<PdfPage>) -> PdfDocumentResult {
        PdfDocumentResult {
            total_pages: pages.len(),
            pages,
            metadata: PdfMetadata::default(),
            full_text: String::new(),
        }
    }

    #[test]
    fn test_pdf_processing_config_default() {
        let PdfProcessingConfig {
            render_dpi,
            generate_toc,
            combine_text,
            ..
        } = PdfProcessingConfig::default();

        assert_eq!(render_dpi, 300);
        assert!(generate_toc);
        assert!(combine_text);
    }

    #[test]
    fn test_pdf_page_creation() {
        let page = letter_page(1, "Test page");

        assert_eq!(page.page_number, 1);
        assert_eq!(page.ocr_result.text, "Test page");
    }

    #[test]
    fn test_pdf_document_get_page() {
        let result = document_of(vec![letter_page(1, "Page 1"), letter_page(2, "Page 2")]);

        let first = result.get_page(1).expect("page 1 should be present");
        assert_eq!(first.ocr_result.text, "Page 1");

        assert!(result.get_page(3).is_none());
    }

    #[test]
    fn test_pdf_document_search() {
        let result = document_of(vec![
            letter_page(1, "Hello world from page 1"),
            letter_page(2, "Different content on page 2"),
        ]);

        let hits = result.search("world");
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].page_number, 1);
    }

    #[test]
    fn test_toc_entry_creation() {
        let entry = TocEntry {
            title: String::from("Chapter 1"),
            page_number: 5,
            level: 1,
        };

        assert_eq!(entry.title, "Chapter 1");
        assert_eq!(entry.page_number, 5);
        assert_eq!(entry.level, 1);
    }

    #[test]
    fn test_pdf_metadata() {
        let metadata = PdfMetadata {
            title: Some(String::from("Test Document")),
            author: Some(String::from("Test Author")),
            producer: Some(String::from("Test Producer")),
            ..PdfMetadata::default()
        };

        assert_eq!(metadata.title.as_deref(), Some("Test Document"));
        assert_eq!(metadata.author.as_deref(), Some("Test Author"));
    }
}