oxirs_vec/content_processing/
pdf_handler.rs

1//! PDF document handler for content processing
2//!
3//! This module provides PDF document parsing and content extraction capabilities.
4
5#[cfg(feature = "content-processing")]
6use crate::content_processing::{
7    ContentExtractionConfig, ContentLocation, DocumentFormat, DocumentStructure, ExtractedContent,
8    ExtractedImage, ExtractedLink, ExtractedTable, FormatHandler, Heading, ProcessingStats,
9    TocEntry,
10};
11#[cfg(feature = "content-processing")]
12use anyhow::{anyhow, Result};
13#[cfg(feature = "content-processing")]
14use std::collections::HashMap;
15
16/// PDF document handler
17#[cfg(feature = "content-processing")]
///
/// The handler is a unit struct with no fields, so it derives the cheap
/// standard traits: it can be debug-printed, copied freely, and built with
/// `PdfHandler::default()` as well as with the bare `PdfHandler` expression.
#[derive(Debug, Clone, Copy, Default)]
pub struct PdfHandler;
19
/// [`FormatHandler`] implementation that turns raw PDF bytes into an
/// [`ExtractedContent`] using `pdf-extract` plus text heuristics.
#[cfg(feature = "content-processing")]
impl FormatHandler for PdfHandler {
    /// Extracts text, metadata and structural elements from a PDF held in memory.
    ///
    /// The text comes from `pdf_extract::extract_text_from_mem`. Metadata always
    /// contains `format`, `size` and `extraction_method`, plus whatever
    /// `extract_pdf_metadata` finds. Tables, links and images are only collected
    /// when the matching `config` flag is set; a failing image extraction falls
    /// back to an empty list instead of failing the whole call. Chunks, language,
    /// processing stats and the audio/video/cross-modal fields are left
    /// empty/default.
    ///
    /// # Errors
    ///
    /// Returns an error when `pdf-extract` fails, or when the extracted text is
    /// empty or whitespace-only.
    fn extract_content(
        &self,
        data: &[u8],
        config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let text = pdf_extract::extract_text_from_mem(data)
            .map_err(|e| anyhow!("Failed to extract text from PDF: {}", e))?;
        if text.trim().is_empty() {
            return Err(anyhow!("No text content found in PDF"));
        }

        // Fixed entries first, then anything found in the raw bytes.
        let mut metadata = HashMap::from([
            ("format".to_string(), "PDF".to_string()),
            ("size".to_string(), data.len().to_string()),
            ("extraction_method".to_string(), "pdf-extract".to_string()),
        ]);
        metadata.extend(self.extract_pdf_metadata(data).unwrap_or_default());

        // Page estimate: number of form-feed characters, but never less than one.
        let page_count = text.chars().filter(|&c| c == '\x0C').count().max(1);

        let headings = self.extract_pdf_headings(&text);
        let table_of_contents = self.generate_table_of_contents(&headings);

        let tables = config
            .extract_tables
            .then(|| self.extract_pdf_tables(&text))
            .unwrap_or_default();
        let links = config
            .extract_links
            .then(|| self.extract_pdf_links(&text))
            .unwrap_or_default();
        let images = if config.extract_images {
            // Image extraction is best-effort: an error yields no images.
            self.extract_pdf_images(data, config).unwrap_or_default()
        } else {
            Vec::new()
        };

        // `section_count` is read before `headings` is moved into the struct.
        let structure = DocumentStructure {
            title: self.extract_pdf_title(&text),
            section_count: headings.len().max(1),
            page_count,
            table_of_contents,
            headings,
        };

        Ok(ExtractedContent {
            format: DocumentFormat::Pdf,
            text: text.trim().to_owned(),
            metadata,
            images,
            tables,
            links,
            structure,
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// Returns `true` when `data` begins with the `%PDF` magic bytes.
    fn can_handle(&self, data: &[u8]) -> bool {
        data.starts_with(b"%PDF")
    }

    /// File extensions served by this handler (only `pdf`).
    fn supported_extensions(&self) -> Vec<&'static str> {
        Vec::from(["pdf"])
    }
}
110
/// Heuristic helpers used by the PDF [`FormatHandler`] implementation.
///
/// These methods work on plain text (or, for metadata, on a lossy UTF-8 view of
/// the raw bytes) and rely on pattern matching only; they do not parse the PDF
/// object structure.
#[cfg(feature = "content-processing")]
impl PdfHandler {
    /// Guesses the document title from the first ten lines of `text`.
    ///
    /// Returns the first trimmed line that is longer than 5 and shorter than 100
    /// bytes and contains neither `http` nor `www`, or `None` if no line
    /// qualifies.
    fn extract_pdf_title(&self, text: &str) -> Option<String> {
        text.lines()
            .take(10)
            .map(str::trim)
            .find(|line| {
                line.len() > 5
                    && line.len() < 100
                    && !line.contains("http")
                    && !line.contains("www")
            })
            .map(String::from)
    }

    /// Collects lines of `text` that look like headings.
    ///
    /// A trimmed line qualifies when it is longer than 5 and shorter than 80
    /// bytes, has at most 10 words, and at least half of its words start with an
    /// uppercase character. Every heading is reported at level 1 with its
    /// 1-based line number.
    fn extract_pdf_headings(&self, text: &str) -> Vec<Heading> {
        text.lines()
            .enumerate()
            .filter_map(|(idx, line)| {
                let trimmed = line.trim();
                if trimmed.len() <= 5 || trimmed.len() >= 80 {
                    return None;
                }

                // Count words and capitalized words in a single pass.
                let (word_count, capitalized) = trimmed.split_whitespace().fold(
                    (0usize, 0usize),
                    |(words, caps), word| {
                        let starts_upper = word.chars().next().is_some_and(char::is_uppercase);
                        (words + 1, caps + usize::from(starts_upper))
                    },
                );

                // Multiply instead of dividing: `word_count / 2` truncates, which
                // would let a single all-lowercase word pass (0 >= 1 / 2).
                let mostly_capitalized = capitalized * 2 >= word_count;

                (mostly_capitalized && word_count <= 10).then(|| Heading {
                    level: 1, // No level analysis is attempted.
                    text: trimmed.to_string(),
                    location: ContentLocation {
                        page: None,
                        section: None,
                        char_offset: None,
                        line: Some(idx + 1),
                        column: None,
                    },
                })
            })
            .collect()
    }

    /// Extracts tables from PDF text using pattern recognition.
    ///
    /// A line is treated as a table row when it splits into 2 to 8
    /// whitespace-separated cells and either contains a cell that parses as a
    /// number, contains a tab, or contains at least two double-space runs. Two or
    /// more consecutive row lines form a table: the first row becomes the header
    /// and the rest become body rows. Any other line ends the current run; a run
    /// of a single row is discarded. `location.line` is the 1-based line on which
    /// the table starts.
    fn extract_pdf_tables(&self, text: &str) -> Vec<ExtractedTable> {
        /// Emits the buffered run as a table if it has a header and at least one
        /// body row; the buffer is left empty either way.
        fn flush(
            pending: &mut Vec<Vec<String>>,
            start_line: usize,
            tables: &mut Vec<ExtractedTable>,
        ) {
            if pending.len() < 2 {
                pending.clear();
                return;
            }
            let mut rows = std::mem::take(pending).into_iter();
            let headers = rows.next().unwrap_or_default();
            tables.push(ExtractedTable {
                headers,
                rows: rows.collect(),
                caption: None,
                location: ContentLocation {
                    page: None,
                    section: None,
                    char_offset: None,
                    line: Some(start_line),
                    column: None,
                },
            });
        }

        let mut tables = Vec::new();
        let mut pending: Vec<Vec<String>> = Vec::new();
        let mut start_line = 0;

        for (idx, line) in text.lines().enumerate() {
            let trimmed = line.trim();
            let cells: Vec<&str> = trimmed.split_whitespace().collect();

            let looks_like_row = (2..=8).contains(&cells.len())
                && (cells.iter().any(|cell| cell.parse::<f64>().is_ok())
                    || trimmed.contains('\t')
                    || trimmed.matches("  ").count() >= 2);

            if looks_like_row {
                if pending.is_empty() {
                    start_line = idx + 1;
                }
                pending.push(cells.into_iter().map(String::from).collect());
            } else {
                // Every non-row line ends the run, so rows separated by prose are
                // never stitched together into one table.
                flush(&mut pending, start_line, &mut tables);
            }
        }

        // A table may run up to the last line of the document.
        flush(&mut pending, start_line, &mut tables);

        tables
    }

    /// Extracts links from PDF text.
    ///
    /// Finds `http://` / `https://` URLs (with trailing `.`, `,`, `)`, `]`, `}`
    /// stripped) followed by e-mail addresses, which are reported as `mailto:`
    /// URLs. No location information is recorded.
    fn extract_pdf_links(&self, text: &str) -> Vec<ExtractedLink> {
        // Compiled once per process; compiling a regex on every call is wasteful.
        static URL_RE: std::sync::OnceLock<regex::Regex> = std::sync::OnceLock::new();
        static EMAIL_RE: std::sync::OnceLock<regex::Regex> = std::sync::OnceLock::new();

        let url_regex = URL_RE.get_or_init(|| {
            regex::Regex::new(r"https?://[^\s\)]+").expect("URL pattern is a valid regex")
        });
        let email_regex = EMAIL_RE.get_or_init(|| {
            regex::Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
                .expect("e-mail pattern is a valid regex")
        });

        let unlocated = || ContentLocation {
            page: None,
            section: None,
            char_offset: None,
            line: None,
            column: None,
        };

        let urls = url_regex.find_iter(text).map(|mat| {
            let url = mat
                .as_str()
                .trim_end_matches(&['.', ',', ')', ']', '}'][..]);
            ExtractedLink {
                url: url.to_string(),
                text: url.to_string(),
                title: None,
                location: unlocated(),
            }
        });

        let emails = email_regex.find_iter(text).map(|mat| {
            let email = mat.as_str();
            ExtractedLink {
                url: format!("mailto:{}", email),
                text: email.to_string(),
                title: None,
                location: unlocated(),
            }
        });

        urls.chain(emails).collect()
    }

    /// Extracts basic metadata from PDF bytes.
    ///
    /// Scans a lossy UTF-8 view of `data` for literal-string entries such as
    /// `/Title (...)` and returns the first match of each of `title`, `author`,
    /// `subject` and `creation_date`. Values are trimmed and empty values are
    /// skipped. Returns `None` when nothing was found.
    fn extract_pdf_metadata(&self, data: &[u8]) -> Option<HashMap<String, String>> {
        // (key as written in the PDF, key used in the returned map)
        const FIELDS: [(&str, &str); 4] = [
            ("Title", "title"),
            ("Author", "author"),
            ("Subject", "subject"),
            ("CreationDate", "creation_date"),
        ];
        // Compiled once per process rather than on every call.
        static PATTERNS: std::sync::OnceLock<Vec<(&'static str, regex::Regex)>> =
            std::sync::OnceLock::new();

        let patterns = PATTERNS.get_or_init(|| {
            FIELDS
                .iter()
                .map(|&(pdf_key, out_key)| {
                    let pattern = format!(r"/{}\s*\(\s*([^)]+)\s*\)", pdf_key);
                    let compiled = regex::Regex::new(&pattern)
                        .expect("metadata pattern is a valid regex");
                    (out_key, compiled)
                })
                .collect()
        });

        // Borrows `data` when it is valid UTF-8 and only allocates otherwise.
        let content = String::from_utf8_lossy(data);

        let metadata: HashMap<String, String> = patterns
            .iter()
            .filter_map(|&(out_key, ref re)| {
                // The greedy capture group swallows trailing whitespace, so trim.
                let value = re.captures(&content)?.get(1)?.as_str().trim();
                (!value.is_empty()).then(|| (out_key.to_string(), value.to_string()))
            })
            .collect();

        (!metadata.is_empty()).then_some(metadata)
    }

    /// Extracts images from a PDF (placeholder).
    ///
    /// Embedded-image extraction is not implemented; the result is always an
    /// empty list, regardless of the arguments.
    fn extract_pdf_images(
        &self,
        _data: &[u8],
        _config: &ContentExtractionConfig,
    ) -> Result<Vec<ExtractedImage>> {
        Ok(Vec::new())
    }

    /// Generates a table of contents with one entry per heading, in order.
    ///
    /// Each entry copies the heading's text, level, page and location.
    fn generate_table_of_contents(&self, headings: &[Heading]) -> Vec<TocEntry> {
        headings
            .iter()
            .map(|heading| TocEntry {
                title: heading.text.clone(),
                level: heading.level,
                page: heading.location.page,
                location: heading.location.clone(),
            })
            .collect()
    }
}