Skip to main content

memvid_rs/text/
pdf.rs

//! PDF text extraction functionality
//!
//! This module provides utilities for extracting text content from PDF files.
//! Extraction tries the `pdf-extract` crate first and falls back to `lopdf`
//! when that fails, for maximum compatibility.
5
6use crate::error::{MemvidError, Result};
7use std::path::Path;
8
/// PDF text extraction processor.
///
/// A stateless unit type: every operation is an associated function that
/// takes the path of the PDF to work on.
#[derive(Debug, Clone, Copy, Default)]
pub struct PdfProcessor;
11
12impl PdfProcessor {
13    /// Extract text from a PDF file
14    pub fn extract_text<P: AsRef<Path>>(path: P) -> Result<String> {
15        let path = path.as_ref();
16
17        // Try using pdf-extract first (simpler)
18        match pdf_extract::extract_text(path) {
19            Ok(text) => Ok(text),
20            Err(e) => {
21                log::warn!("pdf-extract failed, trying lopdf: {}", e);
22                Self::extract_with_lopdf(path)
23            }
24        }
25    }
26
27    /// Extract text using lopdf (fallback method)
28    fn extract_with_lopdf<P: AsRef<Path>>(path: P) -> Result<String> {
29        use lopdf::Document;
30
31        let doc = Document::load(path)
32            .map_err(|e| MemvidError::Pdf(format!("Failed to load PDF: {}", e)))?;
33
34        let mut text = String::new();
35        let pages = doc.get_pages();
36
37        for (page_num, _) in pages {
38            match doc.extract_text(&[page_num]) {
39                Ok(page_text) => {
40                    text.push_str(&page_text);
41                    text.push_str("\n\n");
42                }
43                Err(e) => {
44                    log::warn!("Failed to extract text from page {}: {}", page_num, e);
45                }
46            }
47        }
48
49        if text.trim().is_empty() {
50            return Err(MemvidError::Pdf("No text extracted from PDF".to_string()));
51        }
52
53        Ok(text)
54    }
55
56    /// Extract text with page information
57    pub fn extract_text_with_pages<P: AsRef<Path>>(path: P) -> Result<Vec<(u32, String)>> {
58        use lopdf::Document;
59
60        let doc = Document::load(path)
61            .map_err(|e| MemvidError::Pdf(format!("Failed to load PDF: {}", e)))?;
62
63        let mut pages_text = Vec::new();
64        let pages = doc.get_pages();
65
66        for (page_num, _) in pages {
67            match doc.extract_text(&[page_num]) {
68                Ok(page_text) => {
69                    if !page_text.trim().is_empty() {
70                        pages_text.push((page_num, page_text));
71                    }
72                }
73                Err(e) => {
74                    log::warn!("Failed to extract text from page {}: {}", page_num, e);
75                }
76            }
77        }
78
79        if pages_text.is_empty() {
80            return Err(MemvidError::Pdf("No text extracted from PDF".to_string()));
81        }
82
83        Ok(pages_text)
84    }
85
86    /// Check if a file is a valid PDF
87    pub fn is_pdf<P: AsRef<Path>>(path: P) -> bool {
88        use std::fs::File;
89        use std::io::Read;
90
91        let mut file = match File::open(path) {
92            Ok(file) => file,
93            Err(_) => return false,
94        };
95
96        let mut buffer = [0; 4];
97        match file.read_exact(&mut buffer) {
98            Ok(_) => buffer == b"%PDF"[..],
99            Err(_) => false,
100        }
101    }
102
103    /// Get PDF metadata (page count, title, etc.)
104    pub fn get_metadata<P: AsRef<Path>>(path: P) -> Result<PdfMetadata> {
105        use lopdf::Document;
106
107        let doc = Document::load(path)
108            .map_err(|e| MemvidError::Pdf(format!("Failed to load PDF: {}", e)))?;
109
110        let page_count = doc.get_pages().len() as u32;
111
112        // Try to extract title from document info
113        let title = Self::extract_title(&doc);
114
115        Ok(PdfMetadata { page_count, title })
116    }
117
118    /// Extract title from PDF document
119    fn extract_title(doc: &lopdf::Document) -> Option<String> {
120        // Try to get document info dictionary
121        if let Ok(info_dict) = doc.trailer.get(b"Info") {
122            if let Ok(info_ref) = info_dict.as_reference() {
123                if let Ok(info_obj) = doc.get_object(info_ref) {
124                    if let Ok(info_dict) = info_obj.as_dict() {
125                        // Look for title field
126                        if let Ok(title_obj) = info_dict.get(b"Title") {
127                            if let Ok(title_bytes) = title_obj.as_str() {
128                                if let Ok(title_string) = String::from_utf8(title_bytes.to_vec()) {
129                                    return Some(title_string);
130                                }
131                            }
132                        }
133                    }
134                }
135            }
136        }
137
138        // Fallback: try to extract from first few lines of text
139        let pages = doc.get_pages();
140        if let Some((page_num, _)) = pages.into_iter().next() {
141            if let Ok(text) = doc.extract_text(&[page_num]) {
142                let lines: Vec<&str> = text.lines().take(3).collect();
143                for line in lines {
144                    let trimmed = line.trim();
145                    if trimmed.len() > 10 && trimmed.len() < 200 {
146                        // Likely a title if it's reasonably sized
147                        return Some(trimmed.to_string());
148                    }
149                }
150            }
151        }
152
153        None
154    }
155}
156
/// PDF document metadata
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PdfMetadata {
    /// Number of pages in the PDF
    pub page_count: u32,

    /// Document title (if available)
    pub title: Option<String>,
}
166
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Create a temporary file holding exactly `contents`.
    fn temp_file_with(contents: &[u8]) -> NamedTempFile {
        let mut file = NamedTempFile::new().unwrap();
        file.write_all(contents).unwrap();
        file
    }

    #[test]
    fn test_is_pdf_detection() {
        // A file that opens with the PDF header line is accepted.
        let pdf_like = temp_file_with(b"%PDF-1.4\n");

        assert!(PdfProcessor::is_pdf(pdf_like.path()));
    }

    #[test]
    fn test_non_pdf_detection() {
        // Ordinary text does not carry the PDF signature.
        let plain_text = temp_file_with(b"This is not a PDF\n");

        assert!(!PdfProcessor::is_pdf(plain_text.path()));
    }

    #[test]
    fn test_nonexistent_file() {
        // A path that cannot be opened is reported as "not a PDF".
        assert!(!PdfProcessor::is_pdf("/nonexistent/file.pdf"));
    }
}