1use crate::error::{MemvidError, Result};
7use std::path::Path;
8
/// Stateless namespace for PDF helpers: text extraction, format detection
/// and basic metadata lookup.
///
/// All functionality is exposed as associated functions; the type carries no
/// data, so it is trivially copyable and default-constructible.
#[derive(Debug, Clone, Copy, Default)]
pub struct PdfProcessor;
11
12impl PdfProcessor {
13 pub fn extract_text<P: AsRef<Path>>(path: P) -> Result<String> {
15 let path = path.as_ref();
16
17 match pdf_extract::extract_text(path) {
19 Ok(text) => Ok(text),
20 Err(e) => {
21 log::warn!("pdf-extract failed, trying lopdf: {}", e);
22 Self::extract_with_lopdf(path)
23 }
24 }
25 }
26
27 fn extract_with_lopdf<P: AsRef<Path>>(path: P) -> Result<String> {
29 use lopdf::Document;
30
31 let doc = Document::load(path)
32 .map_err(|e| MemvidError::Pdf(format!("Failed to load PDF: {}", e)))?;
33
34 let mut text = String::new();
35 let pages = doc.get_pages();
36
37 for (page_num, _) in pages {
38 match doc.extract_text(&[page_num]) {
39 Ok(page_text) => {
40 text.push_str(&page_text);
41 text.push_str("\n\n");
42 }
43 Err(e) => {
44 log::warn!("Failed to extract text from page {}: {}", page_num, e);
45 }
46 }
47 }
48
49 if text.trim().is_empty() {
50 return Err(MemvidError::Pdf("No text extracted from PDF".to_string()));
51 }
52
53 Ok(text)
54 }
55
56 pub fn extract_text_with_pages<P: AsRef<Path>>(path: P) -> Result<Vec<(u32, String)>> {
58 use lopdf::Document;
59
60 let doc = Document::load(path)
61 .map_err(|e| MemvidError::Pdf(format!("Failed to load PDF: {}", e)))?;
62
63 let mut pages_text = Vec::new();
64 let pages = doc.get_pages();
65
66 for (page_num, _) in pages {
67 match doc.extract_text(&[page_num]) {
68 Ok(page_text) => {
69 if !page_text.trim().is_empty() {
70 pages_text.push((page_num, page_text));
71 }
72 }
73 Err(e) => {
74 log::warn!("Failed to extract text from page {}: {}", page_num, e);
75 }
76 }
77 }
78
79 if pages_text.is_empty() {
80 return Err(MemvidError::Pdf("No text extracted from PDF".to_string()));
81 }
82
83 Ok(pages_text)
84 }
85
86 pub fn is_pdf<P: AsRef<Path>>(path: P) -> bool {
88 use std::fs::File;
89 use std::io::Read;
90
91 let mut file = match File::open(path) {
92 Ok(file) => file,
93 Err(_) => return false,
94 };
95
96 let mut buffer = [0; 4];
97 match file.read_exact(&mut buffer) {
98 Ok(_) => buffer == b"%PDF"[..],
99 Err(_) => false,
100 }
101 }
102
103 pub fn get_metadata<P: AsRef<Path>>(path: P) -> Result<PdfMetadata> {
105 use lopdf::Document;
106
107 let doc = Document::load(path)
108 .map_err(|e| MemvidError::Pdf(format!("Failed to load PDF: {}", e)))?;
109
110 let page_count = doc.get_pages().len() as u32;
111
112 let title = Self::extract_title(&doc);
114
115 Ok(PdfMetadata { page_count, title })
116 }
117
118 fn extract_title(doc: &lopdf::Document) -> Option<String> {
120 if let Ok(info_dict) = doc.trailer.get(b"Info") {
122 if let Ok(info_ref) = info_dict.as_reference() {
123 if let Ok(info_obj) = doc.get_object(info_ref) {
124 if let Ok(info_dict) = info_obj.as_dict() {
125 if let Ok(title_obj) = info_dict.get(b"Title") {
127 if let Ok(title_bytes) = title_obj.as_str() {
128 if let Ok(title_string) = String::from_utf8(title_bytes.to_vec()) {
129 return Some(title_string);
130 }
131 }
132 }
133 }
134 }
135 }
136 }
137
138 let pages = doc.get_pages();
140 if let Some((page_num, _)) = pages.into_iter().next() {
141 if let Ok(text) = doc.extract_text(&[page_num]) {
142 let lines: Vec<&str> = text.lines().take(3).collect();
143 for line in lines {
144 let trimmed = line.trim();
145 if trimmed.len() > 10 && trimmed.len() < 200 {
146 return Some(trimmed.to_string());
148 }
149 }
150 }
151 }
152
153 None
154 }
155}
156
/// Basic metadata describing a PDF document.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct PdfMetadata {
    /// Number of pages in the document.
    pub page_count: u32,

    /// Document title, if one could be determined.
    pub title: Option<String>,
}
166
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Creates a named temporary file containing exactly `contents`.
    fn temp_file_with(contents: &[u8]) -> NamedTempFile {
        let mut file = NamedTempFile::new().unwrap();
        file.write_all(contents).unwrap();
        file
    }

    #[test]
    fn test_is_pdf_detection() {
        let temp_file = temp_file_with(b"%PDF-1.4\n");

        assert!(PdfProcessor::is_pdf(temp_file.path()));
    }

    #[test]
    fn test_non_pdf_detection() {
        let temp_file = temp_file_with(b"This is not a PDF\n");

        assert!(!PdfProcessor::is_pdf(temp_file.path()));
    }

    #[test]
    fn test_nonexistent_file() {
        assert!(!PdfProcessor::is_pdf("/nonexistent/file.pdf"));
    }

    // Edge case: fewer bytes than the 4-byte magic must not be reported as a PDF.
    #[test]
    fn test_file_shorter_than_magic() {
        let temp_file = temp_file_with(b"%PD");

        assert!(!PdfProcessor::is_pdf(temp_file.path()));
    }

    // Edge case: an empty file has no header at all.
    #[test]
    fn test_empty_file() {
        let temp_file = temp_file_with(b"");

        assert!(!PdfProcessor::is_pdf(temp_file.path()));
    }
}