1use anyhow::{Result, anyhow};
23use serde::{Deserialize, Serialize};
24use std::path::{Path, PathBuf};
25use tracing::{debug, info, warn};
26
27use crate::utils::file_utils::ensure_dir_exists_sync;
28
29#[derive(Debug, Clone, Serialize, Deserialize)]
31pub struct DocumentProcessorConfig {
32 pub enabled: bool,
34
35 pub image_format: String, pub dpi: u32,
40
41 pub max_pages: usize,
43
44 pub enable_ocr_fallback: bool,
46}
47
48impl Default for DocumentProcessorConfig {
49 fn default() -> Self {
50 Self {
51 enabled: true,
52 image_format: "png".to_string(),
53 dpi: 150, max_pages: 50, enable_ocr_fallback: true,
56 }
57 }
58}
59
60#[derive(Debug, Clone, Serialize, Deserialize)]
62pub struct ProcessedDocument {
63 pub source_path: PathBuf,
65
66 pub doc_type: DocumentType,
68
69 pub page_count: usize,
71
72 pub pages: Vec<PageImage>,
74
75 pub extracted_text: Option<String>,
77
78 pub metadata: DocumentMetadata,
80}
81
82#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
84pub enum DocumentType {
85 Pdf,
86 Docx,
87 Doc,
88 Xlsx,
89 Xls,
90 Csv,
91 Txt,
92 Rtf,
93 Image,
94 Unknown,
95}
96
97impl DocumentType {
98 pub fn from_path(path: &Path) -> Self {
100 match path.extension().and_then(|e| e.to_str()) {
101 Some("pdf") => DocumentType::Pdf,
102 Some("docx") => DocumentType::Docx,
103 Some("doc") => DocumentType::Doc,
104 Some("xlsx") => DocumentType::Xlsx,
105 Some("xls") => DocumentType::Xls,
106 Some("csv") => DocumentType::Csv,
107 Some("txt") => DocumentType::Txt,
108 Some("rtf") => DocumentType::Rtf,
109 Some("png") | Some("jpg") | Some("jpeg") | Some("gif") | Some("bmp") | Some("tiff") => {
110 DocumentType::Image
111 }
112 _ => DocumentType::Unknown,
113 }
114 }
115
116 pub fn supports_vision_processing(&self) -> bool {
118 matches!(
119 self,
120 DocumentType::Pdf
121 | DocumentType::Docx
122 | DocumentType::Doc
123 | DocumentType::Xlsx
124 | DocumentType::Xls
125 | DocumentType::Image
126 )
127 }
128}
129
130#[derive(Debug, Clone, Serialize, Deserialize)]
132pub struct PageImage {
133 pub page_number: usize,
135
136 pub image_path: PathBuf,
138
139 pub dimensions: ImageDimensions,
141
142 pub text_content: Option<String>,
144}
145
146#[derive(Debug, Clone, Serialize, Deserialize)]
148pub struct ImageDimensions {
149 pub width: u32,
150 pub height: u32,
151}
152
153#[derive(Debug, Clone, Serialize, Deserialize)]
155pub struct DocumentMetadata {
156 pub title: Option<String>,
157 pub author: Option<String>,
158 pub created_date: Option<String>,
159 pub modified_date: Option<String>,
160 pub file_size: u64,
161 pub page_count: Option<usize>,
162}
163
164pub struct DocumentProcessor {
166 config: DocumentProcessorConfig,
167 temp_dir: PathBuf,
168}
169
170impl DocumentProcessor {
171 pub fn new(config: DocumentProcessorConfig) -> Result<Self> {
173 let temp_dir = std::env::temp_dir().join("vtcode-document-processor");
174 ensure_dir_exists_sync(&temp_dir)?;
175
176 Ok(Self { config, temp_dir })
177 }
178
179 pub async fn process_document(&self, document_path: &Path) -> Result<ProcessedDocument> {
181 if !self.config.enabled {
182 return Err(anyhow!("Document processing is disabled"));
183 }
184
185 if !document_path.exists() {
186 return Err(anyhow!("Document not found: {}", document_path.display()));
187 }
188
189 let doc_type = DocumentType::from_path(document_path);
190 info!(
191 "Processing document: {} (type: {:?})",
192 document_path.display(),
193 doc_type
194 );
195
196 match doc_type {
197 DocumentType::Pdf => self.process_pdf(document_path).await,
198 DocumentType::Docx | DocumentType::Doc => {
199 self.process_word_document(document_path).await
200 }
201 DocumentType::Xlsx | DocumentType::Xls | DocumentType::Csv => {
202 self.process_spreadsheet(document_path).await
203 }
204 DocumentType::Image => self.process_image(document_path).await,
205 other => {
206 warn!("Unsupported document type: {:?}", other);
207 Err(anyhow!("Unsupported document type: {:?}", other))
208 }
209 }
210 }
211
212 async fn process_pdf(&self, pdf_path: &Path) -> Result<ProcessedDocument> {
214 debug!("Processing PDF: {}", pdf_path.display());
215
216 let metadata = self.extract_file_metadata(pdf_path)?;
223
224 Ok(ProcessedDocument {
225 source_path: pdf_path.to_path_buf(),
226 doc_type: DocumentType::Pdf,
227 page_count: 1, pages: vec![], extracted_text: None, metadata,
231 })
232 }
233
234 async fn process_word_document(&self, doc_path: &Path) -> Result<ProcessedDocument> {
236 debug!("Processing Word document: {}", doc_path.display());
237
238 let metadata = self.extract_file_metadata(doc_path)?;
239
240 Ok(ProcessedDocument {
241 source_path: doc_path.to_path_buf(),
242 doc_type: DocumentType::Docx,
243 page_count: 1, pages: vec![],
245 extracted_text: None,
246 metadata,
247 })
248 }
249
250 async fn process_spreadsheet(&self, spreadsheet_path: &Path) -> Result<ProcessedDocument> {
252 debug!("Processing spreadsheet: {}", spreadsheet_path.display());
253
254 let metadata = self.extract_file_metadata(spreadsheet_path)?;
255 let doc_type = DocumentType::from_path(spreadsheet_path);
256
257 Ok(ProcessedDocument {
258 source_path: spreadsheet_path.to_path_buf(),
259 doc_type,
260 page_count: 1, pages: vec![],
262 extracted_text: None,
263 metadata,
264 })
265 }
266
267 async fn process_image(&self, image_path: &Path) -> Result<ProcessedDocument> {
269 debug!("Processing image: {}", image_path.display());
270
271 let metadata = self.extract_file_metadata(image_path)?;
272
273 Ok(ProcessedDocument {
274 source_path: image_path.to_path_buf(),
275 doc_type: DocumentType::Image,
276 page_count: 1,
277 pages: vec![PageImage {
278 page_number: 1,
279 image_path: image_path.to_path_buf(),
280 dimensions: ImageDimensions {
281 width: 0,
282 height: 0,
283 }, text_content: None,
285 }],
286 extracted_text: None,
287 metadata,
288 })
289 }
290
291 fn extract_file_metadata(&self, path: &Path) -> Result<DocumentMetadata> {
293 let metadata = std::fs::metadata(path)?;
294
295 Ok(DocumentMetadata {
296 title: None,
297 author: None,
298 created_date: None,
299 modified_date: None,
300 file_size: metadata.len(),
301 page_count: None,
302 })
303 }
304
305 pub fn generate_vision_prompt(
307 &self,
308 processed: &ProcessedDocument,
309 query: &str,
310 ) -> Result<String> {
311 let mut prompt = String::new();
312
313 prompt.push_str(&format!("Document: {}\n", processed.source_path.display()));
314 prompt.push_str(&format!("Type: {:?}\n", processed.doc_type));
315 prompt.push_str(&format!("Pages: {}\n\n", processed.page_count));
316
317 if let Some(text) = &processed.extracted_text {
318 prompt.push_str("Extracted Text:\n");
319 prompt.push_str(text);
320 prompt.push_str("\n\n");
321 }
322
323 prompt.push_str("Analyze the document images and provide: ");
324 prompt.push_str("\n1. A summary of the content");
325 prompt.push_str("\n2. Key insights or findings");
326 prompt.push_str("\n3. Answers to specific questions");
327 prompt.push_str(&format!("\n\nSpecific query: {}\n", query));
328
329 Ok(prompt)
330 }
331
332 pub fn cleanup(&self) -> Result<()> {
334 if self.temp_dir.exists() {
335 std::fs::remove_dir_all(&self.temp_dir)?;
336 debug!(
337 "Cleaned up temporary directory: {}",
338 self.temp_dir.display()
339 );
340 }
341 Ok(())
342 }
343}
344
345impl Drop for DocumentProcessor {
346 fn drop(&mut self) {
347 let _ = self.cleanup();
349 }
350}
351
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a processor from the default configuration.
    fn default_processor() -> DocumentProcessor {
        DocumentProcessor::new(DocumentProcessorConfig::default()).unwrap()
    }

    /// Each known extension maps to its variant; anything else is `Unknown`.
    #[test]
    fn test_document_type_detection() {
        let cases = [
            ("test.pdf", DocumentType::Pdf),
            ("test.docx", DocumentType::Docx),
            ("test.xlsx", DocumentType::Xlsx),
            ("test.png", DocumentType::Image),
            ("test.unknown", DocumentType::Unknown),
        ];

        for (file_name, expected) in cases.iter() {
            assert_eq!(&DocumentType::from_path(Path::new(file_name)), expected);
        }
    }

    /// Constructing a processor leaves its scratch directory on disk.
    #[test]
    fn test_document_processor_creation() {
        let processor = default_processor();
        assert!(processor.temp_dir.exists());
    }

    /// A path that does not exist is reported as an error.
    #[tokio::test]
    async fn test_process_nonexistent_document() {
        let processor = default_processor();
        let missing = Path::new("/nonexistent/document.pdf");

        let outcome = processor.process_document(missing).await;
        assert!(outcome.is_err());
    }
}