Skip to main content

vtcode_core/skills/
document_processor.rs

1//! Document Processing for Skills using Vision Models
2//!
3//! Implements OpenAI-style document processing by converting PDFs, DOCX, and spreadsheets
4//! to rendered images for vision model analysis. This preserves layout, formatting, and
5//! visual information that would be lost in text extraction.
6//!
7//! ## Supported Formats
8//!
9//! - **PDF**: Multi-page documents converted to page-by-page PNGs
10//! - **DOCX/DOC**: Word documents rendered per-page
11//! - **Spreadsheets**: Excel/CSV files rendered as visual tables
12//! - **Images**: Direct vision model processing
13//!
14//! ## Architecture
15//!
16//! ```text
17//! Document → Renderer → PNG Images → Vision Model → Structured Data
18//! ```
19//!
20//! Inspired by OpenAI's implementation in ChatGPT's Code Interpreter.
21
22use anyhow::{Result, anyhow};
23use serde::{Deserialize, Serialize};
24use std::path::{Path, PathBuf};
25use tracing::{debug, info, warn};
26
27use crate::utils::file_utils::ensure_dir_exists_sync;
28
29/// Document processing configuration
30#[derive(Debug, Clone, Serialize, Deserialize)]
31pub struct DocumentProcessorConfig {
32    /// Enable vision-based document processing
33    pub enabled: bool,
34
35    /// Output format for rendered pages
36    pub image_format: String, // "png" recommended
37
38    /// DPI for rendering (higher = better quality but larger files)
39    pub dpi: u32,
40
41    /// Maximum number of pages to process (prevent runaway)
42    pub max_pages: usize,
43
44    /// Enable OCR fallback for text extraction
45    pub enable_ocr_fallback: bool,
46}
47
48impl Default for DocumentProcessorConfig {
49    fn default() -> Self {
50        Self {
51            enabled: true,
52            image_format: "png".to_string(),
53            dpi: 150,      // Good balance of quality vs file size
54            max_pages: 50, // Reasonable limit for most documents
55            enable_ocr_fallback: true,
56        }
57    }
58}
59
60/// Processed document result
61#[derive(Debug, Clone, Serialize, Deserialize)]
62pub struct ProcessedDocument {
63    /// Original document path
64    pub source_path: PathBuf,
65
66    /// Document type
67    pub doc_type: DocumentType,
68
69    /// Page count
70    pub page_count: usize,
71
72    /// Rendered page images
73    pub pages: Vec<PageImage>,
74
75    /// Extracted text (with layout preservation)
76    pub extracted_text: Option<String>,
77
78    /// Document metadata
79    pub metadata: DocumentMetadata,
80}
81
82/// Document type classification
83#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
84pub enum DocumentType {
85    Pdf,
86    Docx,
87    Doc,
88    Xlsx,
89    Xls,
90    Csv,
91    Txt,
92    Rtf,
93    Image,
94    Unknown,
95}
96
97impl DocumentType {
98    /// Detect document type from file extension
99    pub fn from_path(path: &Path) -> Self {
100        match path.extension().and_then(|e| e.to_str()) {
101            Some("pdf") => DocumentType::Pdf,
102            Some("docx") => DocumentType::Docx,
103            Some("doc") => DocumentType::Doc,
104            Some("xlsx") => DocumentType::Xlsx,
105            Some("xls") => DocumentType::Xls,
106            Some("csv") => DocumentType::Csv,
107            Some("txt") => DocumentType::Txt,
108            Some("rtf") => DocumentType::Rtf,
109            Some("png") | Some("jpg") | Some("jpeg") | Some("gif") | Some("bmp") | Some("tiff") => {
110                DocumentType::Image
111            }
112            _ => DocumentType::Unknown,
113        }
114    }
115
116    /// Check if this document type is supported for vision processing
117    pub fn supports_vision_processing(&self) -> bool {
118        matches!(
119            self,
120            DocumentType::Pdf
121                | DocumentType::Docx
122                | DocumentType::Doc
123                | DocumentType::Xlsx
124                | DocumentType::Xls
125                | DocumentType::Image
126        )
127    }
128}
129
130/// Single page image data
131#[derive(Debug, Clone, Serialize, Deserialize)]
132pub struct PageImage {
133    /// Page number (1-indexed)
134    pub page_number: usize,
135
136    /// Image file path
137    pub image_path: PathBuf,
138
139    /// Image dimensions
140    pub dimensions: ImageDimensions,
141
142    /// Page text content (if OCR enabled)
143    pub text_content: Option<String>,
144}
145
146/// Image dimensions
147#[derive(Debug, Clone, Serialize, Deserialize)]
148pub struct ImageDimensions {
149    pub width: u32,
150    pub height: u32,
151}
152
153/// Document metadata
154#[derive(Debug, Clone, Serialize, Deserialize)]
155pub struct DocumentMetadata {
156    pub title: Option<String>,
157    pub author: Option<String>,
158    pub created_date: Option<String>,
159    pub modified_date: Option<String>,
160    pub file_size: u64,
161    pub page_count: Option<usize>,
162}
163
164/// Main document processor
165pub struct DocumentProcessor {
166    config: DocumentProcessorConfig,
167    temp_dir: PathBuf,
168}
169
170impl DocumentProcessor {
171    /// Create new document processor
172    pub fn new(config: DocumentProcessorConfig) -> Result<Self> {
173        let temp_dir = std::env::temp_dir().join("vtcode-document-processor");
174        ensure_dir_exists_sync(&temp_dir)?;
175
176        Ok(Self { config, temp_dir })
177    }
178
179    /// Process a document for vision model analysis
180    pub async fn process_document(&self, document_path: &Path) -> Result<ProcessedDocument> {
181        if !self.config.enabled {
182            return Err(anyhow!("Document processing is disabled"));
183        }
184
185        if !document_path.exists() {
186            return Err(anyhow!("Document not found: {}", document_path.display()));
187        }
188
189        let doc_type = DocumentType::from_path(document_path);
190        info!(
191            "Processing document: {} (type: {:?})",
192            document_path.display(),
193            doc_type
194        );
195
196        match doc_type {
197            DocumentType::Pdf => self.process_pdf(document_path).await,
198            DocumentType::Docx | DocumentType::Doc => {
199                self.process_word_document(document_path).await
200            }
201            DocumentType::Xlsx | DocumentType::Xls | DocumentType::Csv => {
202                self.process_spreadsheet(document_path).await
203            }
204            DocumentType::Image => self.process_image(document_path).await,
205            other => {
206                warn!("Unsupported document type: {:?}", other);
207                Err(anyhow!("Unsupported document type: {:?}", other))
208            }
209        }
210    }
211
212    /// Process PDF document
213    async fn process_pdf(&self, pdf_path: &Path) -> Result<ProcessedDocument> {
214        debug!("Processing PDF: {}", pdf_path.display());
215
216        // For now, return a placeholder implementation
217        // In a full implementation, this would:
218        // 1. Use a PDF rendering library to convert pages to images
219        // 2. Optionally run OCR on each page
220        // 3. Extract metadata
221
222        let metadata = self.extract_file_metadata(pdf_path)?;
223
224        Ok(ProcessedDocument {
225            source_path: pdf_path.to_path_buf(),
226            doc_type: DocumentType::Pdf,
227            page_count: 1,        // Placeholder
228            pages: vec![],        // Placeholder - would contain actual rendered pages
229            extracted_text: None, // Placeholder - would contain OCR text if enabled
230            metadata,
231        })
232    }
233
234    /// Process Word document
235    async fn process_word_document(&self, doc_path: &Path) -> Result<ProcessedDocument> {
236        debug!("Processing Word document: {}", doc_path.display());
237
238        let metadata = self.extract_file_metadata(doc_path)?;
239
240        Ok(ProcessedDocument {
241            source_path: doc_path.to_path_buf(),
242            doc_type: DocumentType::Docx,
243            page_count: 1, // Placeholder
244            pages: vec![],
245            extracted_text: None,
246            metadata,
247        })
248    }
249
250    /// Process spreadsheet
251    async fn process_spreadsheet(&self, spreadsheet_path: &Path) -> Result<ProcessedDocument> {
252        debug!("Processing spreadsheet: {}", spreadsheet_path.display());
253
254        let metadata = self.extract_file_metadata(spreadsheet_path)?;
255        let doc_type = DocumentType::from_path(spreadsheet_path);
256
257        Ok(ProcessedDocument {
258            source_path: spreadsheet_path.to_path_buf(),
259            doc_type,
260            page_count: 1, // Spreadsheets are typically single "sheet"
261            pages: vec![],
262            extracted_text: None,
263            metadata,
264        })
265    }
266
267    /// Process image file
268    async fn process_image(&self, image_path: &Path) -> Result<ProcessedDocument> {
269        debug!("Processing image: {}", image_path.display());
270
271        let metadata = self.extract_file_metadata(image_path)?;
272
273        Ok(ProcessedDocument {
274            source_path: image_path.to_path_buf(),
275            doc_type: DocumentType::Image,
276            page_count: 1,
277            pages: vec![PageImage {
278                page_number: 1,
279                image_path: image_path.to_path_buf(),
280                dimensions: ImageDimensions {
281                    width: 0,
282                    height: 0,
283                }, // Would detect actual dimensions
284                text_content: None,
285            }],
286            extracted_text: None,
287            metadata,
288        })
289    }
290
291    /// Extract basic file metadata
292    fn extract_file_metadata(&self, path: &Path) -> Result<DocumentMetadata> {
293        let metadata = std::fs::metadata(path)?;
294
295        Ok(DocumentMetadata {
296            title: None,
297            author: None,
298            created_date: None,
299            modified_date: None,
300            file_size: metadata.len(),
301            page_count: None,
302        })
303    }
304
305    /// Generate a prompt for vision model analysis
306    pub fn generate_vision_prompt(
307        &self,
308        processed: &ProcessedDocument,
309        query: &str,
310    ) -> Result<String> {
311        let mut prompt = String::new();
312
313        prompt.push_str(&format!("Document: {}\n", processed.source_path.display()));
314        prompt.push_str(&format!("Type: {:?}\n", processed.doc_type));
315        prompt.push_str(&format!("Pages: {}\n\n", processed.page_count));
316
317        if let Some(text) = &processed.extracted_text {
318            prompt.push_str("Extracted Text:\n");
319            prompt.push_str(text);
320            prompt.push_str("\n\n");
321        }
322
323        prompt.push_str("Analyze the document images and provide: ");
324        prompt.push_str("\n1. A summary of the content");
325        prompt.push_str("\n2. Key insights or findings");
326        prompt.push_str("\n3. Answers to specific questions");
327        prompt.push_str(&format!("\n\nSpecific query: {}\n", query));
328
329        Ok(prompt)
330    }
331
332    /// Clean up temporary files
333    pub fn cleanup(&self) -> Result<()> {
334        if self.temp_dir.exists() {
335            std::fs::remove_dir_all(&self.temp_dir)?;
336            debug!(
337                "Cleaned up temporary directory: {}",
338                self.temp_dir.display()
339            );
340        }
341        Ok(())
342    }
343}
344
345impl Drop for DocumentProcessor {
346    fn drop(&mut self) {
347        // Attempt to clean up on drop
348        let _ = self.cleanup();
349    }
350}
351
#[cfg(test)]
mod tests {
    use super::*;

    /// Known extensions map to their variants; unrecognized ones fall back to `Unknown`.
    #[test]
    fn test_document_type_detection() {
        let cases = [
            ("test.pdf", DocumentType::Pdf),
            ("test.docx", DocumentType::Docx),
            ("test.xlsx", DocumentType::Xlsx),
            ("test.png", DocumentType::Image),
            ("test.unknown", DocumentType::Unknown),
        ];

        for (file_name, expected) in cases {
            assert_eq!(DocumentType::from_path(Path::new(file_name)), expected);
        }
    }

    /// Constructing a processor leaves its temporary directory in place.
    #[test]
    fn test_document_processor_creation() {
        let processor = DocumentProcessor::new(DocumentProcessorConfig::default()).unwrap();
        assert!(processor.temp_dir.exists());
    }

    /// Processing a path that does not exist must fail.
    #[tokio::test]
    async fn test_process_nonexistent_document() {
        let processor = DocumentProcessor::new(DocumentProcessorConfig::default()).unwrap();
        let missing = Path::new("/nonexistent/document.pdf");

        let outcome = processor.process_document(missing).await;
        assert!(outcome.is_err());
    }
}