offline_intelligence/utils/
file_processor.rs

1use std::path::Path;
2use std::fs;
3use std::io::{Read, Cursor};
4use tracing::{debug, info};
5use anyhow::Result;
6
7// macOS: Core Graphics PDF C API — always available on macOS regardless of chip
8// (CoreGraphics framework is already linked transitively by the core-graphics crate)
9#[cfg(target_os = "macos")]
10use core_graphics::geometry::CGRect;
11
// Raw bindings to the Core Graphics C functions used by `macos_ocr_pdf` to open a
// PDF, query its pages and draw a page into a bitmap context. Every handle is
// passed as an untyped `*mut c_void`; callers are responsible for handing in
// pointers of the right underlying kind.
#[cfg(target_os = "macos")]
extern "C" {
    /// Creates a PDF document handle from a data-provider handle.
    /// The caller in this file treats a null return as failure.
    fn CGPDFDocumentCreateWithProvider(provider: *mut std::ffi::c_void) -> *mut std::ffi::c_void;
    /// Returns the number of pages in `document`.
    fn CGPDFDocumentGetNumberOfPages(document: *mut std::ffi::c_void) -> usize;
    /// Returns a page handle; the caller in this file passes 1-based indices and
    /// skips pages for which null is returned.
    fn CGPDFDocumentGetPage(document: *mut std::ffi::c_void, page_index: usize) -> *mut std::ffi::c_void;
    /// Returns one of the page's boxes as a rectangle. This file passes `0`, which
    /// its comments identify as `kCGPDFMediaBox`.
    fn CGPDFPageGetBoxRect(page: *mut std::ffi::c_void, box_type: i32) -> CGRect;
    /// Draws `page` into the graphics context `context`.
    fn CGContextDrawPDFPage(context: *mut std::ffi::c_void, page: *mut std::ffi::c_void);
    /// Scales the context's current transformation matrix by (`sx`, `sy`).
    fn CGContextScaleCTM(context: *mut std::ffi::c_void, sx: f64, sy: f64);
    /// Translates the context's current transformation matrix by (`tx`, `ty`).
    fn CGContextTranslateCTM(context: *mut std::ffi::c_void, tx: f64, ty: f64);
    /// Releases a document handle obtained from `CGPDFDocumentCreateWithProvider`.
    fn CGPDFDocumentRelease(document: *mut std::ffi::c_void);
    /// Releases a page handle.
    /// NOTE(review): confirm against Apple's ownership rules whether pages obtained
    /// via `CGPDFDocumentGetPage` are owned by the caller and may be released here.
    fn CGPDFPageRelease(page: *mut std::ffi::c_void);
}
24
/// Reports whether `s` is one of the bracketed failure messages that
/// `file_processor` emits in place of real extracted content.
///
/// Every sentinel begins with `[` followed by one of a fixed set of words; the
/// check is a plain prefix comparison against that set.
/// Used by both `stream_api` (cache-hit guard) and `attachment_api` (preprocess guard).
pub fn is_extraction_sentinel(s: &str) -> bool {
    // Opening words shared by every sentinel message produced in this module.
    const SENTINEL_PREFIXES: [&str; 6] = [
        "[Could not",
        "[PDF",
        "[DOCX",
        "[Spreadsheet",
        "[Presentation",
        "[ODT",
    ];

    SENTINEL_PREFIXES
        .iter()
        .any(|&prefix| s.starts_with(prefix))
}
36
/// Cheap token-count approximation: one token per four bytes of UTF-8 text,
/// rounded up.
///
/// The length used is `str::len`, i.e. bytes rather than Unicode scalar values.
pub fn estimate_tokens(text: &str) -> usize {
    let byte_len = text.len();
    let full_groups = byte_len / 4;
    // A trailing partial group of 1–3 bytes still counts as one token.
    if byte_len % 4 == 0 {
        full_groups
    } else {
        full_groups + 1
    }
}
41
/// Shorten `text` to at most `max_tokens` tokens (four bytes per token).
///
/// When the text already fits it is returned unchanged with `false`. Otherwise
/// the text is cut at the largest UTF-8 character boundary inside the budget and
/// then pulled back to the last newline in that prefix (the newline itself is
/// dropped) so a line is never cut in half. If the prefix contains no newline the
/// whole prefix is kept.
///
/// Returns `(truncated_text, was_truncated)`.
pub fn truncate_to_budget(text: &str, max_tokens: usize) -> (String, bool) {
    let byte_budget = max_tokens.saturating_mul(4);
    if byte_budget >= text.len() {
        return (text.to_owned(), false);
    }

    // Walk backwards from the budget to the nearest char boundary. Index 0 is
    // always a boundary, so the search cannot come up empty.
    let boundary = (0..=byte_budget)
        .rev()
        .find(|&idx| text.is_char_boundary(idx))
        .unwrap_or(0);
    let head = &text[..boundary];

    // Prefer ending on a complete line.
    let kept = match head.rfind('\n') {
        Some(newline_at) => &head[..newline_at],
        None => head,
    };
    (kept.to_owned(), true)
}
61
62/// Extract text content from various file formats
63pub async fn extract_file_content(file_path: &Path) -> Result<String> {
64    let file_ext = file_path.extension()
65        .and_then(|ext| ext.to_str())
66        .map(|ext| ext.to_lowercase())
67        .unwrap_or_default();
68
69    match file_ext.as_str() {
70        // Text files
71        "txt" | "md" | "json" | "yaml" | "yml" | "xml" | "csv" | "log" => {
72            extract_text_file(file_path).await
73        },
74        // Code files
75        "js" | "ts" | "jsx" | "tsx" | "py" | "java" | "cpp" | "c" | "cs" | 
76        "html" | "css" | "scss" | "go" | "rs" | "php" | "rb" | "swift" | 
77        "kt" | "scala" | "sql" | "sh" | "bat" | "ps1" | "dockerfile" | "env" => {
78            extract_text_file(file_path).await
79        },
80        // Document files
81        "pdf" => extract_pdf_content(file_path).await,
82        "doc" | "docx" => extract_docx_content(file_path).await,
83        "rtf" => extract_text_file(file_path).await,
84        "odt" => extract_odt_content(file_path).await,
85        // Spreadsheet files
86        "xls" | "xlsx" | "ods" => extract_xlsx_content(file_path).await,
87        // Presentation files
88        "ppt" | "pptx" | "odp" => extract_pptx_content(file_path).await,
89        // Default to text extraction
90        _ => {
91            debug!("Unknown file type {}, attempting text extraction", file_ext);
92            extract_text_file(file_path).await
93        }
94    }
95}
96
97/// Extract text from bytes with file extension
98pub async fn extract_content_from_bytes(bytes: &[u8], filename: &str) -> Result<String> {
99    let ext = filename.split('.').last().unwrap_or("").to_lowercase();
100
101    match ext.as_str() {
102        // Text/code files - try UTF-8 decoding
103        "txt" | "md" | "json" | "yaml" | "yml" | "xml" | "csv" | "log" |
104        "js" | "ts" | "jsx" | "tsx" | "py" | "java" | "cpp" | "c" | "cs" |
105        "html" | "css" | "scss" | "go" | "rs" | "php" | "rb" | "swift" |
106        "kt" | "scala" | "sql" | "sh" | "bat" | "ps1" | "dockerfile" | "env" | "rtf" => {
107            Ok(String::from_utf8_lossy(bytes).to_string())
108        },
109        // PDF files — run OCR on a blocking thread to avoid starving the async runtime
110        "pdf" => {
111            let bytes_owned = bytes.to_vec();
112            let text = tokio::task::spawn_blocking(move || extract_pdf_from_bytes(&bytes_owned))
113                .await
114                .unwrap_or_else(|_| "[PDF extraction panicked]".to_string());
115            Ok(text)
116        },
117        // Word documents
118        "doc" | "docx" => Ok(extract_docx_from_bytes(bytes)),
119        // OpenDocument text
120        "odt" => Ok(extract_odt_from_bytes(bytes)),
121        // Spreadsheets
122        "xls" | "xlsx" | "ods" => Ok(extract_xlsx_from_bytes(bytes, &ext)),
123        // Presentations
124        "ppt" | "pptx" | "odp" => Ok(extract_pptx_from_bytes(bytes)),
125        // Default - try text
126        _ => {
127            debug!("Unknown file type {}, attempting text extraction", ext);
128            Ok(String::from_utf8_lossy(bytes).to_string())
129        }
130    }
131}
132
133/// Extract content from text-based files
134async fn extract_text_file(file_path: &Path) -> Result<String> {
135    let content = fs::read_to_string(file_path)?;
136    Ok(content)
137}
138
139/// Extract content from PDF files
140async fn extract_pdf_content(file_path: &Path) -> Result<String> {
141    let bytes = fs::read(file_path)?;
142    // Run OCR (blocking WinRT/Vision calls) on a dedicated blocking thread
143    let text = tokio::task::spawn_blocking(move || extract_pdf_from_bytes(&bytes))
144        .await
145        .unwrap_or_else(|_| "[PDF extraction panicked]".to_string());
146    Ok(text)
147}
148
149/// Try to extract the embedded text layer from a PDF using pure Rust (no OCR required).
150///
151/// Works for text-based PDFs produced by Word, Google Docs, LibreOffice, LaTeX, etc.
152/// Returns `None` for scanned / image-only PDFs (no text layer) or on parse failure.
153///
154/// This is cross-platform and avoids the OS-specific OCR engines entirely for the
155/// majority of PDFs that users actually attach (digital documents, not scans).
156fn extract_pdf_text_layer(bytes: &[u8]) -> Option<String> {
157    let doc = lopdf::Document::load_mem(bytes).ok()?;
158    let page_count = doc.get_pages().len();
159    if page_count == 0 {
160        return None;
161    }
162
163    // Primary: extract all pages at once (fast path)
164    let page_numbers: Vec<u32> = (1..=page_count as u32).collect();
165    let full_text = doc.extract_text(&page_numbers).ok();
166
167    // Fallback: if full extraction fails or returns empty, try page-by-page
168    // (handles some PDFs where certain pages fail to decode as a batch)
169    let text = match full_text {
170        Some(ref t) if !t.trim().is_empty() => t.clone(),
171        _ => {
172            debug!("Full-document lopdf extraction returned empty — trying page-by-page");
173            let mut page_text = String::new();
174            for page_num in 1..=page_count as u32 {
175                if let Ok(t) = doc.extract_text(&[page_num]) {
176                    page_text.push_str(t.trim());
177                    page_text.push('\n');
178                }
179            }
180            page_text
181        }
182    };
183
184    let trimmed = text.trim().to_string();
185    if trimmed.is_empty() {
186        return None;
187    }
188
189    // Sanity-check: if fewer than 40% of characters are printable the text layer
190    // is likely garbage from a non-standard font encoding — fall back to OCR.
191    // Threshold relaxed from 60% → 40% to handle technical PDFs with many
192    // non-ASCII symbols (math formulae, source code with special chars, etc.).
193    let total = trimmed.chars().count();
194    if total > 0 {
195        let printable = trimmed
196            .chars()
197            .filter(|c| !c.is_control() || matches!(*c, '\n' | '\r' | '\t'))
198            .count();
199        if printable * 100 / total < 40 {
200            info!(
201                "PDF text layer looks garbled ({}/{} printable chars) — will try OCR fallback",
202                printable, total
203            );
204            return None;
205        }
206    }
207
208    info!("PDF text layer extracted ({} chars, {} pages) — no OCR needed", trimmed.len(), page_count);
209    Some(trimmed)
210}
211
212fn extract_pdf_from_bytes(bytes: &[u8]) -> String {
213    // ── 1. Fast path: pure Rust text-layer extraction ────────────────────────
214    // Works for text-based PDFs (Word/Google Docs exports, LaTeX, etc.).
215    // Cross-platform — no OS OCR engine required.
216    if let Some(text) = extract_pdf_text_layer(bytes) {
217        return text;
218    }
219
220    info!("PDF has no extractable text layer — attempting OS-native OCR");
221
222    // ── 2. Slow path: OS-native OCR (for scanned / image-based PDFs) ─────────
223    #[cfg(target_os = "windows")]
224    {
225        match windows_ocr_pdf(bytes) {
226            Some(text) if !text.trim().is_empty() => {
227                info!("PDF extracted via Windows OCR ({} chars)", text.len());
228                return text;
229            }
230            Some(_) => {
231                info!("Windows OCR returned empty text — PDF may be purely image-based");
232            }
233            None => {
234                info!("Windows OCR unavailable or failed — PDF may be encrypted or corrupted");
235            }
236        }
237    }
238
239    #[cfg(target_os = "macos")]
240    {
241        match macos_ocr_pdf(bytes) {
242            Some(text) if !text.trim().is_empty() => {
243                info!("PDF extracted via macOS Vision OCR ({} chars)", text.len());
244                return text;
245            }
246            Some(_) => {
247                info!("macOS OCR returned empty text — PDF may be purely image-based");
248            }
249            None => {
250                info!("macOS OCR unavailable or failed — PDF may be encrypted or corrupted");
251            }
252        }
253    }
254
255    // All strategies exhausted
256    "[PDF extraction failed. This file appears to be scanned, encrypted, or corrupted. \
257Please try: 1) Save as text-based PDF, 2) Use DOCX format, or 3) Paste text directly]".to_string()
258}
259
260// ── Windows OCR ──────────────────────────────────────────────────────────────
261
/// Initialise WinRT on the calling thread (once per thread, idempotent).
/// Called before any WinRT API use to ensure the thread has a COM apartment.
///
/// The work happens inside the initializer of a `thread_local!` static, so it
/// runs the first time this function is called on a given thread and is skipped
/// on every later call from that thread. The result of `RoInitialize` is
/// deliberately discarded.
#[cfg(target_os = "windows")]
fn ensure_winrt_init() {
    thread_local! {
        // Unit-typed marker: only the side effect of its initializer matters.
        static INIT: () = {
            unsafe {
                // S_OK = 0 (fresh init), S_FALSE = 1 (already init on this thread),
                // RPC_E_CHANGED_MODE = STA thread — all are safe to ignore.
                let _ = windows::Win32::System::WinRT::RoInitialize(
                    windows::Win32::System::WinRT::RO_INIT_MULTITHREADED,
                );
            }
        };
    }
    // Touch the thread-local so its initializer runs if it has not yet.
    INIT.with(|_| ());
}
279
/// Render each PDF page with Windows.Data.Pdf and run Windows.Media.Ocr on it.
/// Returns `None` when the engine is unavailable or no text is found.
///
/// All WinRT work is done inside the `run` closure so that `?` can be used on
/// every call; any error it returns is logged and mapped to `None`. Text from
/// the individual pages is concatenated with a newline after each non-empty
/// page. A recognition error on a single page is logged and that page is
/// skipped; errors from the other per-page steps (rendering, decoding,
/// conversion) abort the whole run.
#[cfg(target_os = "windows")]
fn windows_ocr_pdf(bytes: &[u8]) -> Option<String> {
    use windows::{
        core::*,
        Data::Pdf::PdfDocument,
        Graphics::Imaging::{BitmapDecoder, BitmapPixelFormat, SoftwareBitmap},
        Media::Ocr::OcrEngine,
        Storage::Streams::{DataWriter, IOutputStream, IRandomAccessStream, InMemoryRandomAccessStream},
    };

    info!("Starting Windows OCR for PDF ({} bytes)", bytes.len());

    // Make sure this thread has been initialised for WinRT before any API call.
    // Note: the log line below is emitted unconditionally; `ensure_winrt_init`
    // does not report failures.
    ensure_winrt_init();
    info!("WinRT initialized successfully");

    let run = || -> windows::core::Result<String> {
        // ── 1. Write PDF bytes into an in-memory random-access stream ─────────
        info!("Creating in-memory PDF stream");
        let pdf_stream = InMemoryRandomAccessStream::new()?;
        {
            // Copy the bytes into a WinRT buffer, then write and flush that
            // buffer through the stream's output interface.
            let writer = DataWriter::new()?;
            writer.WriteBytes(bytes)?;
            let buffer = writer.DetachBuffer()?;
            let out: IOutputStream = pdf_stream.cast()?;
            out.WriteAsync(&buffer)?.get()?;
            out.FlushAsync()?.get()?;
        }
        // Rewind so the PDF loader reads from the start of the data.
        pdf_stream.Seek(0)?;
        info!("PDF stream created successfully");

        // ── 2. Load the PDF document ──────────────────────────────────────────
        info!("Loading PDF document");
        let pdf_doc = PdfDocument::LoadFromStreamAsync(&pdf_stream)?.get()?;
        let page_count = pdf_doc.PageCount()?;
        info!("PDF loaded, {} pages", page_count);

        // Nothing to OCR: report success with empty text (mapped to `None` below).
        if page_count == 0 {
            return Ok(String::new());
        }

        // ── 3. Create OCR engine (uses the user's Windows language profile) ───
        info!("Creating OCR engine");
        let ocr_engine = OcrEngine::TryCreateFromUserProfileLanguages()?;
        info!("OCR engine created successfully");

        // ── 4. Render each page → PNG → SoftwareBitmap → OCR ─────────────────
        let mut all_text = String::new();

        // Page indices passed to `GetPage` are 0-based; only the first progress
        // message below shows the 1-based number.
        for page_idx in 0..page_count {
            info!("Processing page {}/{}", page_idx + 1, page_count);
            let page = pdf_doc.GetPage(page_idx)?;

            // Render page to PNG in memory
            let img_stream = InMemoryRandomAccessStream::new()?;
            let img_iras: IRandomAccessStream = img_stream.cast()?;
            page.RenderToStreamAsync(&img_iras)?.get()?;
            // Rewind before handing the rendered image to the decoder.
            img_stream.Seek(0)?;
            info!("Page {} rendered to stream", page_idx);

            // Decode PNG to SoftwareBitmap (auto-detects format — no codec ID needed in 0.52)
            let decoder = BitmapDecoder::CreateAsync(&img_iras)?.get()?;
            let bitmap = decoder.GetSoftwareBitmapAsync()?.get()?;

            // OcrEngine requires Bgra8 pixel format
            let bitmap = if bitmap.BitmapPixelFormat()? != BitmapPixelFormat::Bgra8 {
                SoftwareBitmap::Convert(&bitmap, BitmapPixelFormat::Bgra8)?
            } else {
                bitmap
            };

            // Recognise text on this page
            match ocr_engine.RecognizeAsync(&bitmap)?.get() {
                Ok(result) => {
                    let text = result.Text()?.to_string();
                    // Pages with only whitespace contribute nothing.
                    if !text.trim().is_empty() {
                        all_text.push_str(&text);
                        all_text.push('\n');
                        info!("Extracted {} chars from page {}", text.len(), page_idx);
                    }
                }
                // A failed recognition on one page does not abort the others.
                Err(e) => info!("OCR page {} error: {}", page_idx, e),
            }
        }

        info!("Windows OCR complete, total chars: {}", all_text.len());
        Ok(all_text)
    };

    // Collapse "error" and "no text" into `None`; only real text is `Some`.
    match run() {
        Ok(text) if !text.trim().is_empty() => Some(text),
        Ok(_) => {
            info!("Windows OCR: no text found in PDF");
            None
        }
        Err(e) => {
            info!("Windows OCR failed: {}", e);
            None
        }
    }
}
382
383// ── macOS OCR ──────────────────────────────────────────────────────────────
384
/// Render each PDF page with Core Graphics and run Vision OCR on the result.
/// Works on both Apple Silicon (ARM64, uses Neural Engine) and Intel (x86_64, uses CPU).
/// Returns `None` when the PDF is empty, unreadable, or yields no text.
///
/// Each page is rasterised at 150 DPI into an RGBX bitmap on a white background,
/// encoded as PNG, and handed to a `VNRecognizeTextRequest`. The best candidate
/// string of every observation is appended to the result, one per line.
#[cfg(target_os = "macos")]
fn macos_ocr_pdf(bytes: &[u8]) -> Option<String> {
    use std::ffi::c_void;
    use std::sync::Arc;
    use core_graphics::{
        color_space::CGColorSpace,
        context::CGContext,
        data_provider::CGDataProvider,
    };
    use objc2::rc::Retained;
    use objc2_foundation::{NSArray, NSData, NSDictionary, NSString};
    use objc2_vision::{
        VNImageRequestHandler, VNRecognizeTextRequest,
        VNRequest, VNRequestTextRecognitionLevel,
    };

    // kCGImageAlphaNoneSkipLast (5) — RGBX pixel format, ignore the 4th byte as alpha.
    // (4 is kCGImageAlphaFirst, i.e. non-premultiplied ARGB, which bitmap contexts
    // do not support.) Using a plain u32 avoids depending on a specific
    // CGBitmapInfo constant path.
    const BITMAP_INFO: u32 = 5;

    /// Owns a CGPDFDocument handle and releases it on drop, so the document is
    /// freed on every exit path — including early returns through `?`.
    struct PdfDocumentGuard(*mut std::ffi::c_void);

    impl Drop for PdfDocumentGuard {
        fn drop(&mut self) {
            // SAFETY: the guard is only ever built from a non-null pointer returned
            // by CGPDFDocumentCreateWithProvider, which the caller owns (Create rule),
            // and it is released exactly once, here.
            unsafe { CGPDFDocumentRelease(self.0) };
        }
    }

    let run = || -> Result<String, String> {
        // ── 1. Load PDF bytes into a CGPDFDocument ───────────────────────────
        //
        // CGDataProvider::from_buffer keeps the Arc alive for the provider's lifetime,
        // so the underlying bytes are valid for as long as we need the document.
        let pdf_data: Arc<Vec<u8>> = Arc::new(bytes.to_vec());
        let provider = CGDataProvider::from_buffer(pdf_data);

        let doc = unsafe {
            CGPDFDocumentCreateWithProvider(provider.as_ptr() as *mut c_void)
        };
        if doc.is_null() {
            return Err("CGPDFDocumentCreateWithProvider returned null".into());
        }
        // From here on the document is released automatically when `_doc_guard` drops.
        let _doc_guard = PdfDocumentGuard(doc);

        let page_count = unsafe { CGPDFDocumentGetNumberOfPages(doc) };
        if page_count == 0 {
            return Ok(String::new());
        }

        info!("macOS PDF OCR: {} page(s)", page_count);
        let mut all_text = String::new();

        // ── 2. Per-page: render → PNG → Vision OCR ──────────────────────────
        //
        // CGPDFDocument uses 1-based page indexing.
        for page_idx in 1..=page_count {
            // The page is borrowed from the document (Get rule): it stays valid
            // while the document is alive and must NOT be released by us.
            let page = unsafe { CGPDFDocumentGetPage(doc, page_idx) };
            if page.is_null() {
                continue;
            }

            // PDF page dimensions are in points (72 pt = 1 inch).
            // kCGPDFMediaBox = 0 — the full physical page rectangle.
            let media_box = unsafe { CGPDFPageGetBoxRect(page, 0) };
            let pt_w = media_box.size.width;
            let pt_h = media_box.size.height;

            // Scale to 150 DPI: good balance between OCR accuracy and memory.
            let scale = 150.0_f64 / 72.0;
            let px_w = ((pt_w * scale).ceil() as usize).max(1);
            let px_h = ((pt_h * scale).ceil() as usize).max(1);
            let bytes_per_row = px_w * 4; // 4 bytes/pixel (RGBX)

            // Allocate a white pixel buffer — pages with transparency get a white bg.
            let mut pixel_buf = vec![255u8; bytes_per_row * px_h];

            let color_space = CGColorSpace::create_device_rgb();
            let ctx = unsafe {
                CGContext::create_bitmap_context(
                    Some(pixel_buf.as_mut_ptr() as *mut c_void),
                    px_w,
                    px_h,
                    8,             // bits per component
                    bytes_per_row,
                    &color_space,
                    BITMAP_INFO,
                )
            };
            let ctx_ptr = ctx.as_ptr() as *mut c_void;

            // A bitmap context shares PDF's bottom-left origin, and its backing
            // buffer stores the top scanline first. Drawing without a Y-flip
            // therefore already yields an upright, top-down image for the PNG
            // encoder; flipping here would mirror the page vertically.
            // Scale points → pixels, then shift so the media box's lower-left
            // corner lands on the context origin (a no-op for the usual 0,0 origin).
            unsafe {
                CGContextScaleCTM(ctx_ptr, scale, scale);
                CGContextTranslateCTM(ctx_ptr, -media_box.origin.x, -media_box.origin.y);
                CGContextDrawPDFPage(ctx_ptr, page);
            }

            // Drop the context to flush any deferred drawing before reading pixel_buf.
            drop(ctx);

            // ── 3. Encode rendered pixels to PNG ──────────────────────────────
            //
            // Vision's initWithData:options: accepts any image format that NSImage
            // can decode (PNG, JPEG, TIFF, …).  PNG is lossless and zero-dependency.
            let mut png_bytes: Vec<u8> = Vec::new();
            {
                let mut enc = png::Encoder::new(&mut png_bytes, px_w as u32, px_h as u32);
                enc.set_color(png::ColorType::Rgba);
                enc.set_depth(png::BitDepth::Eight);
                enc.write_header()
                    .and_then(|mut w| w.write_image_data(&pixel_buf))
                    .map_err(|e| format!("PNG encode failed on page {page_idx}: {e}"))?;
            }

            info!(
                "Page {page_idx} rendered {px_w}×{px_h} px ({} PNG bytes)",
                png_bytes.len()
            );

            // ── 4. Vision OCR ──────────────────────────────────────────────────
            unsafe {
                let ns_data = NSData::with_bytes(&png_bytes);
                let options = NSDictionary::<NSString, objc2::runtime::AnyObject>::new();

                let handler = VNImageRequestHandler::initWithData_options(
                    VNImageRequestHandler::alloc(),
                    &ns_data,
                    &options,
                );

                let request =
                    VNRecognizeTextRequest::init(VNRecognizeTextRequest::alloc());

                // Accurate mode uses the Neural Engine on Apple Silicon;
                // falls back to CPU on Intel — both handled transparently by Vision.
                request.setRecognitionLevel(VNRequestTextRecognitionLevel::Accurate);
                request.setUsesLanguageCorrection(true);

                // performRequests:error: expects NSArray<VNRequest>.
                // VNRecognizeTextRequest IS-A VNRequest via Objective-C inheritance.
                // Rust deref coercion traverses: Retained<VNRecognizeTextRequest>
                //   → VNRecognizeTextRequest → VNImageBasedRequest → VNRequest
                let req_as_base: &VNRequest = &*request;
                let req_array = NSArray::from_slice(&[req_as_base]);

                // Ignore the return value; if it fails, results() will be None.
                let _ = handler.performRequests_error(&*req_array);

                if let Some(results) = request.results() {
                    for obs in results.iter() {
                        // topCandidates(1) returns the single best candidate string.
                        let candidates = obs.topCandidates(1);
                        if let Some(top) = candidates.firstObject() {
                            let text = top.string().to_string();
                            if !text.is_empty() {
                                all_text.push_str(&text);
                                all_text.push('\n');
                            }
                        }
                    }
                }
            }
        }

        info!("macOS PDF OCR complete: {} chars", all_text.len());
        Ok(all_text)
    };

    // Collapse "error" and "no text" into `None`; only real text is `Some`.
    match run() {
        Ok(text) if !text.trim().is_empty() => Some(text),
        Ok(_) => {
            debug!("macOS OCR: no text found in PDF");
            None
        }
        Err(e) => {
            debug!("macOS OCR failed: {e}");
            None
        }
    }
}
563
564/// Extract content from DOCX files
565async fn extract_docx_content(file_path: &Path) -> Result<String> {
566    let bytes = fs::read(file_path)?;
567    Ok(extract_docx_from_bytes(&bytes))
568}
569
570fn extract_docx_from_bytes(bytes: &[u8]) -> String {
571    let cursor = Cursor::new(bytes);
572    match zip::ZipArchive::new(cursor) {
573        Ok(mut archive) => {
574            if let Ok(mut file) = archive.by_name("word/document.xml") {
575                let mut xml = String::new();
576                if file.read_to_string(&mut xml).is_ok() {
577                    let text = xml_to_plain_text(&xml, "</w:p>", "</w:tr>");
578                    if text.is_empty() {
579                        "[DOCX file appears to be empty]".to_string()
580                    } else {
581                        text
582                    }
583                } else {
584                    "[Could not read DOCX content]".to_string()
585                }
586            } else {
587                "[Could not find document content in DOCX file]".to_string()
588            }
589        }
590        Err(e) => {
591            debug!("DOCX extraction failed: {}", e);
592            format!("[Could not extract DOCX content: {}]", e)
593        }
594    }
595}
596
/// General XML → plain text helper used for DOCX, PPTX, and ODT.
///
/// `paragraph_end` and `row_end` are the XML closing tags that should become
/// newlines before all other tags are stripped.
///
/// Processing steps:
/// 1. Replace the two structural closing tags with `\n`.
/// 2. Remove every remaining tag (`<` + at least one non-`>` character + `>`).
/// 3. Decode the common XML entities. `&amp;` is decoded last so that escaped
///    text such as `&amp;lt;` yields the literal `&lt;` and is not decoded twice.
/// 4. Trim every line and collapse runs of blank lines into a single blank line;
///    the final result has no leading or trailing whitespace.
fn xml_to_plain_text(xml: &str, paragraph_end: &str, row_end: &str) -> String {
    // Insert newlines at structural boundaries before stripping all tags
    let with_breaks = xml
        .replace(paragraph_end, "\n")
        .replace(row_end, "\n");

    // Strip all remaining XML tags in one linear pass (no regex compilation
    // per call).
    let untagged = strip_xml_tags(&with_breaks);

    // Decode the most common XML/HTML entities. The ampersand must come last:
    // decoding it first would turn "&amp;lt;" into "&lt;" and then into "<".
    let decoded = untagged
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&apos;", "'")
        .replace("&#x9;", "\t")
        .replace("&#xA;", "\n")
        .replace("&#xD;", "")
        .replace("&amp;", "&");

    // Normalise: trim each line, drop lines that are only whitespace,
    // collapse more than one consecutive blank line into a single blank line
    let mut result = String::with_capacity(decoded.len());
    let mut previous_blank = false;
    for line in decoded.lines().map(str::trim) {
        if line.is_empty() {
            if !previous_blank {
                result.push('\n');
            }
            previous_blank = true;
        } else {
            previous_blank = false;
            result.push_str(line);
            result.push('\n');
        }
    }

    result.trim().to_string()
}

/// Remove every `<…>` span that has at least one character between the angle
/// brackets. A bare `<>` is kept verbatim, and a `<` with no later `>` is left
/// untouched.
fn strip_xml_tags(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    let mut rest = input;

    while let Some(open) = rest.find('<') {
        let after_open = &rest[open + 1..];
        match after_open.find('>') {
            // "<...>" with content: drop the tag, keep what preceded it.
            Some(close) if close > 0 => {
                out.push_str(&rest[..open]);
                rest = &after_open[close + 1..];
            }
            // "<>" is not a tag: copy it through and continue after it.
            Some(_) => {
                out.push_str(&rest[..open + 2]);
                rest = &rest[open + 2..];
            }
            // No closing bracket anywhere ahead: nothing further can be a tag.
            None => break,
        }
    }

    out.push_str(rest);
    out
}
642
643/// Extract content from XLSX files
644async fn extract_xlsx_content(file_path: &Path) -> Result<String> {
645    use calamine::{Reader, open_workbook_auto};
646    
647    match open_workbook_auto(file_path) {
648        Ok(mut workbook) => {
649            let mut text = String::new();
650            for sheet_name in workbook.sheet_names().to_vec() {
651                if let Ok(range) = workbook.worksheet_range(&sheet_name) {
652                    text.push_str(&format!("\n=== Sheet: {} ===\n", sheet_name));
653                    for row in range.rows() {
654                        let row_text: Vec<String> = row.iter().map(|c| c.to_string()).collect();
655                        text.push_str(&row_text.join("\t"));
656                        text.push('\n');
657                    }
658                }
659            }
660            if text.trim().is_empty() {
661                Ok("[Spreadsheet appears to be empty]".to_string())
662            } else {
663                Ok(text)
664            }
665        }
666        Err(e) => {
667            debug!("XLSX extraction failed: {}", e);
668            Ok(format!("[Could not extract spreadsheet content: {}]", e))
669        }
670    }
671}
672
673fn extract_xlsx_from_bytes(bytes: &[u8], ext: &str) -> String {
674    use calamine::{Reader, Xls, Xlsx, Ods};
675
676    let mut text = String::new();
677
678    match ext {
679        "ods" => {
680            let cursor = Cursor::new(bytes);
681            if let Ok(mut workbook) = Ods::new(cursor) {
682                for sheet_name in workbook.sheet_names().to_vec() {
683                    if let Ok(range) = workbook.worksheet_range(&sheet_name) {
684                        text.push_str(&format!("\n=== Sheet: {} ===\n", sheet_name));
685                        for row in range.rows() {
686                            let row_text: Vec<String> = row.iter().map(|c| c.to_string()).collect();
687                            text.push_str(&row_text.join("\t"));
688                            text.push('\n');
689                        }
690                    }
691                }
692            }
693        }
694        "xls" => {
695            // OLE2/BIFF format — requires calamine::Xls, not calamine::Xlsx
696            let cursor = Cursor::new(bytes);
697            if let Ok(mut workbook) = Xls::new(cursor) {
698                for sheet_name in workbook.sheet_names().to_vec() {
699                    if let Ok(range) = workbook.worksheet_range(&sheet_name) {
700                        text.push_str(&format!("\n=== Sheet: {} ===\n", sheet_name));
701                        for row in range.rows() {
702                            let row_text: Vec<String> = row.iter().map(|c| c.to_string()).collect();
703                            text.push_str(&row_text.join("\t"));
704                            text.push('\n');
705                        }
706                    }
707                }
708            }
709        }
710        _ => {
711            // xlsx, xlsb — OOXML ZIP format
712            let cursor = Cursor::new(bytes);
713            if let Ok(mut workbook) = Xlsx::new(cursor) {
714                for sheet_name in workbook.sheet_names().to_vec() {
715                    if let Ok(range) = workbook.worksheet_range(&sheet_name) {
716                        text.push_str(&format!("\n=== Sheet: {} ===\n", sheet_name));
717                        for row in range.rows() {
718                            let row_text: Vec<String> = row.iter().map(|c| c.to_string()).collect();
719                            text.push_str(&row_text.join("\t"));
720                            text.push('\n');
721                        }
722                    }
723                }
724            }
725        }
726    }
727
728    if text.trim().is_empty() {
729        "[Spreadsheet appears to be empty or could not be read]".to_string()
730    } else {
731        text
732    }
733}
734
735/// Extract content from PPTX files
736async fn extract_pptx_content(file_path: &Path) -> Result<String> {
737    let bytes = fs::read(file_path)?;
738    Ok(extract_pptx_from_bytes(&bytes))
739}
740
741fn extract_pptx_from_bytes(bytes: &[u8]) -> String {
742    let cursor = Cursor::new(bytes);
743    match zip::ZipArchive::new(cursor) {
744        Ok(mut archive) => {
745            let mut text = String::new();
746            let mut slide_num = 1;
747
748            loop {
749                let slide_path = format!("ppt/slides/slide{}.xml", slide_num);
750                match archive.by_name(&slide_path) {
751                    Ok(mut file) => {
752                        let mut xml = String::new();
753                        if file.read_to_string(&mut xml).is_ok() {
754                            let content = xml_to_plain_text(&xml, "</a:p>", "</a:r>");
755                            if !content.is_empty() {
756                                text.push_str(&format!("\n=== Slide {} ===\n{}", slide_num, content));
757                            }
758                        }
759                        slide_num += 1;
760                    }
761                    Err(_) => break,
762                }
763            }
764
765            if text.trim().is_empty() {
766                "[Presentation appears to be empty]".to_string()
767            } else {
768                text
769            }
770        }
771        Err(e) => {
772            debug!("PPTX extraction failed: {}", e);
773            format!("[Could not extract presentation content: {}]", e)
774        }
775    }
776}
777
778/// Extract content from ODT files
779async fn extract_odt_content(file_path: &Path) -> Result<String> {
780    let bytes = fs::read(file_path)?;
781    Ok(extract_odt_from_bytes(&bytes))
782}
783
784fn extract_odt_from_bytes(bytes: &[u8]) -> String {
785    let cursor = Cursor::new(bytes);
786    match zip::ZipArchive::new(cursor) {
787        Ok(mut archive) => {
788            if let Ok(mut file) = archive.by_name("content.xml") {
789                let mut xml = String::new();
790                if file.read_to_string(&mut xml).is_ok() {
791                    let text = xml_to_plain_text(&xml, "</text:p>", "</table:table-row>");
792                    if text.is_empty() {
793                        "[ODT file appears to be empty]".to_string()
794                    } else {
795                        text
796                    }
797                } else {
798                    "[Could not read ODT content]".to_string()
799                }
800            } else {
801                "[Could not find content in ODT file]".to_string()
802            }
803        }
804        Err(e) => {
805            debug!("ODT extraction failed: {}", e);
806            format!("[Could not extract ODT content: {}]", e)
807        }
808    }
809}
810
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::NamedTempFile;

    /// A plain text file is returned exactly as written, newlines included.
    #[tokio::test]
    async fn test_extract_text_file() {
        let expected = "Test file content\nwith multiple lines";
        let scratch = NamedTempFile::new().unwrap();
        fs::write(scratch.path(), expected).unwrap();

        let extracted = extract_text_file(scratch.path()).await.unwrap();

        assert_eq!(extracted, expected);
    }

    /// A file whose type is not recognised is still read as plain text through
    /// the public entry point.
    #[tokio::test]
    async fn test_extract_unknown_file_type() {
        let expected = "Unknown file content";
        let scratch = NamedTempFile::new().unwrap();
        fs::write(scratch.path(), expected).unwrap();

        let extracted = extract_file_content(scratch.path()).await.unwrap();

        assert_eq!(extracted, expected);
    }
}
offline_intelligence/utils/file_processor.rs

offline_intelligence/utils/
file_processor.rs