offline_intelligence/utils/
file_processor.rs1use std::path::Path;
2use std::fs;
3use std::io::{Read, Cursor};
4use tracing::{debug, info};
5use anyhow::Result;
6
7#[cfg(target_os = "macos")]
10use core_graphics::geometry::CGRect;
11
// Raw Core Graphics (Quartz) PDF entry points, declared by hand for the
// macOS-only OCR path in this file. Every handle (document, page, context,
// data provider) is passed as an untyped `*mut c_void`, so the compiler cannot
// check that the right kind of object is supplied — callers must.
#[cfg(target_os = "macos")]
extern "C" {
    /// Creates a PDF document handle from a data provider handle.
    /// The caller in this file treats a null return as failure.
    fn CGPDFDocumentCreateWithProvider(provider: *mut std::ffi::c_void) -> *mut std::ffi::c_void;
    /// Returns the number of pages in `document`.
    fn CGPDFDocumentGetNumberOfPages(document: *mut std::ffi::c_void) -> usize;
    /// Returns a page handle for `page_index`; the caller in this file passes
    /// 1-based indexes and skips null results.
    /// NOTE(review): Core Foundation "Get" functions conventionally return a
    /// reference the caller does not own — confirm callers do not release it.
    fn CGPDFDocumentGetPage(document: *mut std::ffi::c_void, page_index: usize) -> *mut std::ffi::c_void;
    /// Returns one of the page's box rectangles, selected by `box_type`.
    /// The caller in this file passes 0 (presumably the media box — TODO confirm).
    fn CGPDFPageGetBoxRect(page: *mut std::ffi::c_void, box_type: i32) -> CGRect;
    /// Draws `page` into the graphics context `context`.
    fn CGContextDrawPDFPage(context: *mut std::ffi::c_void, page: *mut std::ffi::c_void);
    /// Applies a scale of (`sx`, `sy`) to the context's current transform.
    fn CGContextScaleCTM(context: *mut std::ffi::c_void, sx: f64, sy: f64);
    /// Applies a translation of (`tx`, `ty`) to the context's current transform.
    fn CGContextTranslateCTM(context: *mut std::ffi::c_void, tx: f64, ty: f64);
    /// Releases a PDF document handle.
    fn CGPDFDocumentRelease(document: *mut std::ffi::c_void);
    /// Releases a PDF page handle.
    fn CGPDFPageRelease(page: *mut std::ffi::c_void);
}
24
/// Reports whether `s` is one of the bracketed placeholder messages the
/// extractors in this module return in place of real document text.
///
/// The check is a case-sensitive prefix match; leading whitespace is not
/// skipped.
pub fn is_extraction_sentinel(s: &str) -> bool {
    const SENTINEL_PREFIXES: [&str; 6] = [
        "[Could not",
        "[PDF",
        "[DOCX",
        "[Spreadsheet",
        "[Presentation",
        "[ODT",
    ];

    SENTINEL_PREFIXES.iter().any(|prefix| s.starts_with(prefix))
}
36
/// Rough token estimate for `text`: one token per four UTF-8 bytes, rounded up.
///
/// The count is based on the byte length, not the number of characters.
pub fn estimate_tokens(text: &str) -> usize {
    let byte_len = text.len();
    // Whole groups of four bytes, plus one more token for any remainder.
    byte_len / 4 + usize::from(byte_len % 4 != 0)
}
41
/// Truncates `text` so it fits within `max_tokens`, using the same
/// four-bytes-per-token approximation as the rest of this module.
///
/// Returns the (possibly shortened) text and a flag that is `true` when
/// something was cut off. When truncation is needed the cut is made:
/// 1. no later than `max_tokens * 4` bytes, backed off to a UTF-8 character
///    boundary so the slice is always valid, and
/// 2. at the last newline inside that window, so the result ends on a complete
///    line whenever possible.
///
/// A newline at byte 0 is not treated as a usable break: cutting there would
/// return an empty string even though the budget allows content, so in that
/// case the full window is kept instead.
pub fn truncate_to_budget(text: &str, max_tokens: usize) -> (String, bool) {
    let max_chars = max_tokens.saturating_mul(4);
    if text.len() <= max_chars {
        return (text.to_string(), false);
    }

    // Never slice through the middle of a multi-byte character.
    let mut end = max_chars;
    while end > 0 && !text.is_char_boundary(end) {
        end -= 1;
    }
    let window = &text[..end];

    // Prefer ending on a full line, unless that would discard everything.
    let cut = match window.rfind('\n') {
        Some(newline_at) if newline_at > 0 => newline_at,
        _ => end,
    };

    (window[..cut].to_string(), true)
}
61
62pub async fn extract_file_content(file_path: &Path) -> Result<String> {
64 let file_ext = file_path.extension()
65 .and_then(|ext| ext.to_str())
66 .map(|ext| ext.to_lowercase())
67 .unwrap_or_default();
68
69 match file_ext.as_str() {
70 "txt" | "md" | "json" | "yaml" | "yml" | "xml" | "csv" | "log" => {
72 extract_text_file(file_path).await
73 },
74 "js" | "ts" | "jsx" | "tsx" | "py" | "java" | "cpp" | "c" | "cs" |
76 "html" | "css" | "scss" | "go" | "rs" | "php" | "rb" | "swift" |
77 "kt" | "scala" | "sql" | "sh" | "bat" | "ps1" | "dockerfile" | "env" => {
78 extract_text_file(file_path).await
79 },
80 "pdf" => extract_pdf_content(file_path).await,
82 "doc" | "docx" => extract_docx_content(file_path).await,
83 "rtf" => extract_text_file(file_path).await,
84 "odt" => extract_odt_content(file_path).await,
85 "xls" | "xlsx" | "ods" => extract_xlsx_content(file_path).await,
87 "ppt" | "pptx" | "odp" => extract_pptx_content(file_path).await,
89 _ => {
91 debug!("Unknown file type {}, attempting text extraction", file_ext);
92 extract_text_file(file_path).await
93 }
94 }
95}
96
97pub async fn extract_content_from_bytes(bytes: &[u8], filename: &str) -> Result<String> {
99 let ext = filename.split('.').last().unwrap_or("").to_lowercase();
100
101 match ext.as_str() {
102 "txt" | "md" | "json" | "yaml" | "yml" | "xml" | "csv" | "log" |
104 "js" | "ts" | "jsx" | "tsx" | "py" | "java" | "cpp" | "c" | "cs" |
105 "html" | "css" | "scss" | "go" | "rs" | "php" | "rb" | "swift" |
106 "kt" | "scala" | "sql" | "sh" | "bat" | "ps1" | "dockerfile" | "env" | "rtf" => {
107 Ok(String::from_utf8_lossy(bytes).to_string())
108 },
109 "pdf" => {
111 let bytes_owned = bytes.to_vec();
112 let text = tokio::task::spawn_blocking(move || extract_pdf_from_bytes(&bytes_owned))
113 .await
114 .unwrap_or_else(|_| "[PDF extraction panicked]".to_string());
115 Ok(text)
116 },
117 "doc" | "docx" => Ok(extract_docx_from_bytes(bytes)),
119 "odt" => Ok(extract_odt_from_bytes(bytes)),
121 "xls" | "xlsx" | "ods" => Ok(extract_xlsx_from_bytes(bytes, &ext)),
123 "ppt" | "pptx" | "odp" => Ok(extract_pptx_from_bytes(bytes)),
125 _ => {
127 debug!("Unknown file type {}, attempting text extraction", ext);
128 Ok(String::from_utf8_lossy(bytes).to_string())
129 }
130 }
131}
132
133async fn extract_text_file(file_path: &Path) -> Result<String> {
135 let content = fs::read_to_string(file_path)?;
136 Ok(content)
137}
138
139async fn extract_pdf_content(file_path: &Path) -> Result<String> {
141 let bytes = fs::read(file_path)?;
142 let text = tokio::task::spawn_blocking(move || extract_pdf_from_bytes(&bytes))
144 .await
145 .unwrap_or_else(|_| "[PDF extraction panicked]".to_string());
146 Ok(text)
147}
148
149fn extract_pdf_text_layer(bytes: &[u8]) -> Option<String> {
157 let doc = lopdf::Document::load_mem(bytes).ok()?;
158 let page_count = doc.get_pages().len();
159 if page_count == 0 {
160 return None;
161 }
162
163 let page_numbers: Vec<u32> = (1..=page_count as u32).collect();
165 let full_text = doc.extract_text(&page_numbers).ok();
166
167 let text = match full_text {
170 Some(ref t) if !t.trim().is_empty() => t.clone(),
171 _ => {
172 debug!("Full-document lopdf extraction returned empty — trying page-by-page");
173 let mut page_text = String::new();
174 for page_num in 1..=page_count as u32 {
175 if let Ok(t) = doc.extract_text(&[page_num]) {
176 page_text.push_str(t.trim());
177 page_text.push('\n');
178 }
179 }
180 page_text
181 }
182 };
183
184 let trimmed = text.trim().to_string();
185 if trimmed.is_empty() {
186 return None;
187 }
188
189 let total = trimmed.chars().count();
194 if total > 0 {
195 let printable = trimmed
196 .chars()
197 .filter(|c| !c.is_control() || matches!(*c, '\n' | '\r' | '\t'))
198 .count();
199 if printable * 100 / total < 40 {
200 info!(
201 "PDF text layer looks garbled ({}/{} printable chars) — will try OCR fallback",
202 printable, total
203 );
204 return None;
205 }
206 }
207
208 info!("PDF text layer extracted ({} chars, {} pages) — no OCR needed", trimmed.len(), page_count);
209 Some(trimmed)
210}
211
212fn extract_pdf_from_bytes(bytes: &[u8]) -> String {
213 if let Some(text) = extract_pdf_text_layer(bytes) {
217 return text;
218 }
219
220 info!("PDF has no extractable text layer — attempting OS-native OCR");
221
222 #[cfg(target_os = "windows")]
224 {
225 match windows_ocr_pdf(bytes) {
226 Some(text) if !text.trim().is_empty() => {
227 info!("PDF extracted via Windows OCR ({} chars)", text.len());
228 return text;
229 }
230 Some(_) => {
231 info!("Windows OCR returned empty text — PDF may be purely image-based");
232 }
233 None => {
234 info!("Windows OCR unavailable or failed — PDF may be encrypted or corrupted");
235 }
236 }
237 }
238
239 #[cfg(target_os = "macos")]
240 {
241 match macos_ocr_pdf(bytes) {
242 Some(text) if !text.trim().is_empty() => {
243 info!("PDF extracted via macOS Vision OCR ({} chars)", text.len());
244 return text;
245 }
246 Some(_) => {
247 info!("macOS OCR returned empty text — PDF may be purely image-based");
248 }
249 None => {
250 info!("macOS OCR unavailable or failed — PDF may be encrypted or corrupted");
251 }
252 }
253 }
254
255 "[PDF extraction failed. This file appears to be scanned, encrypted, or corrupted. \
257Please try: 1) Save as text-based PDF, 2) Use DOCX format, or 3) Paste text directly]".to_string()
258}
259
/// Makes sure `RoInitialize` has been called on the current thread before any
/// WinRT API is used.
///
/// The call lives in the initializer of a thread-local, so it runs the first
/// time this function is invoked on a given thread and is skipped afterwards.
/// The result of `RoInitialize` is deliberately discarded, so this function
/// does not report whether initialization actually succeeded.
#[cfg(target_os = "windows")]
fn ensure_winrt_init() {
    thread_local! {
        // The value itself is `()`; only the side effect of the initializer matters.
        static INIT: () = {
            // SAFETY: NOTE(review) — `RoInitialize` is called with a constant
            // flag and no pointers; presumably sound on any thread, confirm
            // against the `windows` crate documentation.
            unsafe {
                // Requests the multithreaded apartment; the returned result is ignored.
                let _ = windows::Win32::System::WinRT::RoInitialize(
                    windows::Win32::System::WinRT::RO_INIT_MULTITHREADED,
                );
            }
        };
    }
    // Touching the thread-local forces its initializer to run on first use.
    INIT.with(|_| ());
}
279
/// OCRs every page of an in-memory PDF with the WinRT PDF renderer and OCR
/// engine.
///
/// Returns `Some(text)` only when at least one page produced non-whitespace
/// text (each page's text is followed by a newline). Returns `None` when no
/// text was found or when any step other than the per-page recognition fails;
/// the reason is logged at `info` level.
#[cfg(target_os = "windows")]
fn windows_ocr_pdf(bytes: &[u8]) -> Option<String> {
    use windows::{
        core::*,
        Data::Pdf::PdfDocument,
        Graphics::Imaging::{BitmapDecoder, BitmapPixelFormat, SoftwareBitmap},
        Media::Ocr::OcrEngine,
        Storage::Streams::{DataWriter, IOutputStream, IRandomAccessStream, InMemoryRandomAccessStream},
    };

    info!("Starting Windows OCR for PDF ({} bytes)", bytes.len());

    ensure_winrt_init();
    // Logged unconditionally: `ensure_winrt_init` does not report failures.
    info!("WinRT initialized successfully");

    // All WinRT work happens in this closure so `?` can be used throughout and
    // every error is mapped to `None` in one place below.
    let run = || -> windows::core::Result<String> {
        info!("Creating in-memory PDF stream");
        // Copy the PDF bytes into an in-memory random-access stream.
        let pdf_stream = InMemoryRandomAccessStream::new()?;
        {
            let writer = DataWriter::new()?;
            writer.WriteBytes(bytes)?;
            let buffer = writer.DetachBuffer()?;
            let out: IOutputStream = pdf_stream.cast()?;
            // `.get()` waits for each async operation before continuing.
            out.WriteAsync(&buffer)?.get()?;
            out.FlushAsync()?.get()?;
        }
        // Rewind so the PDF loader reads from the start.
        pdf_stream.Seek(0)?;
        info!("PDF stream created successfully");

        info!("Loading PDF document");
        let pdf_doc = PdfDocument::LoadFromStreamAsync(&pdf_stream)?.get()?;
        let page_count = pdf_doc.PageCount()?;
        info!("PDF loaded, {} pages", page_count);

        if page_count == 0 {
            return Ok(String::new());
        }

        info!("Creating OCR engine");
        // The engine is built from the user's profile languages.
        let ocr_engine = OcrEngine::TryCreateFromUserProfileLanguages()?;
        info!("OCR engine created successfully");

        let mut all_text = String::new();

        for page_idx in 0..page_count {
            info!("Processing page {}/{}", page_idx + 1, page_count);
            let page = pdf_doc.GetPage(page_idx)?;

            // Render the page into a fresh in-memory image stream.
            let img_stream = InMemoryRandomAccessStream::new()?;
            let img_iras: IRandomAccessStream = img_stream.cast()?;
            page.RenderToStreamAsync(&img_iras)?.get()?;
            // Rewind so the decoder reads the rendered image from the start.
            img_stream.Seek(0)?;
            info!("Page {} rendered to stream", page_idx);

            let decoder = BitmapDecoder::CreateAsync(&img_iras)?.get()?;
            let bitmap = decoder.GetSoftwareBitmapAsync()?.get()?;

            // Convert to Bgra8 when the decoder produced a different pixel format.
            let bitmap = if bitmap.BitmapPixelFormat()? != BitmapPixelFormat::Bgra8 {
                SoftwareBitmap::Convert(&bitmap, BitmapPixelFormat::Bgra8)?
            } else {
                bitmap
            };

            // A recognition failure on one page is logged and the page skipped;
            // it does not abort the remaining pages.
            match ocr_engine.RecognizeAsync(&bitmap)?.get() {
                Ok(result) => {
                    let text = result.Text()?.to_string();
                    if !text.trim().is_empty() {
                        all_text.push_str(&text);
                        all_text.push('\n');
                        info!("Extracted {} chars from page {}", text.len(), page_idx);
                    }
                }
                Err(e) => info!("OCR page {} error: {}", page_idx, e),
            }
        }

        info!("Windows OCR complete, total chars: {}", all_text.len());
        Ok(all_text)
    };

    // Collapse "no text" and "error" into `None`, logging which one occurred.
    match run() {
        Ok(text) if !text.trim().is_empty() => Some(text),
        Ok(_) => {
            info!("Windows OCR: no text found in PDF");
            None
        }
        Err(e) => {
            info!("Windows OCR failed: {}", e);
            None
        }
    }
}
382
/// OCRs every page of an in-memory PDF using Core Graphics for rendering and
/// the Vision framework for text recognition.
///
/// Each page is rasterised at 150 DPI into an RGBA buffer, encoded as PNG and
/// handed to a `VNRecognizeTextRequest`; the top candidate of every
/// observation is appended to the result on its own line.
///
/// Returns `Some(text)` only when non-whitespace text was recognised. Returns
/// `None` when the document cannot be opened, a page cannot be PNG-encoded, or
/// no text is found; the reason is logged at `debug` level.
#[cfg(target_os = "macos")]
fn macos_ocr_pdf(bytes: &[u8]) -> Option<String> {
    use std::ffi::c_void;
    use std::sync::Arc;
    use core_graphics::{
        color_space::CGColorSpace,
        context::CGContext,
        data_provider::CGDataProvider,
    };
    use objc2::rc::Retained;
    use objc2_foundation::{NSArray, NSData, NSDictionary, NSString};
    use objc2_vision::{
        VNImageRequestHandler, VNRecognizeTextRequest,
        VNRequest, VNRequestTextRecognitionLevel,
    };

    // kCGImageAlphaPremultipliedLast: bytes are laid out R, G, B, A, which is
    // what `png::ColorType::Rgba` expects. 8-bit RGB bitmap contexts only
    // accept premultiplied or skipped alpha; the previous value 4
    // (kCGImageAlphaFirst, non-premultiplied ARGB) is not a supported format.
    const BITMAP_INFO: u32 = 1;

    /// Owns a `CGPDFDocumentRef` obtained from a Create function and releases
    /// it on drop, so every exit path (including `?`) frees the document.
    struct OwnedPdfDocument(*mut std::ffi::c_void);

    impl Drop for OwnedPdfDocument {
        fn drop(&mut self) {
            // SAFETY: the pointer is non-null, came from
            // `CGPDFDocumentCreateWithProvider`, and is released exactly once.
            unsafe { CGPDFDocumentRelease(self.0) };
        }
    }

    let run = || -> Result<String, String> {
        let pdf_data: Arc<Vec<u8>> = Arc::new(bytes.to_vec());
        let provider = CGDataProvider::from_buffer(pdf_data);

        // SAFETY: `provider` is a live data provider that outlives the document
        // guard declared after it.
        let raw_doc = unsafe {
            CGPDFDocumentCreateWithProvider(provider.as_ptr() as *mut c_void)
        };
        if raw_doc.is_null() {
            return Err("CGPDFDocumentCreateWithProvider returned null".into());
        }
        let doc = OwnedPdfDocument(raw_doc);

        // SAFETY: `doc.0` is a valid, non-null document handle.
        let page_count = unsafe { CGPDFDocumentGetNumberOfPages(doc.0) };
        if page_count == 0 {
            return Ok(String::new());
        }

        info!("macOS PDF OCR: {} page(s)", page_count);
        let mut all_text = String::new();

        // Core Graphics page numbers start at 1.
        for page_idx in 1..=page_count {
            // SAFETY: `doc.0` is valid. The returned page follows the "Get"
            // rule: it is owned by the document and must NOT be released here.
            let page = unsafe { CGPDFDocumentGetPage(doc.0, page_idx) };
            if page.is_null() {
                continue;
            }

            // SAFETY: `page` is non-null and kept alive by `doc`.
            let media_box = unsafe { CGPDFPageGetBoxRect(page, 0) };
            let pt_w = media_box.size.width;
            let pt_h = media_box.size.height;

            // PDF user space is 72 points per inch; render at 150 DPI.
            let scale = 150.0_f64 / 72.0;
            let px_w = ((pt_w * scale).ceil() as usize).max(1);
            let px_h = ((pt_h * scale).ceil() as usize).max(1);
            let bytes_per_row = px_w * 4;
            // Start from an opaque white page.
            let mut pixel_buf = vec![255u8; bytes_per_row * px_h];

            let color_space = CGColorSpace::create_device_rgb();
            let ctx = unsafe {
                CGContext::create_bitmap_context(
                    Some(pixel_buf.as_mut_ptr() as *mut c_void),
                    px_w,
                    px_h,
                    8,
                    bytes_per_row,
                    &color_space,
                    BITMAP_INFO,
                )
            };
            let ctx_ptr = ctx.as_ptr() as *mut c_void;

            // NOTE(review): this transform mirrors the page vertically before
            // drawing. Bitmap-context memory is normally already top-row-first,
            // so confirm the flip is needed and does not turn the text upside
            // down for Vision.
            // SAFETY: `ctx_ptr` and `page` are valid for the duration of these
            // calls, and `pixel_buf` outlives `ctx`.
            unsafe {
                CGContextTranslateCTM(ctx_ptr, 0.0, px_h as f64);
                CGContextScaleCTM(ctx_ptr, scale, -scale);
                CGContextDrawPDFPage(ctx_ptr, page);
            }

            // Drop the context before reading the pixels it was drawing into.
            drop(ctx);

            let mut png_bytes: Vec<u8> = Vec::new();
            {
                let mut enc = png::Encoder::new(&mut png_bytes, px_w as u32, px_h as u32);
                enc.set_color(png::ColorType::Rgba);
                enc.set_depth(png::BitDepth::Eight);
                enc.write_header()
                    .and_then(|mut w| w.write_image_data(&pixel_buf))
                    .map_err(|e| format!("PNG encode failed on page {page_idx}: {e}"))?;
            }

            info!(
                "Page {page_idx} rendered {px_w}×{px_h} px ({} PNG bytes)",
                png_bytes.len()
            );

            // SAFETY: every Objective-C object used here is created in this
            // block and stays alive until the block ends.
            unsafe {
                let ns_data = NSData::with_bytes(&png_bytes);
                let options = NSDictionary::<NSString, objc2::runtime::AnyObject>::new();

                let handler = VNImageRequestHandler::initWithData_options(
                    VNImageRequestHandler::alloc(),
                    &ns_data,
                    &options,
                );

                let request =
                    VNRecognizeTextRequest::init(VNRecognizeTextRequest::alloc());

                request.setRecognitionLevel(VNRequestTextRecognitionLevel::Accurate);
                request.setUsesLanguageCorrection(true);

                let req_as_base: &VNRequest = &*request;
                let req_array = NSArray::from_slice(&[req_as_base]);

                // Best effort: a failed request simply leaves no results for this page.
                let _ = handler.performRequests_error(&*req_array);

                if let Some(results) = request.results() {
                    for obs in results.iter() {
                        let candidates = obs.topCandidates(1);
                        if let Some(top) = candidates.firstObject() {
                            let text = top.string().to_string();
                            if !text.is_empty() {
                                all_text.push_str(&text);
                                all_text.push('\n');
                            }
                        }
                    }
                }
            }
        }

        info!("macOS PDF OCR complete: {} chars", all_text.len());
        Ok(all_text)
    };

    match run() {
        Ok(text) if !text.trim().is_empty() => Some(text),
        Ok(_) => {
            debug!("macOS OCR: no text found in PDF");
            None
        }
        Err(e) => {
            debug!("macOS OCR failed: {e}");
            None
        }
    }
}
563
564async fn extract_docx_content(file_path: &Path) -> Result<String> {
566 let bytes = fs::read(file_path)?;
567 Ok(extract_docx_from_bytes(&bytes))
568}
569
570fn extract_docx_from_bytes(bytes: &[u8]) -> String {
571 let cursor = Cursor::new(bytes);
572 match zip::ZipArchive::new(cursor) {
573 Ok(mut archive) => {
574 if let Ok(mut file) = archive.by_name("word/document.xml") {
575 let mut xml = String::new();
576 if file.read_to_string(&mut xml).is_ok() {
577 let text = xml_to_plain_text(&xml, "</w:p>", "</w:tr>");
578 if text.is_empty() {
579 "[DOCX file appears to be empty]".to_string()
580 } else {
581 text
582 }
583 } else {
584 "[Could not read DOCX content]".to_string()
585 }
586 } else {
587 "[Could not find document content in DOCX file]".to_string()
588 }
589 }
590 Err(e) => {
591 debug!("DOCX extraction failed: {}", e);
592 format!("[Could not extract DOCX content: {}]", e)
593 }
594 }
595}
596
/// Removes every `<...>` span that contains at least one character between the
/// angle brackets. An empty `<>` pair and an unclosed `<` are kept verbatim.
fn strip_xml_tags(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    let mut rest = input;

    while let Some(open) = rest.find('<') {
        out.push_str(&rest[..open]);
        let after_open = &rest[open + 1..];
        match after_open.find('>') {
            // "<>" is not a tag: keep the '<' and carry on from the '>'.
            Some(0) => {
                out.push('<');
                rest = after_open;
            }
            // A real tag: skip everything up to and including its '>'.
            Some(close) => rest = &after_open[close + 1..],
            // No closing bracket anywhere: the remainder is plain text.
            None => {
                out.push('<');
                rest = after_open;
                break;
            }
        }
    }

    out.push_str(rest);
    out
}

/// Converts document XML into plain text.
///
/// * `paragraph_end` and `row_end` are the closing tags that should become
///   line breaks (for example `</w:p>` and `</w:tr>` for DOCX).
/// * All remaining tags are removed.
/// * The predefined XML entities and the `&#x9;`, `&#xA;`, `&#xD;` character
///   references are decoded. `&amp;` is decoded last so that an escaped
///   entity such as `&amp;lt;` yields the literal text `&lt;` instead of being
///   decoded twice.
/// * Each line is trimmed, runs of blank lines collapse to a single blank
///   line, and the whole result is trimmed.
fn xml_to_plain_text(xml: &str, paragraph_end: &str, row_end: &str) -> String {
    let with_breaks = xml
        .replace(paragraph_end, "\n")
        .replace(row_end, "\n");

    // Tags are stripped before entities are decoded, so a decoded '<' is never
    // mistaken for markup.
    let plain = strip_xml_tags(&with_breaks)
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&apos;", "'")
        .replace("&#x9;", "\t")
        .replace("&#xA;", "\n")
        .replace("&#xD;", "")
        .replace("&amp;", "&");

    let mut result = String::with_capacity(plain.len());
    let mut blank_run = 0usize;
    for line in plain.lines() {
        let trimmed = line.trim();
        if trimmed.is_empty() {
            blank_run += 1;
            // Only the first blank line of a run is kept.
            if blank_run == 1 {
                result.push('\n');
            }
        } else {
            blank_run = 0;
            result.push_str(trimmed);
            result.push('\n');
        }
    }

    result.trim().to_string()
}
642
643async fn extract_xlsx_content(file_path: &Path) -> Result<String> {
645 use calamine::{Reader, open_workbook_auto};
646
647 match open_workbook_auto(file_path) {
648 Ok(mut workbook) => {
649 let mut text = String::new();
650 for sheet_name in workbook.sheet_names().to_vec() {
651 if let Ok(range) = workbook.worksheet_range(&sheet_name) {
652 text.push_str(&format!("\n=== Sheet: {} ===\n", sheet_name));
653 for row in range.rows() {
654 let row_text: Vec<String> = row.iter().map(|c| c.to_string()).collect();
655 text.push_str(&row_text.join("\t"));
656 text.push('\n');
657 }
658 }
659 }
660 if text.trim().is_empty() {
661 Ok("[Spreadsheet appears to be empty]".to_string())
662 } else {
663 Ok(text)
664 }
665 }
666 Err(e) => {
667 debug!("XLSX extraction failed: {}", e);
668 Ok(format!("[Could not extract spreadsheet content: {}]", e))
669 }
670 }
671}
672
673fn extract_xlsx_from_bytes(bytes: &[u8], ext: &str) -> String {
674 use calamine::{Reader, Xls, Xlsx, Ods};
675
676 let mut text = String::new();
677
678 match ext {
679 "ods" => {
680 let cursor = Cursor::new(bytes);
681 if let Ok(mut workbook) = Ods::new(cursor) {
682 for sheet_name in workbook.sheet_names().to_vec() {
683 if let Ok(range) = workbook.worksheet_range(&sheet_name) {
684 text.push_str(&format!("\n=== Sheet: {} ===\n", sheet_name));
685 for row in range.rows() {
686 let row_text: Vec<String> = row.iter().map(|c| c.to_string()).collect();
687 text.push_str(&row_text.join("\t"));
688 text.push('\n');
689 }
690 }
691 }
692 }
693 }
694 "xls" => {
695 let cursor = Cursor::new(bytes);
697 if let Ok(mut workbook) = Xls::new(cursor) {
698 for sheet_name in workbook.sheet_names().to_vec() {
699 if let Ok(range) = workbook.worksheet_range(&sheet_name) {
700 text.push_str(&format!("\n=== Sheet: {} ===\n", sheet_name));
701 for row in range.rows() {
702 let row_text: Vec<String> = row.iter().map(|c| c.to_string()).collect();
703 text.push_str(&row_text.join("\t"));
704 text.push('\n');
705 }
706 }
707 }
708 }
709 }
710 _ => {
711 let cursor = Cursor::new(bytes);
713 if let Ok(mut workbook) = Xlsx::new(cursor) {
714 for sheet_name in workbook.sheet_names().to_vec() {
715 if let Ok(range) = workbook.worksheet_range(&sheet_name) {
716 text.push_str(&format!("\n=== Sheet: {} ===\n", sheet_name));
717 for row in range.rows() {
718 let row_text: Vec<String> = row.iter().map(|c| c.to_string()).collect();
719 text.push_str(&row_text.join("\t"));
720 text.push('\n');
721 }
722 }
723 }
724 }
725 }
726 }
727
728 if text.trim().is_empty() {
729 "[Spreadsheet appears to be empty or could not be read]".to_string()
730 } else {
731 text
732 }
733}
734
735async fn extract_pptx_content(file_path: &Path) -> Result<String> {
737 let bytes = fs::read(file_path)?;
738 Ok(extract_pptx_from_bytes(&bytes))
739}
740
741fn extract_pptx_from_bytes(bytes: &[u8]) -> String {
742 let cursor = Cursor::new(bytes);
743 match zip::ZipArchive::new(cursor) {
744 Ok(mut archive) => {
745 let mut text = String::new();
746 let mut slide_num = 1;
747
748 loop {
749 let slide_path = format!("ppt/slides/slide{}.xml", slide_num);
750 match archive.by_name(&slide_path) {
751 Ok(mut file) => {
752 let mut xml = String::new();
753 if file.read_to_string(&mut xml).is_ok() {
754 let content = xml_to_plain_text(&xml, "</a:p>", "</a:r>");
755 if !content.is_empty() {
756 text.push_str(&format!("\n=== Slide {} ===\n{}", slide_num, content));
757 }
758 }
759 slide_num += 1;
760 }
761 Err(_) => break,
762 }
763 }
764
765 if text.trim().is_empty() {
766 "[Presentation appears to be empty]".to_string()
767 } else {
768 text
769 }
770 }
771 Err(e) => {
772 debug!("PPTX extraction failed: {}", e);
773 format!("[Could not extract presentation content: {}]", e)
774 }
775 }
776}
777
778async fn extract_odt_content(file_path: &Path) -> Result<String> {
780 let bytes = fs::read(file_path)?;
781 Ok(extract_odt_from_bytes(&bytes))
782}
783
784fn extract_odt_from_bytes(bytes: &[u8]) -> String {
785 let cursor = Cursor::new(bytes);
786 match zip::ZipArchive::new(cursor) {
787 Ok(mut archive) => {
788 if let Ok(mut file) = archive.by_name("content.xml") {
789 let mut xml = String::new();
790 if file.read_to_string(&mut xml).is_ok() {
791 let text = xml_to_plain_text(&xml, "</text:p>", "</table:table-row>");
792 if text.is_empty() {
793 "[ODT file appears to be empty]".to_string()
794 } else {
795 text
796 }
797 } else {
798 "[Could not read ODT content]".to_string()
799 }
800 } else {
801 "[Could not find content in ODT file]".to_string()
802 }
803 }
804 Err(e) => {
805 debug!("ODT extraction failed: {}", e);
806 format!("[Could not extract ODT content: {}]", e)
807 }
808 }
809}
810
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::NamedTempFile;

    /// A plain text file is returned exactly as written.
    #[tokio::test]
    async fn test_extract_text_file() {
        let expected = "Test file content\nwith multiple lines";
        let scratch = NamedTempFile::new().unwrap();
        fs::write(scratch.path(), expected).unwrap();

        let extracted = extract_text_file(scratch.path()).await.unwrap();

        assert_eq!(extracted, expected);
    }

    /// A file with an unrecognised extension falls back to text extraction.
    #[tokio::test]
    async fn test_extract_unknown_file_type() {
        let expected = "Unknown file content";
        let scratch = NamedTempFile::new().unwrap();
        fs::write(scratch.path(), expected).unwrap();

        let extracted = extract_file_content(scratch.path()).await.unwrap();

        assert_eq!(extracted, expected);
    }
}