memvid_core/reader/
xls.rs

1use std::io::Cursor;
2
3use calamine::{DataType, Reader as CalamineReader, Xls};
4
5use crate::{
6    DocumentFormat, DocumentReader, PassthroughReader, ReaderDiagnostics, ReaderHint, ReaderOutput,
7    Result,
8};
9
10/// Reader for legacy Excel 97-2003 (.xls) files using calamine.
11pub struct XlsReader;
12
13impl XlsReader {
14    fn extract_text(bytes: &[u8]) -> Result<String> {
15        let cursor = Cursor::new(bytes);
16        let mut workbook =
17            Xls::new(cursor).map_err(|err| crate::MemvidError::ExtractionFailed {
18                reason: format!("failed to read xls workbook: {err}").into(),
19            })?;
20
21        let mut out = String::new();
22        for sheet_name in workbook.sheet_names().to_owned() {
23            if let Some(Ok(range)) = workbook.worksheet_range(&sheet_name) {
24                if !out.is_empty() {
25                    out.push_str("\n");
26                }
27                out.push_str(&format!("Sheet: {}\n", sheet_name));
28                for row in range.rows() {
29                    let mut first_cell = true;
30                    for cell in row {
31                        if !first_cell {
32                            out.push('\t');
33                        }
34                        first_cell = false;
35                        match cell {
36                            DataType::String(s) => out.push_str(s.trim()),
37                            DataType::Float(v) => out.push_str(&format!("{}", v)),
38                            DataType::Int(v) => out.push_str(&format!("{}", v)),
39                            DataType::Bool(b) => out.push_str(if *b { "true" } else { "false" }),
40                            DataType::Error(e) => out.push_str(&format!("#{:?}", e)),
41                            DataType::Empty => {}
42                            DataType::DateTime(v) => out.push_str(&format!("{}", v)),
43                            DataType::DateTimeIso(s) => out.push_str(s),
44                            DataType::Duration(v) => out.push_str(&format!("{}", v)),
45                            DataType::DurationIso(s) => out.push_str(s),
46                        }
47                    }
48                    out.push('\n');
49                }
50            }
51        }
52
53        Ok(out.trim().to_string())
54    }
55}
56
57impl DocumentReader for XlsReader {
58    fn name(&self) -> &'static str {
59        "xls"
60    }
61
62    fn supports(&self, hint: &ReaderHint<'_>) -> bool {
63        matches!(hint.format, Some(DocumentFormat::Xls))
64            || hint
65                .mime
66                .map(|mime| mime.eq_ignore_ascii_case("application/vnd.ms-excel"))
67                .unwrap_or(false)
68    }
69
70    fn extract(&self, bytes: &[u8], hint: &ReaderHint<'_>) -> Result<ReaderOutput> {
71        match Self::extract_text(bytes) {
72            Ok(text) => {
73                if text.trim().is_empty() {
74                    // Calamine returned empty - try extractous as fallback
75                    let mut fallback = PassthroughReader.extract(bytes, hint)?;
76                    fallback.reader_name = self.name().to_string();
77                    fallback.diagnostics.mark_fallback();
78                    fallback.diagnostics.record_warning(
79                        "xls reader produced empty text; falling back to default extractor",
80                    );
81                    Ok(fallback)
82                } else {
83                    // Calamine succeeded - build output directly WITHOUT calling extractous
84                    let mut document = crate::ExtractedDocument::empty();
85                    document.text = Some(text);
86                    document.mime_type = Some("application/vnd.ms-excel".to_string());
87                    Ok(ReaderOutput::new(document, self.name())
88                        .with_diagnostics(ReaderDiagnostics::default()))
89                }
90            }
91            Err(err) => {
92                // Calamine failed - try extractous as fallback
93                let mut fallback = PassthroughReader.extract(bytes, hint)?;
94                fallback.reader_name = self.name().to_string();
95                fallback.diagnostics.mark_fallback();
96                fallback
97                    .diagnostics
98                    .record_warning(format!("xls reader error: {err}"));
99                Ok(fallback)
100            }
101        }
102    }
103}