memvid_core/reader/
xlsx.rs

1use std::io::Cursor;
2
3use calamine::{DataType, Reader as CalamineReader, Xlsx};
4
5use crate::{
6    DocumentFormat, DocumentReader, PassthroughReader, ReaderDiagnostics, ReaderHint, ReaderOutput,
7    Result,
8};
9
10pub struct XlsxReader;
11
12impl XlsxReader {
13    fn extract_text(bytes: &[u8]) -> Result<String> {
14        let cursor = Cursor::new(bytes);
15        let mut workbook =
16            Xlsx::new(cursor).map_err(|err| crate::MemvidError::ExtractionFailed {
17                reason: format!("failed to read xlsx workbook: {err}").into(),
18            })?;
19
20        let mut out = String::new();
21        for sheet_name in workbook.sheet_names().to_owned() {
22            if let Some(Ok(range)) = workbook.worksheet_range(&sheet_name) {
23                if !out.is_empty() {
24                    out.push_str("\n");
25                }
26                out.push_str(&format!("Sheet: {}\n", sheet_name));
27                for row in range.rows() {
28                    let mut first_cell = true;
29                    for cell in row {
30                        if !first_cell {
31                            out.push('\t');
32                        }
33                        first_cell = false;
34                        match cell {
35                            DataType::String(s) => out.push_str(s.trim()),
36                            DataType::Float(v) => out.push_str(&format!("{}", v)),
37                            DataType::Int(v) => out.push_str(&format!("{}", v)),
38                            DataType::Bool(b) => out.push_str(if *b { "true" } else { "false" }),
39                            DataType::Error(e) => out.push_str(&format!("#{:?}", e)),
40                            DataType::Empty => {}
41                            DataType::DateTime(v) => out.push_str(&format!("{}", v)),
42                            DataType::DateTimeIso(s) => out.push_str(s),
43                            DataType::Duration(v) => out.push_str(&format!("{}", v)),
44                            DataType::DurationIso(s) => out.push_str(s),
45                        }
46                    }
47                    out.push('\n');
48                }
49            }
50        }
51
52        Ok(out.trim().to_string())
53    }
54}
55
56impl DocumentReader for XlsxReader {
57    fn name(&self) -> &'static str {
58        "xlsx"
59    }
60
61    fn supports(&self, hint: &ReaderHint<'_>) -> bool {
62        matches!(hint.format, Some(DocumentFormat::Xlsx))
63            || hint
64                .mime
65                .map(|mime| {
66                    mime.eq_ignore_ascii_case(
67                        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
68                    )
69                })
70                .unwrap_or(false)
71    }
72
73    fn extract(&self, bytes: &[u8], hint: &ReaderHint<'_>) -> Result<ReaderOutput> {
74        match Self::extract_text(bytes) {
75            Ok(text) => {
76                if text.trim().is_empty() {
77                    // Calamine returned empty - try extractous as fallback
78                    let mut fallback = PassthroughReader.extract(bytes, hint)?;
79                    fallback.reader_name = self.name().to_string();
80                    fallback.diagnostics.mark_fallback();
81                    fallback.diagnostics.record_warning(
82                        "xlsx reader produced empty text; falling back to default extractor",
83                    );
84                    Ok(fallback)
85                } else {
86                    // Calamine succeeded - build output directly WITHOUT calling extractous
87                    let mut document = crate::ExtractedDocument::empty();
88                    document.text = Some(text);
89                    document.mime_type = Some(
90                        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
91                            .to_string(),
92                    );
93                    Ok(ReaderOutput::new(document, self.name())
94                        .with_diagnostics(ReaderDiagnostics::default()))
95                }
96            }
97            Err(err) => {
98                // Calamine failed - try extractous as fallback
99                let mut fallback = PassthroughReader.extract(bytes, hint)?;
100                fallback.reader_name = self.name().to_string();
101                fallback.diagnostics.mark_fallback();
102                fallback
103                    .diagnostics
104                    .record_warning(format!("xlsx reader error: {err}"));
105                Ok(fallback)
106            }
107        }
108    }
109}