memvid_core/reader/
xlsx.rs1use std::io::Cursor;
2
3use calamine::{DataType, Reader as CalamineReader, Xlsx};
4
5use crate::{
6 DocumentFormat, DocumentReader, PassthroughReader, ReaderDiagnostics, ReaderHint, ReaderOutput,
7 Result,
8};
9
/// Document reader that extracts worksheet contents from XLSX workbooks as
/// plain text.
///
/// The type carries no state; all behavior lives in its inherent helpers and
/// its `DocumentReader` implementation.
#[derive(Debug, Clone, Copy, Default)]
pub struct XlsxReader;
11
12impl XlsxReader {
13 fn extract_text(bytes: &[u8]) -> Result<String> {
14 let cursor = Cursor::new(bytes);
15 let mut workbook =
16 Xlsx::new(cursor).map_err(|err| crate::MemvidError::ExtractionFailed {
17 reason: format!("failed to read xlsx workbook: {err}").into(),
18 })?;
19
20 let mut out = String::new();
21 for sheet_name in workbook.sheet_names().to_owned() {
22 if let Some(Ok(range)) = workbook.worksheet_range(&sheet_name) {
23 if !out.is_empty() {
24 out.push_str("\n");
25 }
26 out.push_str(&format!("Sheet: {}\n", sheet_name));
27 for row in range.rows() {
28 let mut first_cell = true;
29 for cell in row {
30 if !first_cell {
31 out.push('\t');
32 }
33 first_cell = false;
34 match cell {
35 DataType::String(s) => out.push_str(s.trim()),
36 DataType::Float(v) => out.push_str(&format!("{}", v)),
37 DataType::Int(v) => out.push_str(&format!("{}", v)),
38 DataType::Bool(b) => out.push_str(if *b { "true" } else { "false" }),
39 DataType::Error(e) => out.push_str(&format!("#{:?}", e)),
40 DataType::Empty => {}
41 DataType::DateTime(v) => out.push_str(&format!("{}", v)),
42 DataType::DateTimeIso(s) => out.push_str(s),
43 DataType::Duration(v) => out.push_str(&format!("{}", v)),
44 DataType::DurationIso(s) => out.push_str(s),
45 }
46 }
47 out.push('\n');
48 }
49 }
50 }
51
52 Ok(out.trim().to_string())
53 }
54}
55
56impl DocumentReader for XlsxReader {
57 fn name(&self) -> &'static str {
58 "xlsx"
59 }
60
61 fn supports(&self, hint: &ReaderHint<'_>) -> bool {
62 matches!(hint.format, Some(DocumentFormat::Xlsx))
63 || hint
64 .mime
65 .map(|mime| {
66 mime.eq_ignore_ascii_case(
67 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
68 )
69 })
70 .unwrap_or(false)
71 }
72
73 fn extract(&self, bytes: &[u8], hint: &ReaderHint<'_>) -> Result<ReaderOutput> {
74 match Self::extract_text(bytes) {
75 Ok(text) => {
76 if text.trim().is_empty() {
77 let mut fallback = PassthroughReader.extract(bytes, hint)?;
79 fallback.reader_name = self.name().to_string();
80 fallback.diagnostics.mark_fallback();
81 fallback.diagnostics.record_warning(
82 "xlsx reader produced empty text; falling back to default extractor",
83 );
84 Ok(fallback)
85 } else {
86 let mut document = crate::ExtractedDocument::empty();
88 document.text = Some(text);
89 document.mime_type = Some(
90 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
91 .to_string(),
92 );
93 Ok(ReaderOutput::new(document, self.name())
94 .with_diagnostics(ReaderDiagnostics::default()))
95 }
96 }
97 Err(err) => {
98 let mut fallback = PassthroughReader.extract(bytes, hint)?;
100 fallback.reader_name = self.name().to_string();
101 fallback.diagnostics.mark_fallback();
102 fallback
103 .diagnostics
104 .record_warning(format!("xlsx reader error: {err}"));
105 Ok(fallback)
106 }
107 }
108 }
109}