memvid_core/reader/
xlsx.rs1use std::io::Cursor;
2
3use calamine::{DataType, Reader as CalamineReader, Xlsx};
4
5use super::xlsx_chunker::{XlsxChunkingOptions, chunk_workbook, generate_flat_text};
6use super::xlsx_ooxml::{OoxmlMetadata, parse_ooxml_metadata};
7use super::xlsx_table_detect::{CellValue, DetectedTable, SheetGrid, detect_tables};
8use crate::{
9 DocumentFormat, DocumentReader, PassthroughReader, ReaderDiagnostics, ReaderHint, ReaderOutput,
10 Result, types::structure::ChunkingResult,
11};
12
13pub struct XlsxStructuredResult {
15 pub text: String,
17 pub tables: Vec<DetectedTable>,
19 pub chunks: ChunkingResult,
21 pub metadata: OoxmlMetadata,
23 pub diagnostics: XlsxStructuredDiagnostics,
25}
26
/// Non-fatal findings gathered during structured XLSX extraction.
///
/// Derives `Debug`, `Clone` and `Default` so callers can log, copy, or start from
/// an empty diagnostics value without hand-writing the struct literal.
#[derive(Debug, Clone, Default)]
pub struct XlsxStructuredDiagnostics {
    /// Human-readable warnings, in the order they were recorded.
    pub warnings: Vec<String>,
}
31
/// Stateless reader for XLSX workbooks.
///
/// It carries no data, so it derives the usual value-type traits and can be
/// logged, copied, and default-constructed.
#[derive(Debug, Clone, Copy, Default)]
pub struct XlsxReader;
33
34impl XlsxReader {
35 fn build_grids(bytes: &[u8]) -> Result<Vec<SheetGrid>> {
37 let cursor = Cursor::new(bytes);
38 let mut workbook =
39 Xlsx::new(cursor).map_err(|err| crate::MemvidError::ExtractionFailed {
40 reason: format!("failed to read xlsx workbook: {err}").into(),
41 })?;
42
43 let sheet_names: Vec<String> = workbook.sheet_names().clone();
44 let mut grids = Vec::new();
45
46 for sheet_name in &sheet_names {
47 let Some(Ok(range)) = workbook.worksheet_range(sheet_name) else {
48 continue;
49 };
50
51 let mut grid = SheetGrid::new(sheet_name.clone());
52 #[allow(clippy::cast_possible_truncation)]
53 let num_rows = range.height() as u32;
54 #[allow(clippy::cast_possible_truncation)]
55 let num_cols = range.width() as u32;
56
57 for row in range.rows() {
58 let cells: Vec<CellValue> = row
59 .iter()
60 .map(|cell| match cell {
61 DataType::String(s) => CellValue::Text(s.clone()),
62 DataType::Float(v) => CellValue::Number(*v),
63 DataType::Int(v) => CellValue::Integer(*v),
64 DataType::Bool(b) => CellValue::Boolean(*b),
65 DataType::DateTime(v) => CellValue::Number(*v),
66 DataType::DateTimeIso(s) => CellValue::DateTime(s.clone()),
67 DataType::Duration(v) => CellValue::Number(*v),
68 DataType::DurationIso(s) => CellValue::Text(s.clone()),
69 DataType::Error(e) => CellValue::Error(format!("#{e:?}")),
70 DataType::Empty => CellValue::Empty,
71 })
72 .collect();
73 grid.rows.push(cells);
74 }
75
76 grid.num_rows = num_rows;
77 grid.num_cols = num_cols;
78 grids.push(grid);
79 }
80
81 Ok(grids)
82 }
83
84 pub fn extract_structured(bytes: &[u8]) -> Result<XlsxStructuredResult> {
86 Self::extract_structured_with_options(bytes, XlsxChunkingOptions::default())
87 }
88
89 pub fn extract_structured_with_options(
91 bytes: &[u8],
92 options: XlsxChunkingOptions,
93 ) -> Result<XlsxStructuredResult> {
94 let grids = Self::build_grids(bytes)?;
95 let metadata = parse_ooxml_metadata(bytes).unwrap_or_default();
96
97 let mut all_tables = Vec::new();
98 let mut warnings = Vec::new();
99
100 for grid in &grids {
101 let sheet_merged = metadata
102 .merged_regions
103 .get(&grid.sheet_name)
104 .cloned()
105 .unwrap_or_default();
106 let sheet_ooxml_tables: Vec<_> = metadata
107 .table_defs
108 .iter()
109 .filter(|t| t.sheet_name == grid.sheet_name)
110 .cloned()
111 .collect();
112
113 let tables = detect_tables(grid, &sheet_ooxml_tables, &sheet_merged);
114 if tables.is_empty() {
115 warnings.push(format!("No tables detected in sheet '{}'", grid.sheet_name));
116 }
117 all_tables.extend(tables);
118 }
119
120 let chunks = chunk_workbook(&grids, &all_tables, &metadata, &options);
121 let text = generate_flat_text(&grids, &all_tables, &metadata);
122
123 warnings.extend(chunks.warnings.iter().cloned());
125
126 Ok(XlsxStructuredResult {
127 text,
128 tables: all_tables,
129 chunks,
130 metadata,
131 diagnostics: XlsxStructuredDiagnostics { warnings },
132 })
133 }
134
135 fn extract_text(bytes: &[u8]) -> Result<String> {
136 let cursor = Cursor::new(bytes);
137 let mut workbook =
138 Xlsx::new(cursor).map_err(|err| crate::MemvidError::ExtractionFailed {
139 reason: format!("failed to read xlsx workbook: {err}").into(),
140 })?;
141
142 let mut out = String::new();
143 for sheet_name in workbook.sheet_names().clone() {
144 if let Some(Ok(range)) = workbook.worksheet_range(&sheet_name) {
145 if !out.is_empty() {
146 out.push('\n');
147 }
148 out.push_str(&format!("Sheet: {sheet_name}\n"));
149 for row in range.rows() {
150 let mut first_cell = true;
151 for cell in row {
152 if !first_cell {
153 out.push('\t');
154 }
155 first_cell = false;
156 match cell {
157 DataType::String(s) => out.push_str(s.trim()),
158 DataType::Float(v) => out.push_str(&format!("{v}")),
159 DataType::Int(v) => out.push_str(&format!("{v}")),
160 DataType::Bool(b) => out.push_str(if *b { "true" } else { "false" }),
161 DataType::Error(e) => out.push_str(&format!("#{e:?}")),
162 DataType::Empty => {}
163 DataType::DateTime(v) => out.push_str(&format!("{v}")),
164 DataType::DateTimeIso(s) => out.push_str(s),
165 DataType::Duration(v) => out.push_str(&format!("{v}")),
166 DataType::DurationIso(s) => out.push_str(s),
167 }
168 }
169 out.push('\n');
170 }
171 }
172 }
173
174 Ok(out.trim().to_string())
175 }
176}
177
178impl DocumentReader for XlsxReader {
179 fn name(&self) -> &'static str {
180 "xlsx"
181 }
182
183 fn supports(&self, hint: &ReaderHint<'_>) -> bool {
184 matches!(hint.format, Some(DocumentFormat::Xlsx))
185 || hint.mime.is_some_and(|mime| {
186 mime.eq_ignore_ascii_case(
187 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
188 )
189 })
190 }
191
192 fn extract(&self, bytes: &[u8], hint: &ReaderHint<'_>) -> Result<ReaderOutput> {
193 match Self::extract_text(bytes) {
194 Ok(text) => {
195 if text.trim().is_empty() {
196 let mut fallback = PassthroughReader.extract(bytes, hint)?;
198 fallback.reader_name = self.name().to_string();
199 fallback.diagnostics.mark_fallback();
200 fallback.diagnostics.record_warning(
201 "xlsx reader produced empty text; falling back to default extractor",
202 );
203 Ok(fallback)
204 } else {
205 let mut document = crate::ExtractedDocument::empty();
207 document.text = Some(text);
208 document.mime_type = Some(
209 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
210 .to_string(),
211 );
212 Ok(ReaderOutput::new(document, self.name())
213 .with_diagnostics(ReaderDiagnostics::default()))
214 }
215 }
216 Err(err) => {
217 let mut fallback = PassthroughReader.extract(bytes, hint)?;
219 fallback.reader_name = self.name().to_string();
220 fallback.diagnostics.mark_fallback();
221 fallback
222 .diagnostics
223 .record_warning(format!("xlsx reader error: {err}"));
224 Ok(fallback)
225 }
226 }
227 }
228}