Skip to main content

memvid_core/reader/
xlsx.rs

1use std::io::Cursor;
2
3use calamine::{DataType, Reader as CalamineReader, Xlsx};
4
5use super::xlsx_chunker::{XlsxChunkingOptions, chunk_workbook, generate_flat_text};
6use super::xlsx_ooxml::{OoxmlMetadata, parse_ooxml_metadata};
7use super::xlsx_table_detect::{CellValue, DetectedTable, SheetGrid, detect_tables};
8use crate::{
9    DocumentFormat, DocumentReader, PassthroughReader, ReaderDiagnostics, ReaderHint, ReaderOutput,
10    Result, types::structure::ChunkingResult,
11};
12
/// Result of the structured XLSX extraction pipeline.
///
/// Returned by [`XlsxReader::extract_structured`] and
/// [`XlsxReader::extract_structured_with_options`]. It bundles every artifact
/// computed for a workbook so callers can pick the representation they need.
pub struct XlsxStructuredResult {
    /// Backward-compatible flat text (the output of `generate_flat_text`).
    pub text: String,
    /// Detected tables with metadata, collected across all sheets in the
    /// order the sheets were processed.
    pub tables: Vec<DetectedTable>,
    /// Semantic chunks with header-value pairing (the output of `chunk_workbook`).
    pub chunks: ChunkingResult,
    /// OOXML metadata (number formats, merged regions, etc.).
    /// Holds the type's default value when the metadata could not be parsed.
    pub metadata: OoxmlMetadata,
    /// Extraction diagnostics.
    pub diagnostics: XlsxStructuredDiagnostics,
}
26
/// Diagnostics from structured extraction.
///
/// Derives the common value-type traits so diagnostics can be logged,
/// cloned, compared in tests, and default-constructed (no warnings).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct XlsxStructuredDiagnostics {
    /// Human-readable, non-fatal warnings gathered while extracting the workbook.
    pub warnings: Vec<String>,
}
31
/// Document reader for XLSX workbooks.
///
/// The type carries no state; all extraction entry points are associated
/// functions or trait methods that work purely on the bytes handed to them.
#[derive(Debug, Clone, Copy, Default)]
pub struct XlsxReader;
33
34impl XlsxReader {
35    /// Build `SheetGrid`s from raw XLSX bytes using calamine.
36    fn build_grids(bytes: &[u8]) -> Result<Vec<SheetGrid>> {
37        let cursor = Cursor::new(bytes);
38        let mut workbook =
39            Xlsx::new(cursor).map_err(|err| crate::MemvidError::ExtractionFailed {
40                reason: format!("failed to read xlsx workbook: {err}").into(),
41            })?;
42
43        let sheet_names: Vec<String> = workbook.sheet_names().clone();
44        let mut grids = Vec::new();
45
46        for sheet_name in &sheet_names {
47            let Some(Ok(range)) = workbook.worksheet_range(sheet_name) else {
48                continue;
49            };
50
51            let mut grid = SheetGrid::new(sheet_name.clone());
52            #[allow(clippy::cast_possible_truncation)]
53            let num_rows = range.height() as u32;
54            #[allow(clippy::cast_possible_truncation)]
55            let num_cols = range.width() as u32;
56
57            for row in range.rows() {
58                let cells: Vec<CellValue> = row
59                    .iter()
60                    .map(|cell| match cell {
61                        DataType::String(s) => CellValue::Text(s.clone()),
62                        DataType::Float(v) => CellValue::Number(*v),
63                        DataType::Int(v) => CellValue::Integer(*v),
64                        DataType::Bool(b) => CellValue::Boolean(*b),
65                        DataType::DateTime(v) => CellValue::Number(*v),
66                        DataType::DateTimeIso(s) => CellValue::DateTime(s.clone()),
67                        DataType::Duration(v) => CellValue::Number(*v),
68                        DataType::DurationIso(s) => CellValue::Text(s.clone()),
69                        DataType::Error(e) => CellValue::Error(format!("#{e:?}")),
70                        DataType::Empty => CellValue::Empty,
71                    })
72                    .collect();
73                grid.rows.push(cells);
74            }
75
76            grid.num_rows = num_rows;
77            grid.num_cols = num_cols;
78            grids.push(grid);
79        }
80
81        Ok(grids)
82    }
83
84    /// Extract structured data from XLSX bytes with default options.
85    pub fn extract_structured(bytes: &[u8]) -> Result<XlsxStructuredResult> {
86        Self::extract_structured_with_options(bytes, XlsxChunkingOptions::default())
87    }
88
89    /// Extract structured data from XLSX bytes with custom chunking options.
90    pub fn extract_structured_with_options(
91        bytes: &[u8],
92        options: XlsxChunkingOptions,
93    ) -> Result<XlsxStructuredResult> {
94        let grids = Self::build_grids(bytes)?;
95        let metadata = parse_ooxml_metadata(bytes).unwrap_or_default();
96
97        let mut all_tables = Vec::new();
98        let mut warnings = Vec::new();
99
100        for grid in &grids {
101            let sheet_merged = metadata
102                .merged_regions
103                .get(&grid.sheet_name)
104                .cloned()
105                .unwrap_or_default();
106            let sheet_ooxml_tables: Vec<_> = metadata
107                .table_defs
108                .iter()
109                .filter(|t| t.sheet_name == grid.sheet_name)
110                .cloned()
111                .collect();
112
113            let tables = detect_tables(grid, &sheet_ooxml_tables, &sheet_merged);
114            if tables.is_empty() {
115                warnings.push(format!("No tables detected in sheet '{}'", grid.sheet_name));
116            }
117            all_tables.extend(tables);
118        }
119
120        let chunks = chunk_workbook(&grids, &all_tables, &metadata, &options);
121        let text = generate_flat_text(&grids, &all_tables, &metadata);
122
123        // Merge chunker warnings
124        warnings.extend(chunks.warnings.iter().cloned());
125
126        Ok(XlsxStructuredResult {
127            text,
128            tables: all_tables,
129            chunks,
130            metadata,
131            diagnostics: XlsxStructuredDiagnostics { warnings },
132        })
133    }
134
135    fn extract_text(bytes: &[u8]) -> Result<String> {
136        let cursor = Cursor::new(bytes);
137        let mut workbook =
138            Xlsx::new(cursor).map_err(|err| crate::MemvidError::ExtractionFailed {
139                reason: format!("failed to read xlsx workbook: {err}").into(),
140            })?;
141
142        let mut out = String::new();
143        for sheet_name in workbook.sheet_names().clone() {
144            if let Some(Ok(range)) = workbook.worksheet_range(&sheet_name) {
145                if !out.is_empty() {
146                    out.push('\n');
147                }
148                out.push_str(&format!("Sheet: {sheet_name}\n"));
149                for row in range.rows() {
150                    let mut first_cell = true;
151                    for cell in row {
152                        if !first_cell {
153                            out.push('\t');
154                        }
155                        first_cell = false;
156                        match cell {
157                            DataType::String(s) => out.push_str(s.trim()),
158                            DataType::Float(v) => out.push_str(&format!("{v}")),
159                            DataType::Int(v) => out.push_str(&format!("{v}")),
160                            DataType::Bool(b) => out.push_str(if *b { "true" } else { "false" }),
161                            DataType::Error(e) => out.push_str(&format!("#{e:?}")),
162                            DataType::Empty => {}
163                            DataType::DateTime(v) => out.push_str(&format!("{v}")),
164                            DataType::DateTimeIso(s) => out.push_str(s),
165                            DataType::Duration(v) => out.push_str(&format!("{v}")),
166                            DataType::DurationIso(s) => out.push_str(s),
167                        }
168                    }
169                    out.push('\n');
170                }
171            }
172        }
173
174        Ok(out.trim().to_string())
175    }
176}
177
178impl DocumentReader for XlsxReader {
179    fn name(&self) -> &'static str {
180        "xlsx"
181    }
182
183    fn supports(&self, hint: &ReaderHint<'_>) -> bool {
184        matches!(hint.format, Some(DocumentFormat::Xlsx))
185            || hint.mime.is_some_and(|mime| {
186                mime.eq_ignore_ascii_case(
187                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
188                )
189            })
190    }
191
192    fn extract(&self, bytes: &[u8], hint: &ReaderHint<'_>) -> Result<ReaderOutput> {
193        match Self::extract_text(bytes) {
194            Ok(text) => {
195                if text.trim().is_empty() {
196                    // Calamine returned empty - try extractous as fallback
197                    let mut fallback = PassthroughReader.extract(bytes, hint)?;
198                    fallback.reader_name = self.name().to_string();
199                    fallback.diagnostics.mark_fallback();
200                    fallback.diagnostics.record_warning(
201                        "xlsx reader produced empty text; falling back to default extractor",
202                    );
203                    Ok(fallback)
204                } else {
205                    // Calamine succeeded - build output directly WITHOUT calling extractous
206                    let mut document = crate::ExtractedDocument::empty();
207                    document.text = Some(text);
208                    document.mime_type = Some(
209                        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
210                            .to_string(),
211                    );
212                    Ok(ReaderOutput::new(document, self.name())
213                        .with_diagnostics(ReaderDiagnostics::default()))
214                }
215            }
216            Err(err) => {
217                // Calamine failed - try extractous as fallback
218                let mut fallback = PassthroughReader.extract(bytes, hint)?;
219                fallback.reader_name = self.name().to_string();
220                fallback.diagnostics.mark_fallback();
221                fallback
222                    .diagnostics
223                    .record_warning(format!("xlsx reader error: {err}"));
224                Ok(fallback)
225            }
226        }
227    }
228}