Skip to main content

memvid_core/reader/
xlsx_chunker.rs

1//! Row-aligned semantic chunking for XLSX spreadsheets.
2//!
3//! Produces structure-aware chunks that:
4//! - Never split a row across chunks
5//! - Prefix every chunk with sheet/table context and header row
6//! - Format rows as `Header: Value | Header: Value` for search accuracy
7//! - Skip empty cells for compact output
8#![allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
9
10use crate::types::structure::{ChunkingResult, StructuredChunk};
11
12use super::xlsx_ooxml::{
13    NumFmtKind, OoxmlMetadata, excel_serial_to_iso, format_currency, format_percentage,
14};
15use super::xlsx_table_detect::{CellValue, DetectedTable, SheetGrid};
16
17/// Default target chunk size in characters.
18const DEFAULT_MAX_CHUNK_CHARS: usize = 1200;
19
20/// Maximum number of chunks to produce from a single workbook.
21const MAX_SPREADSHEET_CHUNKS: usize = 500;
22
23/// Options for XLSX semantic chunking.
24#[derive(Debug, Clone)]
25pub struct XlsxChunkingOptions {
26    pub max_chars: usize,
27    pub max_chunks: usize,
28}
29
30impl Default for XlsxChunkingOptions {
31    fn default() -> Self {
32        Self {
33            max_chars: DEFAULT_MAX_CHUNK_CHARS,
34            max_chunks: MAX_SPREADSHEET_CHUNKS,
35        }
36    }
37}
38
39/// Format a cell value using OOXML metadata for type-aware rendering.
40#[must_use]
41pub fn format_cell_value(
42    cell: &CellValue,
43    fmt_kind: NumFmtKind,
44    _metadata: &OoxmlMetadata,
45) -> String {
46    match (cell, fmt_kind) {
47        (CellValue::Empty, _) => String::new(),
48        (CellValue::Text(s), _) => s.trim().to_string(),
49        (CellValue::Number(v), NumFmtKind::Date | NumFmtKind::DateTime) => {
50            excel_serial_to_iso(*v).unwrap_or_else(|| format!("{v}"))
51        }
52        (CellValue::Number(v), NumFmtKind::Percentage) => format_percentage(*v),
53        (CellValue::Number(v), NumFmtKind::Currency) => format_currency(*v, "$"),
54        (CellValue::Number(v), _) => {
55            // Clean up float display — use integer format if no fractional part
56            if (v.fract()).abs() < 1e-10 {
57                format!("{}", *v as i64)
58            } else {
59                format!("{v}")
60            }
61        }
62        (CellValue::Integer(v), NumFmtKind::Date | NumFmtKind::DateTime) => {
63            excel_serial_to_iso(*v as f64).unwrap_or_else(|| format!("{v}"))
64        }
65        (CellValue::Integer(v), NumFmtKind::Percentage) => format_percentage(*v as f64),
66        (CellValue::Integer(v), NumFmtKind::Currency) => format_currency(*v as f64, "$"),
67        (CellValue::Integer(v), _) => format!("{v}"),
68        (CellValue::Boolean(b), _) => if *b { "true" } else { "false" }.to_string(),
69        (CellValue::DateTime(s), _) => s.clone(),
70        (CellValue::Error(s), _) => s.clone(),
71    }
72}
73
74/// Format a single row as `Header: Value | Header: Value`, skipping empty cells.
75fn format_row_with_headers(
76    grid: &SheetGrid,
77    row_idx: u32,
78    headers: &[String],
79    first_col: u32,
80    last_col: u32,
81    metadata: &OoxmlMetadata,
82) -> String {
83    let mut parts = Vec::new();
84
85    for col in first_col..=last_col {
86        let cell = grid.cell(row_idx, col);
87        if cell.is_empty() {
88            continue;
89        }
90
91        let fmt_kind = grid.num_fmt(row_idx, col);
92        let formatted = format_cell_value(cell, fmt_kind, metadata);
93        if formatted.is_empty() {
94            continue;
95        }
96
97        let col_offset = (col - first_col) as usize;
98        let header = headers.get(col_offset).filter(|h| !h.is_empty()).cloned();
99
100        if let Some(h) = header {
101            parts.push(format!("{h}: {formatted}"));
102        } else {
103            parts.push(formatted);
104        }
105    }
106
107    parts.join(" | ")
108}
109
110/// Build a context prefix for a chunk: `[Sheet: X] [Table: Y]`
111fn build_context_prefix(sheet_name: &str, table_name: &str) -> String {
112    format!("[Sheet: {sheet_name}] [Table: {table_name}]")
113}
114
115/// Build a header line: `Header1 | Header2 | Header3`
116fn build_header_line(headers: &[String]) -> String {
117    let nonempty: Vec<&str> = headers
118        .iter()
119        .map(String::as_str)
120        .filter(|h| !h.is_empty())
121        .collect();
122    if nonempty.is_empty() {
123        String::new()
124    } else {
125        nonempty.join(" | ")
126    }
127}
128
129/// Chunk a single detected table into structure-aware chunks.
130fn chunk_table(
131    grid: &SheetGrid,
132    table: &DetectedTable,
133    metadata: &OoxmlMetadata,
134    options: &XlsxChunkingOptions,
135    chunk_index_start: usize,
136) -> Vec<StructuredChunk> {
137    let context_prefix = build_context_prefix(&table.sheet_name, &table.name);
138    let header_line = build_header_line(&table.headers);
139
140    // Build the fixed prefix that goes into every chunk
141    let fixed_prefix = if header_line.is_empty() {
142        format!("{context_prefix}\n")
143    } else {
144        format!("{context_prefix}\n{header_line}\n")
145    };
146    let prefix_len = fixed_prefix.len();
147
148    // Format all data rows
149    let mut formatted_rows: Vec<String> = Vec::new();
150    for row_idx in table.first_data_row..=table.last_data_row {
151        let line = format_row_with_headers(
152            grid,
153            row_idx,
154            &table.headers,
155            table.first_col,
156            table.last_col,
157            metadata,
158        );
159        if !line.is_empty() {
160            formatted_rows.push(line);
161        }
162    }
163
164    if formatted_rows.is_empty() {
165        return Vec::new();
166    }
167
168    // Bin-pack rows into chunks, respecting max_chars
169    let mut chunks = Vec::new();
170    let mut current_rows: Vec<String> = Vec::new();
171    let mut current_len = prefix_len;
172
173    for row_text in &formatted_rows {
174        let row_len = row_text.len() + 1; // +1 for newline
175
176        if !current_rows.is_empty() && current_len + row_len > options.max_chars {
177            // Emit current chunk
178            let text = format!("{fixed_prefix}{}", current_rows.join("\n"));
179            chunks.push(text);
180            current_rows.clear();
181            current_len = prefix_len;
182        }
183
184        current_rows.push(row_text.clone());
185        current_len += row_len;
186    }
187
188    // Emit final chunk
189    if !current_rows.is_empty() {
190        let text = format!("{fixed_prefix}{}", current_rows.join("\n"));
191        chunks.push(text);
192    }
193
194    // Convert to StructuredChunk
195    let total_parts = chunks.len() as u32;
196    let table_id = format!("{}:{}", table.sheet_name, table.name);
197
198    chunks
199        .into_iter()
200        .enumerate()
201        .map(|(i, text)| {
202            let char_count = text.len();
203            let idx = chunk_index_start + i;
204
205            if total_parts == 1 {
206                StructuredChunk::table(text, idx, &table_id, 0, char_count)
207            } else {
208                StructuredChunk::table_continuation(
209                    text,
210                    idx,
211                    &table_id,
212                    (i + 1) as u32,
213                    total_parts,
214                    &fixed_prefix,
215                    0,
216                    char_count,
217                )
218            }
219        })
220        .collect()
221}
222
223/// Chunk an entire workbook's detected tables into structured chunks.
224#[must_use]
225pub fn chunk_workbook(
226    grids: &[SheetGrid],
227    tables: &[DetectedTable],
228    metadata: &OoxmlMetadata,
229    options: &XlsxChunkingOptions,
230) -> ChunkingResult {
231    let mut result = ChunkingResult::empty();
232    let mut chunk_index = 0;
233
234    for table in tables {
235        // Find the grid for this table's sheet
236        let Some(grid) = grids.iter().find(|g| g.sheet_name == table.sheet_name) else {
237            result.warn(format!(
238                "No grid found for sheet '{}', skipping table '{}'",
239                table.sheet_name, table.name
240            ));
241            continue;
242        };
243
244        let table_chunks = chunk_table(grid, table, metadata, options, chunk_index);
245
246        if table_chunks.len() > 1 {
247            result.tables_split += 1;
248        }
249        result.tables_processed += 1;
250        chunk_index += table_chunks.len();
251        result.chunks.extend(table_chunks);
252
253        // Respect global chunk limit
254        if result.chunks.len() >= options.max_chunks {
255            result.warn(format!(
256                "Hit max chunk limit ({}) — remaining tables skipped",
257                options.max_chunks
258            ));
259            result.chunks.truncate(options.max_chunks);
260            break;
261        }
262    }
263
264    result
265}
266
267/// Generate backward-compatible flat text from grids (for `ReaderOutput.document.text`).
268#[must_use]
269pub fn generate_flat_text(
270    grids: &[SheetGrid],
271    tables: &[DetectedTable],
272    metadata: &OoxmlMetadata,
273) -> String {
274    let mut out = String::new();
275
276    for table in tables {
277        let grid = match grids.iter().find(|g| g.sheet_name == table.sheet_name) {
278            Some(g) => g,
279            None => continue,
280        };
281
282        if !out.is_empty() {
283            out.push('\n');
284        }
285        out.push_str(&format!("Sheet: {}\n", table.sheet_name));
286
287        // Header line
288        if !table.headers.is_empty() {
289            let header_line = table
290                .headers
291                .iter()
292                .filter(|h| !h.is_empty())
293                .cloned()
294                .collect::<Vec<_>>()
295                .join(" | ");
296            if !header_line.is_empty() {
297                out.push_str(&header_line);
298                out.push('\n');
299            }
300        }
301
302        // Data rows
303        for row_idx in table.first_data_row..=table.last_data_row {
304            let line = format_row_with_headers(
305                grid,
306                row_idx,
307                &table.headers,
308                table.first_col,
309                table.last_col,
310                metadata,
311            );
312            if !line.is_empty() {
313                out.push_str(&line);
314                out.push('\n');
315            }
316        }
317    }
318
319    out.trim().to_string()
320}
321
322#[cfg(test)]
323mod tests {
324    use super::*;
325    use crate::reader::xlsx_table_detect::SheetGrid;
326    use crate::types::structure::ChunkType;
327
328    fn make_grid(data: Vec<Vec<CellValue>>, sheet_name: &str) -> SheetGrid {
329        let num_rows = data.len() as u32;
330        let num_cols = data.iter().map(|r| r.len()).max().unwrap_or(0) as u32;
331        SheetGrid {
332            sheet_name: sheet_name.to_string(),
333            rows: data,
334            num_fmt_kinds: Vec::new(),
335            num_rows,
336            num_cols,
337        }
338    }
339
340    #[test]
341    fn test_format_cell_value_date() {
342        let metadata = OoxmlMetadata::default();
343        let cell = CellValue::Number(44927.0);
344        let result = format_cell_value(&cell, NumFmtKind::Date, &metadata);
345        assert_eq!(result, "2023-01-01");
346    }
347
348    #[test]
349    fn test_format_cell_value_percentage() {
350        let metadata = OoxmlMetadata::default();
351        let cell = CellValue::Number(0.153);
352        let result = format_cell_value(&cell, NumFmtKind::Percentage, &metadata);
353        assert_eq!(result, "15.3%");
354    }
355
356    #[test]
357    fn test_format_cell_value_currency() {
358        let metadata = OoxmlMetadata::default();
359        let cell = CellValue::Number(1234.56);
360        let result = format_cell_value(&cell, NumFmtKind::Currency, &metadata);
361        assert_eq!(result, "$1234.56");
362    }
363
364    #[test]
365    fn test_format_row_with_headers() {
366        let grid = make_grid(
367            vec![vec![
368                CellValue::Text("Alice".into()),
369                CellValue::Integer(30),
370                CellValue::Text("Austin".into()),
371            ]],
372            "Sheet1",
373        );
374        let metadata = OoxmlMetadata::default();
375        let headers = vec!["Name".to_string(), "Age".to_string(), "City".to_string()];
376
377        let result = format_row_with_headers(&grid, 0, &headers, 0, 2, &metadata);
378        assert_eq!(result, "Name: Alice | Age: 30 | City: Austin");
379    }
380
381    #[test]
382    fn test_format_row_skips_empty() {
383        let grid = make_grid(
384            vec![vec![
385                CellValue::Text("Alice".into()),
386                CellValue::Empty,
387                CellValue::Text("Austin".into()),
388            ]],
389            "Sheet1",
390        );
391        let metadata = OoxmlMetadata::default();
392        let headers = vec!["Name".to_string(), "Age".to_string(), "City".to_string()];
393
394        let result = format_row_with_headers(&grid, 0, &headers, 0, 2, &metadata);
395        assert_eq!(result, "Name: Alice | City: Austin");
396    }
397
398    #[test]
399    fn test_chunk_table_single_chunk() {
400        let grid = make_grid(
401            vec![
402                vec![
403                    CellValue::Text("Name".into()),
404                    CellValue::Text("Value".into()),
405                ],
406                vec![CellValue::Text("A".into()), CellValue::Integer(100)],
407                vec![CellValue::Text("B".into()), CellValue::Integer(200)],
408            ],
409            "Sheet1",
410        );
411        let metadata = OoxmlMetadata::default();
412        let table = DetectedTable {
413            name: "Revenue".to_string(),
414            sheet_name: "Sheet1".to_string(),
415            headers: vec!["Name".to_string(), "Value".to_string()],
416            column_types: vec![],
417            first_data_row: 1,
418            last_data_row: 2,
419            first_col: 0,
420            last_col: 1,
421            header_row: Some(0),
422            confidence: 0.7,
423        };
424
425        let options = XlsxChunkingOptions::default();
426        let chunks = chunk_table(&grid, &table, &metadata, &options, 0);
427
428        assert_eq!(chunks.len(), 1);
429        let text = &chunks[0].text;
430        assert!(text.contains("[Sheet: Sheet1] [Table: Revenue]"));
431        assert!(text.contains("Name | Value"));
432        assert!(text.contains("Name: A | Value: 100"));
433        assert!(text.contains("Name: B | Value: 200"));
434        assert_eq!(chunks[0].chunk_type, ChunkType::Table);
435    }
436
437    #[test]
438    fn test_chunk_table_splits_large() {
439        let mut rows = vec![vec![
440            CellValue::Text("Col1".into()),
441            CellValue::Text("Col2".into()),
442        ]];
443        // Add 50 data rows to exceed a small chunk limit
444        for i in 0..50 {
445            rows.push(vec![
446                CellValue::Text(format!("Row{i} long text that takes up space in the chunk")),
447                CellValue::Integer(i as i64 * 1000),
448            ]);
449        }
450
451        let grid = make_grid(rows, "Sheet1");
452        let metadata = OoxmlMetadata::default();
453        let table = DetectedTable {
454            name: "Data".to_string(),
455            sheet_name: "Sheet1".to_string(),
456            headers: vec!["Col1".to_string(), "Col2".to_string()],
457            column_types: vec![],
458            first_data_row: 1,
459            last_data_row: 50,
460            first_col: 0,
461            last_col: 1,
462            header_row: Some(0),
463            confidence: 0.7,
464        };
465
466        let options = XlsxChunkingOptions {
467            max_chars: 300,
468            max_chunks: 100,
469        };
470        let chunks = chunk_table(&grid, &table, &metadata, &options, 0);
471
472        assert!(chunks.len() > 1, "Should split into multiple chunks");
473        // Every chunk should have the header context
474        for chunk in &chunks {
475            assert!(chunk.text.contains("[Sheet: Sheet1]"));
476            assert!(chunk.text.contains("Col1 | Col2"));
477            assert_eq!(chunk.chunk_type, ChunkType::TableContinuation);
478        }
479    }
480
481    #[test]
482    fn test_generate_flat_text() {
483        let grid = make_grid(
484            vec![
485                vec![
486                    CellValue::Text("Name".into()),
487                    CellValue::Text("Score".into()),
488                ],
489                vec![CellValue::Text("Alice".into()), CellValue::Integer(95)],
490            ],
491            "Results",
492        );
493        let metadata = OoxmlMetadata::default();
494        let table = DetectedTable {
495            name: "Scores".to_string(),
496            sheet_name: "Results".to_string(),
497            headers: vec!["Name".to_string(), "Score".to_string()],
498            column_types: vec![],
499            first_data_row: 1,
500            last_data_row: 1,
501            first_col: 0,
502            last_col: 1,
503            header_row: Some(0),
504            confidence: 0.7,
505        };
506
507        let text = generate_flat_text(&[grid], &[table], &metadata);
508        assert!(text.contains("Sheet: Results"));
509        assert!(text.contains("Name | Score"));
510        assert!(text.contains("Name: Alice | Score: 95"));
511    }
512}