Skip to main content

memvid_core/reader/
xlsx_table_detect.rs

1//! Table structure detection for XLSX sheets.
2//!
3//! Detects header rows, table boundaries, and column types for sheets
4//! not covered by OOXML table definitions.
5#![allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
6
7use serde::{Deserialize, Serialize};
8
9use super::xlsx_ooxml::{MergedRegion, NumFmtKind, TableDefinition};
10
11/// Column type inferred from data sampling.
12#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
13#[serde(rename_all = "snake_case")]
14pub enum ColumnType {
15    #[default]
16    Text,
17    Integer,
18    Float,
19    Date,
20    DateTime,
21    Time,
22    Currency,
23    Percentage,
24    Boolean,
25    Mixed,
26    Empty,
27}
28
29/// A detected table within a sheet.
30#[derive(Debug, Clone, Serialize, Deserialize)]
31pub struct DetectedTable {
32    /// Table name (from OOXML or auto-generated)
33    pub name: String,
34    /// Sheet name this table belongs to
35    pub sheet_name: String,
36    /// Column headers (may be empty if no header detected)
37    pub headers: Vec<String>,
38    /// Column types inferred from data
39    pub column_types: Vec<ColumnType>,
40    /// First data row (0-based, the row after the header)
41    pub first_data_row: u32,
42    /// Last data row (inclusive, 0-based)
43    pub last_data_row: u32,
44    /// First column (0-based)
45    pub first_col: u32,
46    /// Last column (inclusive, 0-based)
47    pub last_col: u32,
48    /// Header row index (0-based), None if no header detected
49    pub header_row: Option<u32>,
50    /// Detection confidence (0.0 - 1.0)
51    pub confidence: f64,
52}
53
54/// A cell value representation for detection purposes.
55#[derive(Debug, Clone)]
56pub enum CellValue {
57    Empty,
58    Text(String),
59    Number(f64),
60    Integer(i64),
61    Boolean(bool),
62    DateTime(String),
63    Error(String),
64}
65
66impl CellValue {
67    #[must_use]
68    pub fn is_empty(&self) -> bool {
69        matches!(self, Self::Empty)
70    }
71
72    #[must_use]
73    pub fn is_text(&self) -> bool {
74        matches!(self, Self::Text(_))
75    }
76
77    #[must_use]
78    pub fn is_numeric(&self) -> bool {
79        matches!(self, Self::Number(_) | Self::Integer(_))
80    }
81
82    #[must_use]
83    pub fn as_text(&self) -> String {
84        match self {
85            Self::Empty => String::new(),
86            Self::Text(s) => s.clone(),
87            Self::Number(v) => format!("{v}"),
88            Self::Integer(v) => format!("{v}"),
89            Self::Boolean(b) => if *b { "true" } else { "false" }.to_string(),
90            Self::DateTime(s) => s.clone(),
91            Self::Error(s) => s.clone(),
92        }
93    }
94}
95
96/// A grid of cell values representing one sheet.
97pub struct SheetGrid {
98    pub sheet_name: String,
99    pub rows: Vec<Vec<CellValue>>,
100    /// Number format kinds per cell (row, col) if available from OOXML metadata.
101    /// Outer vec is rows, inner is columns.
102    pub num_fmt_kinds: Vec<Vec<NumFmtKind>>,
103    pub num_rows: u32,
104    pub num_cols: u32,
105}
106
107impl SheetGrid {
108    #[must_use]
109    pub fn new(sheet_name: String) -> Self {
110        Self {
111            sheet_name,
112            rows: Vec::new(),
113            num_fmt_kinds: Vec::new(),
114            num_rows: 0,
115            num_cols: 0,
116        }
117    }
118
119    /// Get cell value at (row, col). Returns Empty if out of bounds.
120    #[must_use]
121    pub fn cell(&self, row: u32, col: u32) -> &CellValue {
122        static EMPTY: CellValue = CellValue::Empty;
123        self.rows
124            .get(row as usize)
125            .and_then(|r| r.get(col as usize))
126            .unwrap_or(&EMPTY)
127    }
128
129    /// Get number format kind at (row, col). Returns General if not available.
130    #[must_use]
131    pub fn num_fmt(&self, row: u32, col: u32) -> NumFmtKind {
132        self.num_fmt_kinds
133            .get(row as usize)
134            .and_then(|r| r.get(col as usize))
135            .copied()
136            .unwrap_or(NumFmtKind::General)
137    }
138
139    /// Check if a row is entirely empty.
140    #[must_use]
141    pub fn is_row_empty(&self, row: u32) -> bool {
142        if let Some(r) = self.rows.get(row as usize) {
143            r.iter().all(CellValue::is_empty)
144        } else {
145            true
146        }
147    }
148
149    /// Count non-empty cells in a row.
150    #[must_use]
151    pub fn row_nonempty_count(&self, row: u32) -> usize {
152        if let Some(r) = self.rows.get(row as usize) {
153            r.iter().filter(|c| !c.is_empty()).count()
154        } else {
155            0
156        }
157    }
158}
159
160/// Detect tables within a sheet grid.
161///
162/// Uses cascading heuristics:
163/// 1. OOXML table definitions (confidence 1.0)
164/// 2. All-text row + typed data below (0.7)
165/// 3. Type consistency boost (+0.15)
166/// 4. First non-empty row fallback (0.4)
167#[must_use]
168pub fn detect_tables(
169    grid: &SheetGrid,
170    ooxml_tables: &[TableDefinition],
171    merged_regions: &[MergedRegion],
172) -> Vec<DetectedTable> {
173    let mut tables = Vec::new();
174
175    // Phase 1: Use OOXML table definitions for this sheet
176    for tdef in ooxml_tables {
177        if tdef.sheet_name == grid.sheet_name {
178            let column_types = infer_column_types(
179                grid,
180                tdef.first_row + 1,
181                tdef.last_row,
182                tdef.first_col,
183                tdef.last_col,
184            );
185            tables.push(DetectedTable {
186                name: tdef.name.clone(),
187                sheet_name: grid.sheet_name.clone(),
188                headers: tdef.headers.clone(),
189                column_types,
190                first_data_row: tdef.first_row + 1,
191                last_data_row: tdef.last_row,
192                first_col: tdef.first_col,
193                last_col: tdef.last_col,
194                header_row: Some(tdef.first_row),
195                confidence: 1.0,
196            });
197        }
198    }
199
200    // If OOXML tables covered the whole sheet, we're done
201    if !tables.is_empty() {
202        return tables;
203    }
204
205    // Phase 2: Heuristic detection — find table boundaries
206    let table_ranges = find_table_boundaries(grid, merged_regions);
207    let mut table_idx = 0;
208
209    for (start_row, end_row, start_col, end_col) in table_ranges {
210        let (header_row, headers, confidence) =
211            detect_header(grid, start_row, end_row, start_col, end_col);
212
213        let first_data_row = header_row.map_or(start_row, |hr| hr + 1);
214        let column_types = infer_column_types(grid, first_data_row, end_row, start_col, end_col);
215
216        // Boost confidence if column types are consistent
217        let type_boost = if column_types
218            .iter()
219            .filter(|t| **t != ColumnType::Mixed && **t != ColumnType::Empty)
220            .count()
221            > column_types.len() / 2
222        {
223            0.15
224        } else {
225            0.0
226        };
227
228        table_idx += 1;
229        tables.push(DetectedTable {
230            name: format!("Table{table_idx}"),
231            sheet_name: grid.sheet_name.clone(),
232            headers,
233            column_types,
234            first_data_row,
235            last_data_row: end_row,
236            first_col: start_col,
237            last_col: end_col,
238            header_row,
239            confidence: (confidence + type_boost).min(1.0),
240        });
241    }
242
243    tables
244}
245
246/// Find table boundaries by detecting gaps (2+ consecutive empty rows/cols).
247fn find_table_boundaries(
248    grid: &SheetGrid,
249    _merged_regions: &[MergedRegion],
250) -> Vec<(u32, u32, u32, u32)> {
251    if grid.num_rows == 0 || grid.num_cols == 0 {
252        return Vec::new();
253    }
254
255    // Find vertical boundaries (consecutive empty rows split tables)
256    let mut row_groups: Vec<(u32, u32)> = Vec::new();
257    let mut current_start: Option<u32> = None;
258    let mut empty_streak = 0u32;
259
260    for row in 0..grid.num_rows {
261        if grid.is_row_empty(row) {
262            empty_streak += 1;
263            if empty_streak >= 2 {
264                if let Some(start) = current_start.take() {
265                    let end = row.saturating_sub(empty_streak);
266                    if end >= start {
267                        row_groups.push((start, end));
268                    }
269                }
270            }
271        } else {
272            if current_start.is_none() {
273                current_start = Some(row);
274            }
275            empty_streak = 0;
276        }
277    }
278    // Close the last group
279    if let Some(start) = current_start {
280        row_groups.push((start, grid.num_rows.saturating_sub(1)));
281    }
282
283    // For each row group, find column boundaries
284    let mut boundaries = Vec::new();
285    for (start_row, end_row) in row_groups {
286        let col_ranges = find_column_boundaries(grid, start_row, end_row);
287        for (start_col, end_col) in col_ranges {
288            boundaries.push((start_row, end_row, start_col, end_col));
289        }
290    }
291
292    // Fallback: if no boundaries detected, treat entire used area as one table
293    if boundaries.is_empty() && grid.num_rows > 0 {
294        boundaries.push((
295            0,
296            grid.num_rows.saturating_sub(1),
297            0,
298            grid.num_cols.saturating_sub(1),
299        ));
300    }
301
302    boundaries
303}
304
305/// Find horizontal table boundaries within a row range.
306fn find_column_boundaries(grid: &SheetGrid, start_row: u32, end_row: u32) -> Vec<(u32, u32)> {
307    if grid.num_cols == 0 {
308        return Vec::new();
309    }
310
311    // Check which columns have any data in the row range
312    let mut col_has_data = vec![false; grid.num_cols as usize];
313    for row in start_row..=end_row {
314        if let Some(r) = grid.rows.get(row as usize) {
315            for (ci, cell) in r.iter().enumerate() {
316                if !cell.is_empty() {
317                    col_has_data[ci] = true;
318                }
319            }
320        }
321    }
322
323    // Find contiguous ranges of columns with data
324    let mut ranges = Vec::new();
325    let mut current_start: Option<u32> = None;
326    let mut empty_streak = 0u32;
327
328    for (ci, &has_data) in col_has_data.iter().enumerate() {
329        if has_data {
330            if current_start.is_none() {
331                current_start = Some(ci as u32);
332            }
333            empty_streak = 0;
334        } else {
335            empty_streak += 1;
336            if empty_streak >= 2 {
337                if let Some(start) = current_start.take() {
338                    let end = (ci as u32).saturating_sub(empty_streak);
339                    if end >= start {
340                        ranges.push((start, end));
341                    }
342                }
343            }
344        }
345    }
346    if let Some(start) = current_start {
347        ranges.push((start, (grid.num_cols).saturating_sub(1)));
348    }
349
350    // Fallback: whole range
351    if ranges.is_empty() {
352        ranges.push((0, grid.num_cols.saturating_sub(1)));
353    }
354
355    ranges
356}
357
358/// Detect header row within a table range.
359/// Returns (header_row_index, header_texts, confidence).
360fn detect_header(
361    grid: &SheetGrid,
362    start_row: u32,
363    end_row: u32,
364    start_col: u32,
365    end_col: u32,
366) -> (Option<u32>, Vec<String>, f64) {
367    // Heuristic 1: All-text row followed by typed (numeric/date) data below
368    for row in start_row..=end_row.min(start_row + 3) {
369        let nonempty = grid.row_nonempty_count(row);
370        if nonempty == 0 {
371            continue;
372        }
373
374        let all_text = (start_col..=end_col).all(|col| {
375            let cell = grid.cell(row, col);
376            cell.is_empty() || cell.is_text()
377        });
378
379        if !all_text {
380            continue;
381        }
382
383        // Check if the next row has any numeric/date data
384        let next_row = row + 1;
385        if next_row > end_row {
386            continue;
387        }
388        let has_typed_data = (start_col..=end_col).any(|col| {
389            let cell = grid.cell(next_row, col);
390            cell.is_numeric() || matches!(cell, CellValue::DateTime(_) | CellValue::Boolean(_))
391        });
392
393        if has_typed_data {
394            let headers: Vec<String> = (start_col..=end_col)
395                .map(|col| grid.cell(row, col).as_text())
396                .collect();
397            return (Some(row), headers, 0.7);
398        }
399    }
400
401    // Heuristic 2: First non-empty row as fallback
402    for row in start_row..=end_row.min(start_row + 5) {
403        if grid.row_nonempty_count(row) > 0 {
404            let headers: Vec<String> = (start_col..=end_col)
405                .map(|col| grid.cell(row, col).as_text())
406                .collect();
407            return (Some(row), headers, 0.4);
408        }
409    }
410
411    (None, Vec::new(), 0.3)
412}
413
414/// Infer column types by sampling data rows.
415fn infer_column_types(
416    grid: &SheetGrid,
417    first_data_row: u32,
418    last_data_row: u32,
419    first_col: u32,
420    last_col: u32,
421) -> Vec<ColumnType> {
422    let num_cols = (last_col - first_col + 1) as usize;
423    let mut type_counts: Vec<[u32; 10]> = vec![[0; 10]; num_cols];
424    let sample_limit = 100;
425    for (sampled, row) in (first_data_row..=last_data_row).enumerate() {
426        if sampled >= sample_limit as usize {
427            break;
428        }
429
430        for col_offset in 0..num_cols {
431            let col = first_col + col_offset as u32;
432            let cell = grid.cell(row, col);
433            let fmt = grid.num_fmt(row, col);
434
435            let type_idx = match (cell, fmt) {
436                (CellValue::Empty, _) => 9, // Empty
437                (CellValue::Text(_), _) => 0,
438                (CellValue::Integer(_), NumFmtKind::Date) => 2,
439                (CellValue::Integer(_), NumFmtKind::DateTime) => 3,
440                (CellValue::Integer(_), NumFmtKind::Time) => 4,
441                (CellValue::Integer(_), NumFmtKind::Currency) => 5,
442                (CellValue::Integer(_), NumFmtKind::Percentage) => 6,
443                (CellValue::Integer(_), _) => 1,
444                (CellValue::Number(_), NumFmtKind::Date) => 2,
445                (CellValue::Number(_), NumFmtKind::DateTime) => 3,
446                (CellValue::Number(_), NumFmtKind::Time) => 4,
447                (CellValue::Number(_), NumFmtKind::Currency) => 5,
448                (CellValue::Number(_), NumFmtKind::Percentage) => 6,
449                (CellValue::Number(_), _) => 8, // Float
450                (CellValue::Boolean(_), _) => 7,
451                (CellValue::DateTime(_), _) => 2,
452                (CellValue::Error(_), _) => 9,
453            };
454            type_counts[col_offset][type_idx] += 1;
455        }
456    }
457
458    type_counts
459        .iter()
460        .map(|counts| {
461            // Find the most common non-empty type
462            let non_empty_total: u32 = counts.iter().take(9).sum();
463            if non_empty_total == 0 {
464                return ColumnType::Empty;
465            }
466
467            let (max_idx, &max_count) = counts
468                .iter()
469                .take(9)
470                .enumerate()
471                .max_by_key(|&(_, c)| *c)
472                .unwrap_or((0, &0));
473
474            // If >30% are a different type, it's mixed
475            let threshold = (non_empty_total as f64 * 0.3).ceil() as u32;
476            let other_count = non_empty_total - max_count;
477            if other_count >= threshold && max_count < non_empty_total {
478                return ColumnType::Mixed;
479            }
480
481            match max_idx {
482                0 => ColumnType::Text,
483                1 => ColumnType::Integer,
484                2 => ColumnType::Date,
485                3 => ColumnType::DateTime,
486                4 => ColumnType::Time,
487                5 => ColumnType::Currency,
488                6 => ColumnType::Percentage,
489                7 => ColumnType::Boolean,
490                8 => ColumnType::Float,
491                _ => ColumnType::Text,
492            }
493        })
494        .collect()
495}
496
497/// Propagate merged cell values into a grid.
498/// The top-left cell's value is copied to all cells in the merged region.
499#[allow(dead_code)]
500pub fn propagate_merged_cells(grid: &mut SheetGrid, merged_regions: &[MergedRegion]) {
501    for region in merged_regions {
502        // Get the top-left cell value
503        let value = grid.cell(region.top_row, region.left_col).clone();
504        if value.is_empty() {
505            continue;
506        }
507
508        // Fill all cells in the region with the top-left value
509        for row in region.top_row..=region.bottom_row {
510            for col in region.left_col..=region.right_col {
511                // Skip the top-left cell itself
512                if row == region.top_row && col == region.left_col {
513                    continue;
514                }
515                if let Some(r) = grid.rows.get_mut(row as usize) {
516                    // Extend the row if necessary
517                    while r.len() <= col as usize {
518                        r.push(CellValue::Empty);
519                    }
520                    r[col as usize] = value.clone();
521                }
522            }
523        }
524    }
525}
526
527#[cfg(test)]
528mod tests {
529    use super::*;
530
531    fn make_grid(data: Vec<Vec<CellValue>>, sheet_name: &str) -> SheetGrid {
532        let num_rows = data.len() as u32;
533        let num_cols = data.iter().map(|r| r.len()).max().unwrap_or(0) as u32;
534        SheetGrid {
535            sheet_name: sheet_name.to_string(),
536            rows: data,
537            num_fmt_kinds: Vec::new(),
538            num_rows,
539            num_cols,
540        }
541    }
542
543    #[test]
544    fn test_detect_header_all_text_row() {
545        let grid = make_grid(
546            vec![
547                vec![
548                    CellValue::Text("Name".into()),
549                    CellValue::Text("Age".into()),
550                    CellValue::Text("City".into()),
551                ],
552                vec![
553                    CellValue::Text("Alice".into()),
554                    CellValue::Integer(30),
555                    CellValue::Text("Austin".into()),
556                ],
557                vec![
558                    CellValue::Text("Bob".into()),
559                    CellValue::Integer(25),
560                    CellValue::Text("Boston".into()),
561                ],
562            ],
563            "Sheet1",
564        );
565
566        let tables = detect_tables(&grid, &[], &[]);
567        assert_eq!(tables.len(), 1);
568        assert_eq!(tables[0].headers, vec!["Name", "Age", "City"]);
569        assert!(tables[0].confidence >= 0.7);
570        assert_eq!(tables[0].header_row, Some(0));
571    }
572
573    #[test]
574    fn test_detect_multi_table_gap() {
575        let grid = make_grid(
576            vec![
577                vec![CellValue::Text("A".into()), CellValue::Integer(1)],
578                vec![CellValue::Text("B".into()), CellValue::Integer(2)],
579                vec![CellValue::Empty, CellValue::Empty],
580                vec![CellValue::Empty, CellValue::Empty],
581                vec![CellValue::Text("X".into()), CellValue::Integer(10)],
582                vec![CellValue::Text("Y".into()), CellValue::Integer(20)],
583            ],
584            "Sheet1",
585        );
586
587        let tables = detect_tables(&grid, &[], &[]);
588        assert_eq!(tables.len(), 2);
589    }
590
591    #[test]
592    fn test_propagate_merged_cells() {
593        let mut grid = make_grid(
594            vec![
595                vec![
596                    CellValue::Text("Merged Title".into()),
597                    CellValue::Empty,
598                    CellValue::Empty,
599                ],
600                vec![
601                    CellValue::Text("A".into()),
602                    CellValue::Text("B".into()),
603                    CellValue::Text("C".into()),
604                ],
605            ],
606            "Sheet1",
607        );
608
609        let regions = vec![MergedRegion {
610            top_row: 0,
611            left_col: 0,
612            bottom_row: 0,
613            right_col: 2,
614        }];
615
616        propagate_merged_cells(&mut grid, &regions);
617
618        assert!(matches!(grid.cell(0, 1), CellValue::Text(s) if s == "Merged Title"));
619        assert!(matches!(grid.cell(0, 2), CellValue::Text(s) if s == "Merged Title"));
620    }
621
622    #[test]
623    fn test_column_type_inference() {
624        let grid = make_grid(
625            vec![
626                vec![
627                    CellValue::Text("Name".into()),
628                    CellValue::Text("Value".into()),
629                ],
630                vec![CellValue::Text("A".into()), CellValue::Integer(100)],
631                vec![CellValue::Text("B".into()), CellValue::Integer(200)],
632                vec![CellValue::Text("C".into()), CellValue::Integer(300)],
633            ],
634            "Sheet1",
635        );
636
637        let types = infer_column_types(&grid, 1, 3, 0, 1);
638        assert_eq!(types[0], ColumnType::Text);
639        assert_eq!(types[1], ColumnType::Integer);
640    }
641
642    #[test]
643    fn test_empty_grid() {
644        let grid = make_grid(Vec::new(), "Empty");
645        let tables = detect_tables(&grid, &[], &[]);
646        assert!(tables.is_empty());
647    }
648}