1#![allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
6
7use serde::{Deserialize, Serialize};
8
9use super::xlsx_ooxml::{MergedRegion, NumFmtKind, TableDefinition};
10
11#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
13#[serde(rename_all = "snake_case")]
14pub enum ColumnType {
15 #[default]
16 Text,
17 Integer,
18 Float,
19 Date,
20 DateTime,
21 Time,
22 Currency,
23 Percentage,
24 Boolean,
25 Mixed,
26 Empty,
27}
28
29#[derive(Debug, Clone, Serialize, Deserialize)]
31pub struct DetectedTable {
32 pub name: String,
34 pub sheet_name: String,
36 pub headers: Vec<String>,
38 pub column_types: Vec<ColumnType>,
40 pub first_data_row: u32,
42 pub last_data_row: u32,
44 pub first_col: u32,
46 pub last_col: u32,
48 pub header_row: Option<u32>,
50 pub confidence: f64,
52}
53
54#[derive(Debug, Clone)]
56pub enum CellValue {
57 Empty,
58 Text(String),
59 Number(f64),
60 Integer(i64),
61 Boolean(bool),
62 DateTime(String),
63 Error(String),
64}
65
66impl CellValue {
67 #[must_use]
68 pub fn is_empty(&self) -> bool {
69 matches!(self, Self::Empty)
70 }
71
72 #[must_use]
73 pub fn is_text(&self) -> bool {
74 matches!(self, Self::Text(_))
75 }
76
77 #[must_use]
78 pub fn is_numeric(&self) -> bool {
79 matches!(self, Self::Number(_) | Self::Integer(_))
80 }
81
82 #[must_use]
83 pub fn as_text(&self) -> String {
84 match self {
85 Self::Empty => String::new(),
86 Self::Text(s) => s.clone(),
87 Self::Number(v) => format!("{v}"),
88 Self::Integer(v) => format!("{v}"),
89 Self::Boolean(b) => if *b { "true" } else { "false" }.to_string(),
90 Self::DateTime(s) => s.clone(),
91 Self::Error(s) => s.clone(),
92 }
93 }
94}
95
96pub struct SheetGrid {
98 pub sheet_name: String,
99 pub rows: Vec<Vec<CellValue>>,
100 pub num_fmt_kinds: Vec<Vec<NumFmtKind>>,
103 pub num_rows: u32,
104 pub num_cols: u32,
105}
106
107impl SheetGrid {
108 #[must_use]
109 pub fn new(sheet_name: String) -> Self {
110 Self {
111 sheet_name,
112 rows: Vec::new(),
113 num_fmt_kinds: Vec::new(),
114 num_rows: 0,
115 num_cols: 0,
116 }
117 }
118
119 #[must_use]
121 pub fn cell(&self, row: u32, col: u32) -> &CellValue {
122 static EMPTY: CellValue = CellValue::Empty;
123 self.rows
124 .get(row as usize)
125 .and_then(|r| r.get(col as usize))
126 .unwrap_or(&EMPTY)
127 }
128
129 #[must_use]
131 pub fn num_fmt(&self, row: u32, col: u32) -> NumFmtKind {
132 self.num_fmt_kinds
133 .get(row as usize)
134 .and_then(|r| r.get(col as usize))
135 .copied()
136 .unwrap_or(NumFmtKind::General)
137 }
138
139 #[must_use]
141 pub fn is_row_empty(&self, row: u32) -> bool {
142 if let Some(r) = self.rows.get(row as usize) {
143 r.iter().all(CellValue::is_empty)
144 } else {
145 true
146 }
147 }
148
149 #[must_use]
151 pub fn row_nonempty_count(&self, row: u32) -> usize {
152 if let Some(r) = self.rows.get(row as usize) {
153 r.iter().filter(|c| !c.is_empty()).count()
154 } else {
155 0
156 }
157 }
158}
159
160#[must_use]
168pub fn detect_tables(
169 grid: &SheetGrid,
170 ooxml_tables: &[TableDefinition],
171 merged_regions: &[MergedRegion],
172) -> Vec<DetectedTable> {
173 let mut tables = Vec::new();
174
175 for tdef in ooxml_tables {
177 if tdef.sheet_name == grid.sheet_name {
178 let column_types = infer_column_types(
179 grid,
180 tdef.first_row + 1,
181 tdef.last_row,
182 tdef.first_col,
183 tdef.last_col,
184 );
185 tables.push(DetectedTable {
186 name: tdef.name.clone(),
187 sheet_name: grid.sheet_name.clone(),
188 headers: tdef.headers.clone(),
189 column_types,
190 first_data_row: tdef.first_row + 1,
191 last_data_row: tdef.last_row,
192 first_col: tdef.first_col,
193 last_col: tdef.last_col,
194 header_row: Some(tdef.first_row),
195 confidence: 1.0,
196 });
197 }
198 }
199
200 if !tables.is_empty() {
202 return tables;
203 }
204
205 let table_ranges = find_table_boundaries(grid, merged_regions);
207 let mut table_idx = 0;
208
209 for (start_row, end_row, start_col, end_col) in table_ranges {
210 let (header_row, headers, confidence) =
211 detect_header(grid, start_row, end_row, start_col, end_col);
212
213 let first_data_row = header_row.map_or(start_row, |hr| hr + 1);
214 let column_types = infer_column_types(grid, first_data_row, end_row, start_col, end_col);
215
216 let type_boost = if column_types
218 .iter()
219 .filter(|t| **t != ColumnType::Mixed && **t != ColumnType::Empty)
220 .count()
221 > column_types.len() / 2
222 {
223 0.15
224 } else {
225 0.0
226 };
227
228 table_idx += 1;
229 tables.push(DetectedTable {
230 name: format!("Table{table_idx}"),
231 sheet_name: grid.sheet_name.clone(),
232 headers,
233 column_types,
234 first_data_row,
235 last_data_row: end_row,
236 first_col: start_col,
237 last_col: end_col,
238 header_row,
239 confidence: (confidence + type_boost).min(1.0),
240 });
241 }
242
243 tables
244}
245
246fn find_table_boundaries(
248 grid: &SheetGrid,
249 _merged_regions: &[MergedRegion],
250) -> Vec<(u32, u32, u32, u32)> {
251 if grid.num_rows == 0 || grid.num_cols == 0 {
252 return Vec::new();
253 }
254
255 let mut row_groups: Vec<(u32, u32)> = Vec::new();
257 let mut current_start: Option<u32> = None;
258 let mut empty_streak = 0u32;
259
260 for row in 0..grid.num_rows {
261 if grid.is_row_empty(row) {
262 empty_streak += 1;
263 if empty_streak >= 2 {
264 if let Some(start) = current_start.take() {
265 let end = row.saturating_sub(empty_streak);
266 if end >= start {
267 row_groups.push((start, end));
268 }
269 }
270 }
271 } else {
272 if current_start.is_none() {
273 current_start = Some(row);
274 }
275 empty_streak = 0;
276 }
277 }
278 if let Some(start) = current_start {
280 row_groups.push((start, grid.num_rows.saturating_sub(1)));
281 }
282
283 let mut boundaries = Vec::new();
285 for (start_row, end_row) in row_groups {
286 let col_ranges = find_column_boundaries(grid, start_row, end_row);
287 for (start_col, end_col) in col_ranges {
288 boundaries.push((start_row, end_row, start_col, end_col));
289 }
290 }
291
292 if boundaries.is_empty() && grid.num_rows > 0 {
294 boundaries.push((
295 0,
296 grid.num_rows.saturating_sub(1),
297 0,
298 grid.num_cols.saturating_sub(1),
299 ));
300 }
301
302 boundaries
303}
304
305fn find_column_boundaries(grid: &SheetGrid, start_row: u32, end_row: u32) -> Vec<(u32, u32)> {
307 if grid.num_cols == 0 {
308 return Vec::new();
309 }
310
311 let mut col_has_data = vec![false; grid.num_cols as usize];
313 for row in start_row..=end_row {
314 if let Some(r) = grid.rows.get(row as usize) {
315 for (ci, cell) in r.iter().enumerate() {
316 if !cell.is_empty() {
317 col_has_data[ci] = true;
318 }
319 }
320 }
321 }
322
323 let mut ranges = Vec::new();
325 let mut current_start: Option<u32> = None;
326 let mut empty_streak = 0u32;
327
328 for (ci, &has_data) in col_has_data.iter().enumerate() {
329 if has_data {
330 if current_start.is_none() {
331 current_start = Some(ci as u32);
332 }
333 empty_streak = 0;
334 } else {
335 empty_streak += 1;
336 if empty_streak >= 2 {
337 if let Some(start) = current_start.take() {
338 let end = (ci as u32).saturating_sub(empty_streak);
339 if end >= start {
340 ranges.push((start, end));
341 }
342 }
343 }
344 }
345 }
346 if let Some(start) = current_start {
347 ranges.push((start, (grid.num_cols).saturating_sub(1)));
348 }
349
350 if ranges.is_empty() {
352 ranges.push((0, grid.num_cols.saturating_sub(1)));
353 }
354
355 ranges
356}
357
358fn detect_header(
361 grid: &SheetGrid,
362 start_row: u32,
363 end_row: u32,
364 start_col: u32,
365 end_col: u32,
366) -> (Option<u32>, Vec<String>, f64) {
367 for row in start_row..=end_row.min(start_row + 3) {
369 let nonempty = grid.row_nonempty_count(row);
370 if nonempty == 0 {
371 continue;
372 }
373
374 let all_text = (start_col..=end_col).all(|col| {
375 let cell = grid.cell(row, col);
376 cell.is_empty() || cell.is_text()
377 });
378
379 if !all_text {
380 continue;
381 }
382
383 let next_row = row + 1;
385 if next_row > end_row {
386 continue;
387 }
388 let has_typed_data = (start_col..=end_col).any(|col| {
389 let cell = grid.cell(next_row, col);
390 cell.is_numeric() || matches!(cell, CellValue::DateTime(_) | CellValue::Boolean(_))
391 });
392
393 if has_typed_data {
394 let headers: Vec<String> = (start_col..=end_col)
395 .map(|col| grid.cell(row, col).as_text())
396 .collect();
397 return (Some(row), headers, 0.7);
398 }
399 }
400
401 for row in start_row..=end_row.min(start_row + 5) {
403 if grid.row_nonempty_count(row) > 0 {
404 let headers: Vec<String> = (start_col..=end_col)
405 .map(|col| grid.cell(row, col).as_text())
406 .collect();
407 return (Some(row), headers, 0.4);
408 }
409 }
410
411 (None, Vec::new(), 0.3)
412}
413
414fn infer_column_types(
416 grid: &SheetGrid,
417 first_data_row: u32,
418 last_data_row: u32,
419 first_col: u32,
420 last_col: u32,
421) -> Vec<ColumnType> {
422 let num_cols = (last_col - first_col + 1) as usize;
423 let mut type_counts: Vec<[u32; 10]> = vec![[0; 10]; num_cols];
424 let sample_limit = 100;
425 for (sampled, row) in (first_data_row..=last_data_row).enumerate() {
426 if sampled >= sample_limit as usize {
427 break;
428 }
429
430 for col_offset in 0..num_cols {
431 let col = first_col + col_offset as u32;
432 let cell = grid.cell(row, col);
433 let fmt = grid.num_fmt(row, col);
434
435 let type_idx = match (cell, fmt) {
436 (CellValue::Empty, _) => 9, (CellValue::Text(_), _) => 0,
438 (CellValue::Integer(_), NumFmtKind::Date) => 2,
439 (CellValue::Integer(_), NumFmtKind::DateTime) => 3,
440 (CellValue::Integer(_), NumFmtKind::Time) => 4,
441 (CellValue::Integer(_), NumFmtKind::Currency) => 5,
442 (CellValue::Integer(_), NumFmtKind::Percentage) => 6,
443 (CellValue::Integer(_), _) => 1,
444 (CellValue::Number(_), NumFmtKind::Date) => 2,
445 (CellValue::Number(_), NumFmtKind::DateTime) => 3,
446 (CellValue::Number(_), NumFmtKind::Time) => 4,
447 (CellValue::Number(_), NumFmtKind::Currency) => 5,
448 (CellValue::Number(_), NumFmtKind::Percentage) => 6,
449 (CellValue::Number(_), _) => 8, (CellValue::Boolean(_), _) => 7,
451 (CellValue::DateTime(_), _) => 2,
452 (CellValue::Error(_), _) => 9,
453 };
454 type_counts[col_offset][type_idx] += 1;
455 }
456 }
457
458 type_counts
459 .iter()
460 .map(|counts| {
461 let non_empty_total: u32 = counts.iter().take(9).sum();
463 if non_empty_total == 0 {
464 return ColumnType::Empty;
465 }
466
467 let (max_idx, &max_count) = counts
468 .iter()
469 .take(9)
470 .enumerate()
471 .max_by_key(|&(_, c)| *c)
472 .unwrap_or((0, &0));
473
474 let threshold = (non_empty_total as f64 * 0.3).ceil() as u32;
476 let other_count = non_empty_total - max_count;
477 if other_count >= threshold && max_count < non_empty_total {
478 return ColumnType::Mixed;
479 }
480
481 match max_idx {
482 0 => ColumnType::Text,
483 1 => ColumnType::Integer,
484 2 => ColumnType::Date,
485 3 => ColumnType::DateTime,
486 4 => ColumnType::Time,
487 5 => ColumnType::Currency,
488 6 => ColumnType::Percentage,
489 7 => ColumnType::Boolean,
490 8 => ColumnType::Float,
491 _ => ColumnType::Text,
492 }
493 })
494 .collect()
495}
496
497#[allow(dead_code)]
500pub fn propagate_merged_cells(grid: &mut SheetGrid, merged_regions: &[MergedRegion]) {
501 for region in merged_regions {
502 let value = grid.cell(region.top_row, region.left_col).clone();
504 if value.is_empty() {
505 continue;
506 }
507
508 for row in region.top_row..=region.bottom_row {
510 for col in region.left_col..=region.right_col {
511 if row == region.top_row && col == region.left_col {
513 continue;
514 }
515 if let Some(r) = grid.rows.get_mut(row as usize) {
516 while r.len() <= col as usize {
518 r.push(CellValue::Empty);
519 }
520 r[col as usize] = value.clone();
521 }
522 }
523 }
524 }
525}
526
527#[cfg(test)]
528mod tests {
529 use super::*;
530
531 fn make_grid(data: Vec<Vec<CellValue>>, sheet_name: &str) -> SheetGrid {
532 let num_rows = data.len() as u32;
533 let num_cols = data.iter().map(|r| r.len()).max().unwrap_or(0) as u32;
534 SheetGrid {
535 sheet_name: sheet_name.to_string(),
536 rows: data,
537 num_fmt_kinds: Vec::new(),
538 num_rows,
539 num_cols,
540 }
541 }
542
543 #[test]
544 fn test_detect_header_all_text_row() {
545 let grid = make_grid(
546 vec![
547 vec![
548 CellValue::Text("Name".into()),
549 CellValue::Text("Age".into()),
550 CellValue::Text("City".into()),
551 ],
552 vec![
553 CellValue::Text("Alice".into()),
554 CellValue::Integer(30),
555 CellValue::Text("Austin".into()),
556 ],
557 vec![
558 CellValue::Text("Bob".into()),
559 CellValue::Integer(25),
560 CellValue::Text("Boston".into()),
561 ],
562 ],
563 "Sheet1",
564 );
565
566 let tables = detect_tables(&grid, &[], &[]);
567 assert_eq!(tables.len(), 1);
568 assert_eq!(tables[0].headers, vec!["Name", "Age", "City"]);
569 assert!(tables[0].confidence >= 0.7);
570 assert_eq!(tables[0].header_row, Some(0));
571 }
572
573 #[test]
574 fn test_detect_multi_table_gap() {
575 let grid = make_grid(
576 vec![
577 vec![CellValue::Text("A".into()), CellValue::Integer(1)],
578 vec![CellValue::Text("B".into()), CellValue::Integer(2)],
579 vec![CellValue::Empty, CellValue::Empty],
580 vec![CellValue::Empty, CellValue::Empty],
581 vec![CellValue::Text("X".into()), CellValue::Integer(10)],
582 vec![CellValue::Text("Y".into()), CellValue::Integer(20)],
583 ],
584 "Sheet1",
585 );
586
587 let tables = detect_tables(&grid, &[], &[]);
588 assert_eq!(tables.len(), 2);
589 }
590
591 #[test]
592 fn test_propagate_merged_cells() {
593 let mut grid = make_grid(
594 vec![
595 vec![
596 CellValue::Text("Merged Title".into()),
597 CellValue::Empty,
598 CellValue::Empty,
599 ],
600 vec![
601 CellValue::Text("A".into()),
602 CellValue::Text("B".into()),
603 CellValue::Text("C".into()),
604 ],
605 ],
606 "Sheet1",
607 );
608
609 let regions = vec![MergedRegion {
610 top_row: 0,
611 left_col: 0,
612 bottom_row: 0,
613 right_col: 2,
614 }];
615
616 propagate_merged_cells(&mut grid, ®ions);
617
618 assert!(matches!(grid.cell(0, 1), CellValue::Text(s) if s == "Merged Title"));
619 assert!(matches!(grid.cell(0, 2), CellValue::Text(s) if s == "Merged Title"));
620 }
621
622 #[test]
623 fn test_column_type_inference() {
624 let grid = make_grid(
625 vec![
626 vec![
627 CellValue::Text("Name".into()),
628 CellValue::Text("Value".into()),
629 ],
630 vec![CellValue::Text("A".into()), CellValue::Integer(100)],
631 vec![CellValue::Text("B".into()), CellValue::Integer(200)],
632 vec![CellValue::Text("C".into()), CellValue::Integer(300)],
633 ],
634 "Sheet1",
635 );
636
637 let types = infer_column_types(&grid, 1, 3, 0, 1);
638 assert_eq!(types[0], ColumnType::Text);
639 assert_eq!(types[1], ColumnType::Integer);
640 }
641
642 #[test]
643 fn test_empty_grid() {
644 let grid = make_grid(Vec::new(), "Empty");
645 let tables = detect_tables(&grid, &[], &[]);
646 assert!(tables.is_empty());
647 }
648}