wolfxl_core/map.rs
1//! Workbook map: one-page summary of every sheet (dimensions, headers,
2//! classification, anchored tables) plus workbook-level named ranges.
3//!
4//! The map exists for agents that need to *orient* before fetching cell
5//! ranges. Loading every sheet's full grid just to ask "which sheet has
6//! the data I want?" is the cost the map prevents.
7//!
8//! Build via [`Workbook::map`](crate::Workbook::map). Render to JSON or
9//! plain text in the consuming binary — `wolfxl-core` stays serde-free.
10
11use crate::cell::CellValue;
12use crate::sheet::Sheet;
13
/// Coarse classification of a sheet's apparent purpose, derived from its
/// value grid alone (no merged-cell or formula inspection).
///
/// Drives downstream prompt strategy: `Data` sheets justify a `peek`,
/// `Readme` sheets often want a single-column dump, `Summary` sheets
/// look formula-heavy with low fill density.
///
/// The [`Display`](std::fmt::Display) output is identical to
/// [`SheetClass::as_str`], so a class can be dropped straight into
/// `format!` / `write!` without an explicit conversion. `Hash` is derived
/// so the class can key a `HashMap`/`HashSet` (e.g. counting sheets per
/// class).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SheetClass {
    /// Sheet with zero rows or zero columns.
    Empty,
    /// Sheet exactly one column wide (notes-column convention).
    Readme,
    /// Small, sparsely filled sheet (dashboards, KPI panels).
    Summary,
    /// Default for anything dense or large.
    Data,
}

impl SheetClass {
    /// Lowercase tag suitable for serialization or grep-friendly text.
    ///
    /// Returns one of `"empty"`, `"readme"`, `"summary"`, `"data"`. The
    /// function is `const`, so tags can be used in constant contexts.
    #[must_use]
    pub const fn as_str(&self) -> &'static str {
        match self {
            Self::Empty => "empty",
            Self::Readme => "readme",
            Self::Summary => "summary",
            Self::Data => "data",
        }
    }
}

impl std::fmt::Display for SheetClass {
    /// Writes the same lowercase tag as [`SheetClass::as_str`].
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // `pad` (rather than `write_str`) honours width/alignment flags such
        // as `{:>8}`, which matters for column-aligned plain-text rendering.
        f.pad(self.as_str())
    }
}
39
40#[derive(Debug, Clone)]
41pub struct SheetMap {
42 pub name: String,
43 pub rows: usize,
44 pub cols: usize,
45 pub class: SheetClass,
46 /// First-row contents, with empty cells preserved as `""` so column
47 /// position is meaningful for downstream consumers.
48 pub headers: Vec<String>,
49 /// Workbook tables (calamine `table_names_in_sheet`) anchored on this
50 /// sheet. Empty when the workbook defines no tables, which is the
51 /// common case for hand-authored sheets.
52 pub tables: Vec<String>,
53}
54
55#[derive(Debug, Clone)]
56pub struct WorkbookMap {
57 pub path: String,
58 pub sheets: Vec<SheetMap>,
59 /// Workbook-level defined names as `(name, formula)` pairs, exactly
60 /// as calamine surfaces them. The formula string is a sheet+range
61 /// reference like `'P&L'!$A$1:$D$25` for typical named ranges.
62 pub named_ranges: Vec<(String, String)>,
63}
64
65/// Classify a sheet by shape and density. Pure value-grid heuristic — does
66/// not look at merged cells, formulas, or formatting.
67///
68/// Rules in priority order:
69/// 1. Zero rows or cols → `Empty`.
70/// 2. Exactly one column wide → `Readme` (notes-column convention).
71/// 3. Small (≤20 rows × ≤10 cols) AND fill density <40% → `Summary`
72/// (sparse formula sheets, dashboards, KPI panels).
73/// 4. Otherwise → `Data` (default for anything dense or large).
74pub fn classify_sheet(sheet: &Sheet) -> SheetClass {
75 let (rows, cols) = sheet.dimensions();
76 if rows == 0 || cols == 0 {
77 return SheetClass::Empty;
78 }
79 if cols == 1 {
80 return SheetClass::Readme;
81 }
82 let total = rows * cols;
83 let non_empty: usize = sheet
84 .rows()
85 .iter()
86 .map(|row| {
87 row.iter()
88 .filter(|c| !matches!(c.value, CellValue::Empty))
89 .count()
90 })
91 .sum();
92 let density = non_empty as f64 / total as f64;
93 if rows <= 20 && cols <= 10 && density < 0.4 {
94 return SheetClass::Summary;
95 }
96 SheetClass::Data
97}
98
#[cfg(test)]
mod tests {
    use super::*;
    use crate::cell::Cell;

    /// Populated string cell with no number format.
    fn cell(s: &str) -> Cell {
        Cell {
            value: CellValue::String(s.to_string()),
            number_format: None,
        }
    }

    /// Blank cell, as produced by `Cell::empty`.
    fn empty() -> Cell {
        Cell::empty()
    }

    /// `rows × cols` grid whose first `filled` cells (row-major order) are
    /// populated and whose remaining cells are blank.
    fn grid(rows: usize, cols: usize, filled: usize) -> Vec<Vec<Cell>> {
        (0..rows)
            .map(|r| {
                (0..cols)
                    .map(|c| {
                        if r * cols + c < filled {
                            cell("x")
                        } else {
                            empty()
                        }
                    })
                    .collect()
            })
            .collect()
    }

    #[test]
    fn empty_sheet_is_classified_empty() {
        let sheet = Sheet::from_rows_for_test("blank", vec![]);
        assert_eq!(classify_sheet(&sheet), SheetClass::Empty);
    }

    #[test]
    fn single_column_sheet_is_classified_readme() {
        // A notes column — multiple rows but one column wide.
        let rows = (0..15)
            .map(|i| vec![cell(&format!("note line {i}"))])
            .collect();
        let sheet = Sheet::from_rows_for_test("Notes", rows);
        assert_eq!(classify_sheet(&sheet), SheetClass::Readme);
    }

    #[test]
    fn small_sparse_sheet_is_classified_summary() {
        // 5×5 with only a title and one KPI populated → density 2/25 = 8%,
        // well under the 40% threshold; small enough on both axes.
        let mut rows = vec![vec![empty(); 5]; 5];
        rows[0][0] = cell("Q1 2026 Summary");
        rows[2][1] = cell("$1.2M");
        let sheet = Sheet::from_rows_for_test("Summary", rows);
        assert_eq!(classify_sheet(&sheet), SheetClass::Summary);
    }

    #[test]
    fn dense_rectangular_sheet_is_classified_data() {
        // 10×5 fully populated grid → density 100%, defaults to Data.
        let rows = (0..10)
            .map(|r| (0..5).map(|c| cell(&format!("r{r}c{c}"))).collect())
            .collect();
        let sheet = Sheet::from_rows_for_test("Ledger", rows);
        assert_eq!(classify_sheet(&sheet), SheetClass::Data);
    }

    #[test]
    fn large_sheet_skips_summary_branch_even_if_sparse() {
        // 50×15 mostly empty (only first cell filled) → density well under
        // 40% but dimensions exceed the small-sheet gate, so falls
        // through to Data. Without this rule, a giant mostly-empty data
        // grid would mis-classify as Summary.
        let mut rows = vec![vec![empty(); 15]; 50];
        rows[0][0] = cell("seed");
        let sheet = Sheet::from_rows_for_test("Big", rows);
        assert_eq!(classify_sheet(&sheet), SheetClass::Data);
    }

    #[test]
    fn size_gate_is_inclusive_at_twenty_by_ten() {
        // Exactly 20×10 with one populated cell → still "small", 0.5% dense.
        let sheet = Sheet::from_rows_for_test("Edge", grid(20, 10, 1));
        assert_eq!(classify_sheet(&sheet), SheetClass::Summary);
    }

    #[test]
    fn one_past_the_size_gate_on_either_axis_is_data() {
        // Same sparsity as the inclusive case, but one row / one column more.
        let tall = Sheet::from_rows_for_test("Tall", grid(21, 10, 1));
        assert_eq!(classify_sheet(&tall), SheetClass::Data);

        let wide = Sheet::from_rows_for_test("Wide", grid(20, 11, 1));
        assert_eq!(classify_sheet(&wide), SheetClass::Data);
    }

    #[test]
    fn density_threshold_is_strictly_below_forty_percent() {
        // 10/25 = exactly 40% → not *under* the threshold → Data.
        let at_threshold = Sheet::from_rows_for_test("At", grid(5, 5, 10));
        assert_eq!(classify_sheet(&at_threshold), SheetClass::Data);

        // 9/25 = 36% → under the threshold → Summary.
        let below = Sheet::from_rows_for_test("Below", grid(5, 5, 9));
        assert_eq!(classify_sheet(&below), SheetClass::Summary);
    }

    #[test]
    fn as_str_tags_are_lowercase_variant_names() {
        assert_eq!(SheetClass::Empty.as_str(), "empty");
        assert_eq!(SheetClass::Readme.as_str(), "readme");
        assert_eq!(SheetClass::Summary.as_str(), "summary");
        assert_eq!(SheetClass::Data.as_str(), "data");
    }
}
163}