Skip to main content

wolfxl_core/
schema.rs

1//! Per-column schema inference: type, null count, cardinality, format.
2//!
3//! Built for the `wolfxl schema` subcommand. Returns enough per-column detail
4//! for an LLM or agent to plan a query strategy without round-tripping the
5//! actual data: pick lookup columns by `cardinality`, choose dimension vs
6//! measure by `inferred_type` + `format_category`, decide whether `unique_count`
7//! is exact or capped before treating it as a primary key.
8//!
9//! All inference is single-pass O(rows × cols). Unique-count tracking caps
10//! at 10 000 distinct rendered values per column so a million-row sheet
11//! doesn't blow memory; the cap is reported via `unique_capped`.
12
13use std::collections::HashSet;
14
15use crate::cell::{Cell, CellValue};
16use crate::format::{classify_format, FormatCategory};
17use crate::sheet::Sheet;
18
19/// Inferred logical type for a column. `Mixed` means "no clear majority";
20/// `Empty` means "no non-null cells were observed".
21#[derive(Debug, Clone, Copy, PartialEq, Eq)]
22pub enum InferredType {
23    String,
24    Int,
25    Float,
26    Bool,
27    Date,
28    DateTime,
29    Time,
30    Mixed,
31    Empty,
32}
33
34impl InferredType {
35    pub fn as_str(self) -> &'static str {
36        match self {
37            InferredType::String => "string",
38            InferredType::Int => "int",
39            InferredType::Float => "float",
40            InferredType::Bool => "bool",
41            InferredType::Date => "date",
42            InferredType::DateTime => "datetime",
43            InferredType::Time => "time",
44            InferredType::Mixed => "mixed",
45            InferredType::Empty => "empty",
46        }
47    }
48}
49
50/// Coarse cardinality bucket — what an agent needs to decide "is this a
51/// dimension or a measure?".
52#[derive(Debug, Clone, Copy, PartialEq, Eq)]
53pub enum Cardinality {
54    /// Every non-null value is distinct.
55    Unique,
56    /// Few distinct values (≤20) and at most half of rows. Lookup-friendly.
57    Categorical,
58    /// Many distinct values, not unique. Typical for measures, IDs, names.
59    HighCardinality,
60    /// Column has no non-null cells.
61    Empty,
62}
63
64impl Cardinality {
65    pub fn as_str(self) -> &'static str {
66        match self {
67            Cardinality::Unique => "unique",
68            Cardinality::Categorical => "categorical",
69            Cardinality::HighCardinality => "high-cardinality",
70            Cardinality::Empty => "empty",
71        }
72    }
73}
74
75#[derive(Debug, Clone)]
76pub struct ColumnSchema {
77    pub name: String,
78    pub inferred_type: InferredType,
79    /// First-non-empty-cell number-format category. Drives whether the
80    /// column is "dollars" vs "percent" vs "plain integer" — orthogonal to
81    /// `inferred_type` (a `currency` column is still typed `Float`).
82    pub format_category: FormatCategory,
83    pub null_count: usize,
84    /// Distinct rendered-value count, capped at 10 000.
85    pub unique_count: usize,
86    pub unique_capped: bool,
87    pub cardinality: Cardinality,
88    /// Up to 3 distinct rendered values. Order is whatever the column
89    /// presented first; not stable across runs if the underlying sheet
90    /// changes. Useful for grounding agent queries with concrete values.
91    pub sample_values: Vec<String>,
92}
93
94#[derive(Debug, Clone)]
95pub struct SheetSchema {
96    pub sheet: String,
97    pub rows: usize,
98    pub columns: Vec<ColumnSchema>,
99}
100
101/// Hard cap on the per-column unique-value HashSet to keep a million-row
102/// sheet from blowing memory. 10 000 is enough to confidently call a
103/// column "high-cardinality" without exact counts past that point.
104pub const UNIQUE_CAP: usize = 10_000;
105
106const SAMPLE_LIMIT: usize = 3;
107/// Categorical bucket upper bound on distinct values. Above this, a column
108/// is too varied to be useful as a lookup dimension even if it's still
109/// dense.
110const CATEGORICAL_MAX_DISTINCT: usize = 20;
111
112/// Infer per-column schema for a sheet. Header is row 0; body starts at
113/// row 1. Returns one [`ColumnSchema`] per column reported by `headers()`.
114pub fn infer_sheet_schema(sheet: &Sheet) -> SheetSchema {
115    let headers = sheet.headers();
116    let (total_rows, _) = sheet.dimensions();
117    let body_rows = total_rows.saturating_sub(1);
118    let cols = headers.len();
119    let mut columns = Vec::with_capacity(cols);
120
121    for col_idx in 0..cols {
122        columns.push(infer_column(sheet, col_idx, &headers[col_idx], body_rows));
123    }
124
125    SheetSchema {
126        sheet: sheet.name.clone(),
127        rows: body_rows,
128        columns,
129    }
130}
131
132fn infer_column(sheet: &Sheet, col_idx: usize, name: &str, body_rows: usize) -> ColumnSchema {
133    let mut counts = TypeCounts::default();
134    let mut null_count = 0usize;
135    let mut uniques: HashSet<String> = HashSet::new();
136    let mut unique_capped = false;
137    let mut samples: Vec<String> = Vec::with_capacity(SAMPLE_LIMIT);
138    let mut format_category = FormatCategory::General;
139    let mut format_locked = false;
140
141    for row in sheet.rows().iter().skip(1) {
142        let cell = match row.get(col_idx) {
143            Some(c) => c,
144            None => {
145                null_count += 1;
146                continue;
147            }
148        };
149
150        if matches!(cell.value, CellValue::Empty) {
151            null_count += 1;
152            continue;
153        }
154
155        // Lock the format from the first non-empty cell. Mixed-format
156        // columns are rare in practice; if the user wanted that, they'd
157        // be looking at a CSV not an xlsx.
158        if !format_locked {
159            if let Some(fmt) = &cell.number_format {
160                format_category = classify_format(fmt);
161            }
162            format_locked = true;
163        }
164
165        counts.observe(&cell.value);
166
167        let rendered = render_for_uniqueness(cell);
168        if !unique_capped {
169            if uniques.contains(&rendered) {
170                // Already-seen value: no cap consideration, no sample
171                // update needed.
172            } else if uniques.len() < UNIQUE_CAP {
173                uniques.insert(rendered.clone());
174                if samples.len() < SAMPLE_LIMIT {
175                    samples.push(rendered);
176                }
177            } else {
178                // First *new* distinct value past the cap. A column with
179                // exactly UNIQUE_CAP distinct values followed by repeats
180                // stays uncapped — `unique_count == UNIQUE_CAP` is then an
181                // exact, trustworthy figure.
182                unique_capped = true;
183            }
184        }
185    }
186
187    let inferred_type = counts.dominant();
188    let unique_count = uniques.len();
189    let non_null = body_rows.saturating_sub(null_count);
190    let cardinality = classify_cardinality(unique_count, non_null, unique_capped);
191
192    ColumnSchema {
193        name: name.to_string(),
194        inferred_type,
195        format_category,
196        null_count,
197        unique_count,
198        unique_capped,
199        cardinality,
200        sample_values: samples,
201    }
202}
203
204/// Render a cell's value as a HashSet key for distinct-counting and as a
205/// human-readable sample. Date/time/error formats follow the same conventions
206/// as `wolfxl peek -e text` (space-separated DateTime, `ERROR: ` error
207/// prefix) so a sample dropped into a `peek` filter expression matches what
208/// an agent would otherwise see.
209///
210/// Two intentional divergences from peek's text renderer:
211/// - **Floats keep full Rust precision** (`format!("{n}")`) rather than
212///   rounding to two decimals: dedup correctness needs `1.234` and `1.236`
213///   to count as distinct.
214/// - **Ints are not thousand-grouped**: `1000` and `1,000` would otherwise
215///   key into the HashSet as different strings.
216fn render_for_uniqueness(cell: &Cell) -> String {
217    match &cell.value {
218        CellValue::Empty => String::new(),
219        CellValue::String(s) => s.clone(),
220        CellValue::Bool(b) => b.to_string(),
221        CellValue::Int(n) => n.to_string(),
222        CellValue::Float(n) => format!("{n}"),
223        CellValue::Date(d) => d.format("%Y-%m-%d").to_string(),
224        CellValue::DateTime(dt) => dt.format("%Y-%m-%d %H:%M:%S").to_string(),
225        CellValue::Time(t) => t.format("%H:%M:%S").to_string(),
226        CellValue::Error(e) => format!("ERROR: {e}"),
227    }
228}
229
230fn classify_cardinality(unique: usize, non_null: usize, capped: bool) -> Cardinality {
231    if non_null == 0 {
232        return Cardinality::Empty;
233    }
234    // If the unique tracker hit its cap we cannot prove uniqueness either
235    // way, so default to high-cardinality (the safer bucket — caller won't
236    // wrongly treat it as a categorical lookup).
237    if capped {
238        return Cardinality::HighCardinality;
239    }
240    if unique == non_null {
241        return Cardinality::Unique;
242    }
243    if unique <= CATEGORICAL_MAX_DISTINCT && unique * 2 <= non_null {
244        return Cardinality::Categorical;
245    }
246    Cardinality::HighCardinality
247}
248
249/// How a string-valued cell should be counted for type inference. Used by
250/// [`TypeCounts::observe`] so a CSV column of numeric-looking strings lands
251/// in the Int/Float bucket instead of the catch-all String bucket.
252enum StringShape {
253    Int,
254    Float,
255    Other,
256}
257
258/// Classify a string cell's text. Conservative: only classifies as numeric
259/// when the trimmed text parses cleanly as i64 / f64 with no stray
260/// characters. `"$100"`, `"1,000"`, `"100%"` stay `Other` — a number format
261/// is typically inferred from the column's format string, not from the cell
262/// text, for those shapes.
263fn classify_string_as_type(s: &str) -> StringShape {
264    let t = s.trim();
265    if t.is_empty() {
266        return StringShape::Other;
267    }
268    if t.parse::<i64>().is_ok() {
269        StringShape::Int
270    } else if t.parse::<f64>().is_ok_and(f64::is_finite) {
271        StringShape::Float
272    } else {
273        StringShape::Other
274    }
275}
276
277#[derive(Default)]
278struct TypeCounts {
279    string: usize,
280    int: usize,
281    float: usize,
282    bool_: usize,
283    date: usize,
284    datetime: usize,
285    time: usize,
286    error: usize,
287}
288
289impl TypeCounts {
290    fn observe(&mut self, v: &CellValue) {
291        match v {
292            CellValue::Empty => {}
293            // CSV cells and openpyxl-stored-as-text workbooks surface as
294            // strings here. Per invariant B4, schema inference is the
295            // single source of truth for "this column is actually numbers";
296            // parse the rendered text so a CSV column of `"100","200",...`
297            // classifies as Int, not String.
298            CellValue::String(s) => match classify_string_as_type(s) {
299                StringShape::Int => self.int += 1,
300                StringShape::Float => self.float += 1,
301                StringShape::Other => self.string += 1,
302            },
303            CellValue::Int(_) => self.int += 1,
304            CellValue::Float(_) => self.float += 1,
305            CellValue::Bool(_) => self.bool_ += 1,
306            CellValue::Date(_) => self.date += 1,
307            CellValue::DateTime(_) => self.datetime += 1,
308            CellValue::Time(_) => self.time += 1,
309            CellValue::Error(_) => self.error += 1,
310        }
311    }
312
313    /// Pick the dominant type. Int + Float coexisting in the same column
314    /// resolve to Float (numeric supertype). Anything else with two or
315    /// more types contributing returns Mixed.
316    fn dominant(&self) -> InferredType {
317        let total = self.string
318            + self.int
319            + self.float
320            + self.bool_
321            + self.date
322            + self.datetime
323            + self.time
324            + self.error;
325        if total == 0 {
326            return InferredType::Empty;
327        }
328        // Int+Float merge: if numeric is the only category present, return
329        // Float when any cell was Float, else Int.
330        let numeric = self.int + self.float;
331        if numeric == total {
332            return if self.float > 0 {
333                InferredType::Float
334            } else {
335                InferredType::Int
336            };
337        }
338
339        let pairs: [(usize, InferredType); 7] = [
340            (self.string, InferredType::String),
341            (self.bool_, InferredType::Bool),
342            (self.date, InferredType::Date),
343            (self.datetime, InferredType::DateTime),
344            (self.time, InferredType::Time),
345            // Numeric collapses to a single bucket so a column of 9 floats
346            // and 1 string still resolves to Mixed (not Float-wins-because-
347            // it's-the-largest); but a column that's pure numeric was
348            // already handled above.
349            (numeric, InferredType::Float),
350            (self.error, InferredType::String),
351        ];
352        let nonzero = pairs.iter().filter(|(c, _)| *c > 0).count();
353        if nonzero > 1 {
354            return InferredType::Mixed;
355        }
356        pairs
357            .iter()
358            .find(|(c, _)| *c > 0)
359            .map(|(_, t)| *t)
360            .unwrap_or(InferredType::Empty)
361    }
362}
363
364#[cfg(test)]
365mod tests {
366    use super::*;
367
368    fn s(v: &str) -> Cell {
369        Cell {
370            value: CellValue::String(v.to_string()),
371            number_format: None,
372        }
373    }
374    fn i(n: i64) -> Cell {
375        Cell {
376            value: CellValue::Int(n),
377            number_format: None,
378        }
379    }
380    fn f(n: f64) -> Cell {
381        Cell {
382            value: CellValue::Float(n),
383            number_format: None,
384        }
385    }
386    fn empty() -> Cell {
387        Cell::empty()
388    }
389    fn currency_f(n: f64) -> Cell {
390        Cell {
391            value: CellValue::Float(n),
392            number_format: Some("$#,##0.00".to_string()),
393        }
394    }
395
396    fn sheet_with(name: &str, rows: Vec<Vec<Cell>>) -> Sheet {
397        Sheet::from_rows_for_test(name, rows)
398    }
399
400    #[test]
401    fn pure_int_column_infers_int_unique_when_distinct() {
402        let rows = vec![vec![s("id")], vec![i(1)], vec![i(2)], vec![i(3)]];
403        let schema = infer_sheet_schema(&sheet_with("t", rows));
404        let col = &schema.columns[0];
405        assert_eq!(col.inferred_type, InferredType::Int);
406        assert_eq!(col.null_count, 0);
407        assert_eq!(col.unique_count, 3);
408        assert_eq!(col.cardinality, Cardinality::Unique);
409    }
410
411    #[test]
412    fn int_plus_float_collapses_to_float() {
413        let rows = vec![vec![s("price")], vec![i(1)], vec![f(2.5)], vec![i(3)]];
414        let schema = infer_sheet_schema(&sheet_with("t", rows));
415        assert_eq!(schema.columns[0].inferred_type, InferredType::Float);
416    }
417
418    #[test]
419    fn mixed_string_and_numeric_returns_mixed() {
420        let rows = vec![vec![s("col")], vec![s("hello")], vec![i(42)]];
421        let schema = infer_sheet_schema(&sheet_with("t", rows));
422        assert_eq!(schema.columns[0].inferred_type, InferredType::Mixed);
423    }
424
425    #[test]
426    fn non_finite_numeric_strings_stay_strings() {
427        let rows = vec![
428            vec![s("value")],
429            vec![s("NaN")],
430            vec![s("inf")],
431            vec![s("-inf")],
432        ];
433        let schema = infer_sheet_schema(&sheet_with("t", rows));
434        assert_eq!(schema.columns[0].inferred_type, InferredType::String);
435    }
436
437    #[test]
438    fn categorical_bucket_when_few_repeated_values() {
439        // 12 rows, 3 distinct values, all repeated → categorical.
440        let rows = vec![
441            vec![s("region")],
442            vec![s("us")],
443            vec![s("eu")],
444            vec![s("apac")],
445            vec![s("us")],
446            vec![s("eu")],
447            vec![s("apac")],
448            vec![s("us")],
449            vec![s("eu")],
450            vec![s("apac")],
451            vec![s("us")],
452            vec![s("eu")],
453            vec![s("apac")],
454        ];
455        let schema = infer_sheet_schema(&sheet_with("t", rows));
456        let col = &schema.columns[0];
457        assert_eq!(col.unique_count, 3);
458        assert_eq!(col.cardinality, Cardinality::Categorical);
459        assert_eq!(col.sample_values.len(), 3);
460    }
461
462    #[test]
463    fn high_cardinality_when_distinct_count_too_high_for_categorical() {
464        // 21 distinct values exceeds CATEGORICAL_MAX_DISTINCT (20), so
465        // even though every value repeats once it still classes as
466        // high-cardinality.
467        let rows: Vec<Vec<Cell>> = std::iter::once(vec![s("x")])
468            .chain((0..21).map(|i| vec![s(&format!("v{i}"))]))
469            .chain((0..21).map(|i| vec![s(&format!("v{i}"))]))
470            .collect();
471        let schema = infer_sheet_schema(&sheet_with("t", rows));
472        let col = &schema.columns[0];
473        assert_eq!(col.unique_count, 21);
474        assert_eq!(col.cardinality, Cardinality::HighCardinality);
475    }
476
477    #[test]
478    fn null_count_handles_short_rows_and_empty_cells() {
479        let rows = vec![
480            vec![s("a"), s("b")],
481            vec![i(1), empty()],
482            vec![i(2)], // short row - col 1 is missing
483            vec![i(3), i(4)],
484        ];
485        let schema = infer_sheet_schema(&sheet_with("t", rows));
486        let b = &schema.columns[1];
487        assert_eq!(b.null_count, 2);
488        assert_eq!(b.unique_count, 1);
489    }
490
491    #[test]
492    fn currency_format_locked_from_first_non_empty_cell() {
493        let rows = vec![
494            vec![s("revenue")],
495            vec![empty()],
496            vec![currency_f(1500.0)],
497            vec![currency_f(2500.0)],
498        ];
499        let schema = infer_sheet_schema(&sheet_with("t", rows));
500        let col = &schema.columns[0];
501        assert_eq!(col.format_category, FormatCategory::Currency);
502        assert_eq!(col.inferred_type, InferredType::Float);
503    }
504
505    #[test]
506    fn at_cap_then_repeats_stays_uncapped() {
507        // A column with exactly UNIQUE_CAP distinct values followed by a
508        // long run of repeats should report the exact count and stay
509        // uncapped. The earlier flip-on-next-row logic incorrectly set
510        // `unique_capped: true` (and forced HighCardinality) on the first
511        // duplicate after the cap, misleading downstream callers about
512        // whether the cardinality figure was trustworthy.
513        let mut rows: Vec<Vec<Cell>> = vec![vec![s("id")]];
514        for n in 0..(UNIQUE_CAP as i64) {
515            rows.push(vec![i(n)]);
516        }
517        // Repeats — these must NOT flip `unique_capped`.
518        for n in 0..50 {
519            rows.push(vec![i(n)]);
520        }
521        let schema = infer_sheet_schema(&sheet_with("t", rows));
522        let col = &schema.columns[0];
523        assert_eq!(col.unique_count, UNIQUE_CAP);
524        assert!(
525            !col.unique_capped,
526            "exact-at-cap with only repeats should stay uncapped"
527        );
528    }
529
530    #[test]
531    fn one_past_cap_flips_capped() {
532        // The first genuinely new value beyond UNIQUE_CAP correctly flips
533        // the capped flag; subsequent classification falls back to
534        // HighCardinality (the safer bucket).
535        let mut rows: Vec<Vec<Cell>> = vec![vec![s("id")]];
536        for n in 0..((UNIQUE_CAP + 1) as i64) {
537            rows.push(vec![i(n)]);
538        }
539        let schema = infer_sheet_schema(&sheet_with("t", rows));
540        let col = &schema.columns[0];
541        assert!(col.unique_capped);
542        assert_eq!(col.cardinality, Cardinality::HighCardinality);
543    }
544
545    #[test]
546    fn empty_column_classifies_as_empty() {
547        let rows = vec![vec![s("a")], vec![empty()], vec![empty()]];
548        let schema = infer_sheet_schema(&sheet_with("t", rows));
549        let col = &schema.columns[0];
550        assert_eq!(col.inferred_type, InferredType::Empty);
551        assert_eq!(col.cardinality, Cardinality::Empty);
552        assert_eq!(col.null_count, 2);
553    }
554}