1use std::collections::HashSet;
14
15use crate::cell::{Cell, CellValue};
16use crate::format::{classify_format, FormatCategory};
17use crate::sheet::Sheet;
18
19#[derive(Debug, Clone, Copy, PartialEq, Eq)]
22pub enum InferredType {
23 String,
24 Int,
25 Float,
26 Bool,
27 Date,
28 DateTime,
29 Time,
30 Mixed,
31 Empty,
32}
33
34impl InferredType {
35 pub fn as_str(self) -> &'static str {
36 match self {
37 InferredType::String => "string",
38 InferredType::Int => "int",
39 InferredType::Float => "float",
40 InferredType::Bool => "bool",
41 InferredType::Date => "date",
42 InferredType::DateTime => "datetime",
43 InferredType::Time => "time",
44 InferredType::Mixed => "mixed",
45 InferredType::Empty => "empty",
46 }
47 }
48}
49
50#[derive(Debug, Clone, Copy, PartialEq, Eq)]
53pub enum Cardinality {
54 Unique,
56 Categorical,
58 HighCardinality,
60 Empty,
62}
63
64impl Cardinality {
65 pub fn as_str(self) -> &'static str {
66 match self {
67 Cardinality::Unique => "unique",
68 Cardinality::Categorical => "categorical",
69 Cardinality::HighCardinality => "high-cardinality",
70 Cardinality::Empty => "empty",
71 }
72 }
73}
74
75#[derive(Debug, Clone)]
76pub struct ColumnSchema {
77 pub name: String,
78 pub inferred_type: InferredType,
79 pub format_category: FormatCategory,
83 pub null_count: usize,
84 pub unique_count: usize,
86 pub unique_capped: bool,
87 pub cardinality: Cardinality,
88 pub sample_values: Vec<String>,
92}
93
94#[derive(Debug, Clone)]
95pub struct SheetSchema {
96 pub sheet: String,
97 pub rows: usize,
98 pub columns: Vec<ColumnSchema>,
99}
100
101pub const UNIQUE_CAP: usize = 10_000;
105
106const SAMPLE_LIMIT: usize = 3;
107const CATEGORICAL_MAX_DISTINCT: usize = 20;
111
112pub fn infer_sheet_schema(sheet: &Sheet) -> SheetSchema {
115 let headers = sheet.headers();
116 let (total_rows, _) = sheet.dimensions();
117 let body_rows = total_rows.saturating_sub(1);
118 let cols = headers.len();
119 let mut columns = Vec::with_capacity(cols);
120
121 for col_idx in 0..cols {
122 columns.push(infer_column(sheet, col_idx, &headers[col_idx], body_rows));
123 }
124
125 SheetSchema {
126 sheet: sheet.name.clone(),
127 rows: body_rows,
128 columns,
129 }
130}
131
132fn infer_column(sheet: &Sheet, col_idx: usize, name: &str, body_rows: usize) -> ColumnSchema {
133 let mut counts = TypeCounts::default();
134 let mut null_count = 0usize;
135 let mut uniques: HashSet<String> = HashSet::new();
136 let mut unique_capped = false;
137 let mut samples: Vec<String> = Vec::with_capacity(SAMPLE_LIMIT);
138 let mut format_category = FormatCategory::General;
139 let mut format_locked = false;
140
141 for row in sheet.rows().iter().skip(1) {
142 let cell = match row.get(col_idx) {
143 Some(c) => c,
144 None => {
145 null_count += 1;
146 continue;
147 }
148 };
149
150 if matches!(cell.value, CellValue::Empty) {
151 null_count += 1;
152 continue;
153 }
154
155 if !format_locked {
159 if let Some(fmt) = &cell.number_format {
160 format_category = classify_format(fmt);
161 }
162 format_locked = true;
163 }
164
165 counts.observe(&cell.value);
166
167 let rendered = render_for_uniqueness(cell);
168 if !unique_capped {
169 if uniques.contains(&rendered) {
170 } else if uniques.len() < UNIQUE_CAP {
173 uniques.insert(rendered.clone());
174 if samples.len() < SAMPLE_LIMIT {
175 samples.push(rendered);
176 }
177 } else {
178 unique_capped = true;
183 }
184 }
185 }
186
187 let inferred_type = counts.dominant();
188 let unique_count = uniques.len();
189 let non_null = body_rows.saturating_sub(null_count);
190 let cardinality = classify_cardinality(unique_count, non_null, unique_capped);
191
192 ColumnSchema {
193 name: name.to_string(),
194 inferred_type,
195 format_category,
196 null_count,
197 unique_count,
198 unique_capped,
199 cardinality,
200 sample_values: samples,
201 }
202}
203
204fn render_for_uniqueness(cell: &Cell) -> String {
217 match &cell.value {
218 CellValue::Empty => String::new(),
219 CellValue::String(s) => s.clone(),
220 CellValue::Bool(b) => b.to_string(),
221 CellValue::Int(n) => n.to_string(),
222 CellValue::Float(n) => format!("{n}"),
223 CellValue::Date(d) => d.format("%Y-%m-%d").to_string(),
224 CellValue::DateTime(dt) => dt.format("%Y-%m-%d %H:%M:%S").to_string(),
225 CellValue::Time(t) => t.format("%H:%M:%S").to_string(),
226 CellValue::Error(e) => format!("ERROR: {e}"),
227 }
228}
229
230fn classify_cardinality(unique: usize, non_null: usize, capped: bool) -> Cardinality {
231 if non_null == 0 {
232 return Cardinality::Empty;
233 }
234 if capped {
238 return Cardinality::HighCardinality;
239 }
240 if unique == non_null {
241 return Cardinality::Unique;
242 }
243 if unique <= CATEGORICAL_MAX_DISTINCT && unique * 2 <= non_null {
244 return Cardinality::Categorical;
245 }
246 Cardinality::HighCardinality
247}
248
249enum StringShape {
253 Int,
254 Float,
255 Other,
256}
257
258fn classify_string_as_type(s: &str) -> StringShape {
264 let t = s.trim();
265 if t.is_empty() {
266 return StringShape::Other;
267 }
268 if t.parse::<i64>().is_ok() {
269 StringShape::Int
270 } else if t.parse::<f64>().is_ok_and(f64::is_finite) {
271 StringShape::Float
272 } else {
273 StringShape::Other
274 }
275}
276
277#[derive(Default)]
278struct TypeCounts {
279 string: usize,
280 int: usize,
281 float: usize,
282 bool_: usize,
283 date: usize,
284 datetime: usize,
285 time: usize,
286 error: usize,
287}
288
289impl TypeCounts {
290 fn observe(&mut self, v: &CellValue) {
291 match v {
292 CellValue::Empty => {}
293 CellValue::String(s) => match classify_string_as_type(s) {
299 StringShape::Int => self.int += 1,
300 StringShape::Float => self.float += 1,
301 StringShape::Other => self.string += 1,
302 },
303 CellValue::Int(_) => self.int += 1,
304 CellValue::Float(_) => self.float += 1,
305 CellValue::Bool(_) => self.bool_ += 1,
306 CellValue::Date(_) => self.date += 1,
307 CellValue::DateTime(_) => self.datetime += 1,
308 CellValue::Time(_) => self.time += 1,
309 CellValue::Error(_) => self.error += 1,
310 }
311 }
312
313 fn dominant(&self) -> InferredType {
317 let total = self.string
318 + self.int
319 + self.float
320 + self.bool_
321 + self.date
322 + self.datetime
323 + self.time
324 + self.error;
325 if total == 0 {
326 return InferredType::Empty;
327 }
328 let numeric = self.int + self.float;
331 if numeric == total {
332 return if self.float > 0 {
333 InferredType::Float
334 } else {
335 InferredType::Int
336 };
337 }
338
339 let pairs: [(usize, InferredType); 7] = [
340 (self.string, InferredType::String),
341 (self.bool_, InferredType::Bool),
342 (self.date, InferredType::Date),
343 (self.datetime, InferredType::DateTime),
344 (self.time, InferredType::Time),
345 (numeric, InferredType::Float),
350 (self.error, InferredType::String),
351 ];
352 let nonzero = pairs.iter().filter(|(c, _)| *c > 0).count();
353 if nonzero > 1 {
354 return InferredType::Mixed;
355 }
356 pairs
357 .iter()
358 .find(|(c, _)| *c > 0)
359 .map(|(_, t)| *t)
360 .unwrap_or(InferredType::Empty)
361 }
362}
363
364#[cfg(test)]
365mod tests {
366 use super::*;
367
368 fn s(v: &str) -> Cell {
369 Cell {
370 value: CellValue::String(v.to_string()),
371 number_format: None,
372 }
373 }
374 fn i(n: i64) -> Cell {
375 Cell {
376 value: CellValue::Int(n),
377 number_format: None,
378 }
379 }
380 fn f(n: f64) -> Cell {
381 Cell {
382 value: CellValue::Float(n),
383 number_format: None,
384 }
385 }
386 fn empty() -> Cell {
387 Cell::empty()
388 }
389 fn currency_f(n: f64) -> Cell {
390 Cell {
391 value: CellValue::Float(n),
392 number_format: Some("$#,##0.00".to_string()),
393 }
394 }
395
396 fn sheet_with(name: &str, rows: Vec<Vec<Cell>>) -> Sheet {
397 Sheet::from_rows_for_test(name, rows)
398 }
399
400 #[test]
401 fn pure_int_column_infers_int_unique_when_distinct() {
402 let rows = vec![vec![s("id")], vec![i(1)], vec![i(2)], vec![i(3)]];
403 let schema = infer_sheet_schema(&sheet_with("t", rows));
404 let col = &schema.columns[0];
405 assert_eq!(col.inferred_type, InferredType::Int);
406 assert_eq!(col.null_count, 0);
407 assert_eq!(col.unique_count, 3);
408 assert_eq!(col.cardinality, Cardinality::Unique);
409 }
410
411 #[test]
412 fn int_plus_float_collapses_to_float() {
413 let rows = vec![vec![s("price")], vec![i(1)], vec![f(2.5)], vec![i(3)]];
414 let schema = infer_sheet_schema(&sheet_with("t", rows));
415 assert_eq!(schema.columns[0].inferred_type, InferredType::Float);
416 }
417
418 #[test]
419 fn mixed_string_and_numeric_returns_mixed() {
420 let rows = vec![vec![s("col")], vec![s("hello")], vec![i(42)]];
421 let schema = infer_sheet_schema(&sheet_with("t", rows));
422 assert_eq!(schema.columns[0].inferred_type, InferredType::Mixed);
423 }
424
425 #[test]
426 fn non_finite_numeric_strings_stay_strings() {
427 let rows = vec![
428 vec![s("value")],
429 vec![s("NaN")],
430 vec![s("inf")],
431 vec![s("-inf")],
432 ];
433 let schema = infer_sheet_schema(&sheet_with("t", rows));
434 assert_eq!(schema.columns[0].inferred_type, InferredType::String);
435 }
436
437 #[test]
438 fn categorical_bucket_when_few_repeated_values() {
439 let rows = vec![
441 vec![s("region")],
442 vec![s("us")],
443 vec![s("eu")],
444 vec![s("apac")],
445 vec![s("us")],
446 vec![s("eu")],
447 vec![s("apac")],
448 vec![s("us")],
449 vec![s("eu")],
450 vec![s("apac")],
451 vec![s("us")],
452 vec![s("eu")],
453 vec![s("apac")],
454 ];
455 let schema = infer_sheet_schema(&sheet_with("t", rows));
456 let col = &schema.columns[0];
457 assert_eq!(col.unique_count, 3);
458 assert_eq!(col.cardinality, Cardinality::Categorical);
459 assert_eq!(col.sample_values.len(), 3);
460 }
461
462 #[test]
463 fn high_cardinality_when_distinct_count_too_high_for_categorical() {
464 let rows: Vec<Vec<Cell>> = std::iter::once(vec![s("x")])
468 .chain((0..21).map(|i| vec![s(&format!("v{i}"))]))
469 .chain((0..21).map(|i| vec![s(&format!("v{i}"))]))
470 .collect();
471 let schema = infer_sheet_schema(&sheet_with("t", rows));
472 let col = &schema.columns[0];
473 assert_eq!(col.unique_count, 21);
474 assert_eq!(col.cardinality, Cardinality::HighCardinality);
475 }
476
477 #[test]
478 fn null_count_handles_short_rows_and_empty_cells() {
479 let rows = vec![
480 vec![s("a"), s("b")],
481 vec![i(1), empty()],
482 vec![i(2)], vec![i(3), i(4)],
484 ];
485 let schema = infer_sheet_schema(&sheet_with("t", rows));
486 let b = &schema.columns[1];
487 assert_eq!(b.null_count, 2);
488 assert_eq!(b.unique_count, 1);
489 }
490
491 #[test]
492 fn currency_format_locked_from_first_non_empty_cell() {
493 let rows = vec![
494 vec![s("revenue")],
495 vec![empty()],
496 vec![currency_f(1500.0)],
497 vec![currency_f(2500.0)],
498 ];
499 let schema = infer_sheet_schema(&sheet_with("t", rows));
500 let col = &schema.columns[0];
501 assert_eq!(col.format_category, FormatCategory::Currency);
502 assert_eq!(col.inferred_type, InferredType::Float);
503 }
504
505 #[test]
506 fn at_cap_then_repeats_stays_uncapped() {
507 let mut rows: Vec<Vec<Cell>> = vec![vec![s("id")]];
514 for n in 0..(UNIQUE_CAP as i64) {
515 rows.push(vec![i(n)]);
516 }
517 for n in 0..50 {
519 rows.push(vec![i(n)]);
520 }
521 let schema = infer_sheet_schema(&sheet_with("t", rows));
522 let col = &schema.columns[0];
523 assert_eq!(col.unique_count, UNIQUE_CAP);
524 assert!(
525 !col.unique_capped,
526 "exact-at-cap with only repeats should stay uncapped"
527 );
528 }
529
530 #[test]
531 fn one_past_cap_flips_capped() {
532 let mut rows: Vec<Vec<Cell>> = vec![vec![s("id")]];
536 for n in 0..((UNIQUE_CAP + 1) as i64) {
537 rows.push(vec![i(n)]);
538 }
539 let schema = infer_sheet_schema(&sheet_with("t", rows));
540 let col = &schema.columns[0];
541 assert!(col.unique_capped);
542 assert_eq!(col.cardinality, Cardinality::HighCardinality);
543 }
544
545 #[test]
546 fn empty_column_classifies_as_empty() {
547 let rows = vec![vec![s("a")], vec![empty()], vec![empty()]];
548 let schema = infer_sheet_schema(&sheet_with("t", rows));
549 let col = &schema.columns[0];
550 assert_eq!(col.inferred_type, InferredType::Empty);
551 assert_eq!(col.cardinality, Cardinality::Empty);
552 assert_eq!(col.null_count, 2);
553 }
554}