use std::collections::HashSet;
use crate::cell::{Cell, CellValue};
use crate::format::{classify_format, FormatCategory};
use crate::sheet::Sheet;
/// Data type inferred for a column from the non-empty values observed in it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InferredType {
    /// Text values.
    String,
    /// Whole numbers only.
    Int,
    /// Numeric values, at least one of which is fractional.
    Float,
    /// Boolean values.
    Bool,
    /// Calendar dates.
    Date,
    /// Dates combined with a time of day.
    DateTime,
    /// Times of day.
    Time,
    /// More than one incompatible category of value was observed.
    Mixed,
    /// No non-empty value was observed.
    Empty,
}

impl InferredType {
    /// Returns the stable lowercase label for this type.
    pub fn as_str(self) -> &'static str {
        match self {
            Self::String => "string",
            Self::Int => "int",
            Self::Float => "float",
            Self::Bool => "bool",
            Self::Date => "date",
            Self::DateTime => "datetime",
            Self::Time => "time",
            Self::Mixed => "mixed",
            Self::Empty => "empty",
        }
    }
}
/// Coarse description of how many distinct values a column holds relative to
/// its number of non-null cells.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Cardinality {
    /// Every non-null value is distinct.
    Unique,
    /// A small set of distinct values that repeat.
    Categorical,
    /// Many distinct values, but not all of them unique.
    HighCardinality,
    /// The column has no non-null values.
    Empty,
}

impl Cardinality {
    /// Returns the stable lowercase label for this bucket.
    pub fn as_str(self) -> &'static str {
        match self {
            Self::Unique => "unique",
            Self::Categorical => "categorical",
            Self::HighCardinality => "high-cardinality",
            Self::Empty => "empty",
        }
    }
}
/// Summary of one column produced by schema inference.
#[derive(Debug, Clone)]
pub struct ColumnSchema {
/// Column name, taken from the sheet's header for this column.
pub name: String,
/// Dominant value type observed in the column's body cells.
pub inferred_type: InferredType,
/// Category of the number format on the first non-empty body cell;
/// stays `FormatCategory::General` when that cell has no number format.
pub format_category: FormatCategory,
/// Number of body rows whose cell is missing or `CellValue::Empty`.
pub null_count: usize,
/// Number of distinct rendered values tracked (never more than `UNIQUE_CAP`).
pub unique_count: usize,
/// `true` when a further distinct value was seen after `UNIQUE_CAP` distinct
/// values had already been tracked, i.e. `unique_count` is a lower bound.
pub unique_capped: bool,
/// Cardinality bucket derived from the unique and non-null counts.
pub cardinality: Cardinality,
/// The first few distinct rendered values, in row order.
pub sample_values: Vec<String>,
}
/// Inferred schema for a whole sheet.
#[derive(Debug, Clone)]
pub struct SheetSchema {
/// Name of the sheet the schema was inferred from.
pub sheet: String,
/// Number of body rows: the sheet's row count minus the header row.
pub rows: usize,
/// One entry per header column, in header order.
pub columns: Vec<ColumnSchema>,
}
/// Maximum number of distinct values tracked per column; seeing another
/// distinct value beyond this marks the column as capped.
pub const UNIQUE_CAP: usize = 10_000;
/// Maximum number of sample values collected per column.
const SAMPLE_LIMIT: usize = 3;
/// Largest distinct-value count a column may have and still be classified
/// as categorical.
const CATEGORICAL_MAX_DISTINCT: usize = 20;
/// Infers a [`SheetSchema`] for `sheet`, producing one [`ColumnSchema`] per
/// header column.
///
/// The first row is treated as the header, so the reported row count is the
/// sheet's row dimension minus one (never below zero).
pub fn infer_sheet_schema(sheet: &Sheet) -> SheetSchema {
    let headers = sheet.headers();
    let (total_rows, _) = sheet.dimensions();
    // The header row is not data; saturate so an empty sheet yields zero rows.
    let body_rows = total_rows.saturating_sub(1);

    // Walk the headers directly instead of indexing by position.
    let columns = headers
        .iter()
        .enumerate()
        .map(|(col_idx, name)| infer_column(sheet, col_idx, name, body_rows))
        .collect();

    SheetSchema {
        sheet: sheet.name.clone(),
        rows: body_rows,
        columns,
    }
}
fn infer_column(sheet: &Sheet, col_idx: usize, name: &str, body_rows: usize) -> ColumnSchema {
let mut counts = TypeCounts::default();
let mut null_count = 0usize;
let mut uniques: HashSet<String> = HashSet::new();
let mut unique_capped = false;
let mut samples: Vec<String> = Vec::with_capacity(SAMPLE_LIMIT);
let mut format_category = FormatCategory::General;
let mut format_locked = false;
for row in sheet.rows().iter().skip(1) {
let cell = match row.get(col_idx) {
Some(c) => c,
None => {
null_count += 1;
continue;
}
};
if matches!(cell.value, CellValue::Empty) {
null_count += 1;
continue;
}
if !format_locked {
if let Some(fmt) = &cell.number_format {
format_category = classify_format(fmt);
}
format_locked = true;
}
counts.observe(&cell.value);
let rendered = render_for_uniqueness(cell);
if !unique_capped {
if uniques.contains(&rendered) {
} else if uniques.len() < UNIQUE_CAP {
uniques.insert(rendered.clone());
if samples.len() < SAMPLE_LIMIT {
samples.push(rendered);
}
} else {
unique_capped = true;
}
}
}
let inferred_type = counts.dominant();
let unique_count = uniques.len();
let non_null = body_rows.saturating_sub(null_count);
let cardinality = classify_cardinality(unique_count, non_null, unique_capped);
ColumnSchema {
name: name.to_string(),
inferred_type,
format_category,
null_count,
unique_count,
unique_capped,
cardinality,
sample_values: samples,
}
}
/// Renders a cell's value as the text used to decide whether two cells hold
/// the same value (and as the text reported in sample values).
///
/// Empty cells render as an empty string and error cells are prefixed with
/// `ERROR: `. Dates and times use fixed, zero-padded layouts.
fn render_for_uniqueness(cell: &Cell) -> String {
    const DATE_LAYOUT: &str = "%Y-%m-%d";
    const DATETIME_LAYOUT: &str = "%Y-%m-%d %H:%M:%S";
    const TIME_LAYOUT: &str = "%H:%M:%S";

    let value = &cell.value;
    match value {
        CellValue::Empty => String::new(),
        CellValue::String(text) => text.clone(),
        CellValue::Bool(flag) => flag.to_string(),
        CellValue::Int(number) => number.to_string(),
        CellValue::Float(number) => number.to_string(),
        CellValue::Date(date) => date.format(DATE_LAYOUT).to_string(),
        CellValue::DateTime(stamp) => stamp.format(DATETIME_LAYOUT).to_string(),
        CellValue::Time(time) => time.format(TIME_LAYOUT).to_string(),
        CellValue::Error(e) => format!("ERROR: {e}"),
    }
}
/// Buckets a column by how its distinct-value count relates to its non-null count.
///
/// * `unique` - number of distinct values tracked.
/// * `non_null` - number of non-null cells in the column.
/// * `capped` - whether distinct-value tracking stopped early.
///
/// Rules are applied in order: no non-null cells is `Empty`; a capped column
/// is `HighCardinality`; all values distinct is `Unique`; at most
/// `CATEGORICAL_MAX_DISTINCT` distinct values covering at least twice as many
/// cells is `Categorical`; anything else is `HighCardinality`.
fn classify_cardinality(unique: usize, non_null: usize, capped: bool) -> Cardinality {
    match (non_null, capped) {
        (0, _) => Cardinality::Empty,
        (_, true) => Cardinality::HighCardinality,
        _ if unique == non_null => Cardinality::Unique,
        // The size check comes first so the multiplication only ever runs
        // on a small `unique`.
        _ if unique <= CATEGORICAL_MAX_DISTINCT && unique * 2 <= non_null => {
            Cardinality::Categorical
        }
        _ => Cardinality::HighCardinality,
    }
}
/// What a piece of text looks like once surrounding whitespace is ignored.
enum StringShape {
    /// Parses as a signed 64-bit integer.
    Int,
    /// Parses as a finite floating-point number (but not as an integer).
    Float,
    /// Anything else, including blank text and non-finite numbers.
    Other,
}

/// Decides whether `s` is shaped like an integer, a finite float, or neither.
///
/// Leading and trailing whitespace is ignored. Text that parses to NaN or an
/// infinity is reported as `Other`, not `Float`.
fn classify_string_as_type(s: &str) -> StringShape {
    let trimmed = s.trim();
    if trimmed.is_empty() {
        return StringShape::Other;
    }
    if trimmed.parse::<i64>().is_ok() {
        return StringShape::Int;
    }
    match trimmed.parse::<f64>() {
        Ok(number) if number.is_finite() => StringShape::Float,
        _ => StringShape::Other,
    }
}
/// Per-category tallies of the non-empty values observed in one column.
/// All counters start at zero via `Default`.
#[derive(Default)]
struct TypeCounts {
// String cells whose text is not shaped like a number.
string: usize,
// Integer cells, plus string cells whose text parses as an integer.
int: usize,
// Float cells, plus string cells whose text parses as a finite float.
float: usize,
// Boolean cells (trailing underscore avoids the `bool` type name).
bool_: usize,
// Date cells.
date: usize,
// Date-time cells.
datetime: usize,
// Time cells.
time: usize,
// Error cells.
error: usize,
}
impl TypeCounts {
fn observe(&mut self, v: &CellValue) {
match v {
CellValue::Empty => {}
CellValue::String(s) => match classify_string_as_type(s) {
StringShape::Int => self.int += 1,
StringShape::Float => self.float += 1,
StringShape::Other => self.string += 1,
},
CellValue::Int(_) => self.int += 1,
CellValue::Float(_) => self.float += 1,
CellValue::Bool(_) => self.bool_ += 1,
CellValue::Date(_) => self.date += 1,
CellValue::DateTime(_) => self.datetime += 1,
CellValue::Time(_) => self.time += 1,
CellValue::Error(_) => self.error += 1,
}
}
fn dominant(&self) -> InferredType {
let total = self.string
+ self.int
+ self.float
+ self.bool_
+ self.date
+ self.datetime
+ self.time
+ self.error;
if total == 0 {
return InferredType::Empty;
}
let numeric = self.int + self.float;
if numeric == total {
return if self.float > 0 {
InferredType::Float
} else {
InferredType::Int
};
}
let pairs: [(usize, InferredType); 7] = [
(self.string, InferredType::String),
(self.bool_, InferredType::Bool),
(self.date, InferredType::Date),
(self.datetime, InferredType::DateTime),
(self.time, InferredType::Time),
(numeric, InferredType::Float),
(self.error, InferredType::String),
];
let nonzero = pairs.iter().filter(|(c, _)| *c > 0).count();
if nonzero > 1 {
return InferredType::Mixed;
}
pairs
.iter()
.find(|(c, _)| *c > 0)
.map(|(_, t)| *t)
.unwrap_or(InferredType::Empty)
}
}
// Unit tests for schema inference. Every sheet built here uses its first row
// as the header row.
#[cfg(test)]
mod tests {
use super::*;
// Builds a string cell with no number format.
fn s(v: &str) -> Cell {
Cell {
value: CellValue::String(v.to_string()),
number_format: None,
}
}
// Builds an integer cell with no number format.
fn i(n: i64) -> Cell {
Cell {
value: CellValue::Int(n),
number_format: None,
}
}
// Builds a float cell with no number format.
fn f(n: f64) -> Cell {
Cell {
value: CellValue::Float(n),
number_format: None,
}
}
// Builds an empty cell.
fn empty() -> Cell {
Cell::empty()
}
// Builds a float cell carrying a dollar currency number format.
fn currency_f(n: f64) -> Cell {
Cell {
value: CellValue::Float(n),
number_format: Some("$#,##0.00".to_string()),
}
}
// Builds a sheet from rows of cells via the crate's test constructor.
fn sheet_with(name: &str, rows: Vec<Vec<Cell>>) -> Sheet {
Sheet::from_rows_for_test(name, rows)
}
// Three distinct integers: type Int, no nulls, every value unique.
#[test]
fn pure_int_column_infers_int_unique_when_distinct() {
let rows = vec![vec![s("id")], vec![i(1)], vec![i(2)], vec![i(3)]];
let schema = infer_sheet_schema(&sheet_with("t", rows));
let col = &schema.columns[0];
assert_eq!(col.inferred_type, InferredType::Int);
assert_eq!(col.null_count, 0);
assert_eq!(col.unique_count, 3);
assert_eq!(col.cardinality, Cardinality::Unique);
}
// Integers alongside a float are reported as Float rather than Mixed.
#[test]
fn int_plus_float_collapses_to_float() {
let rows = vec![vec![s("price")], vec![i(1)], vec![f(2.5)], vec![i(3)]];
let schema = infer_sheet_schema(&sheet_with("t", rows));
assert_eq!(schema.columns[0].inferred_type, InferredType::Float);
}
// Non-numeric text next to a number makes the column Mixed.
#[test]
fn mixed_string_and_numeric_returns_mixed() {
let rows = vec![vec![s("col")], vec![s("hello")], vec![i(42)]];
let schema = infer_sheet_schema(&sheet_with("t", rows));
assert_eq!(schema.columns[0].inferred_type, InferredType::Mixed);
}
// Text that parses to NaN or an infinity must not count as numeric.
#[test]
fn non_finite_numeric_strings_stay_strings() {
let rows = vec![
vec![s("value")],
vec![s("NaN")],
vec![s("inf")],
vec![s("-inf")],
];
let schema = infer_sheet_schema(&sheet_with("t", rows));
assert_eq!(schema.columns[0].inferred_type, InferredType::String);
}
// Three values each repeated four times: few distinct, heavily repeated.
#[test]
fn categorical_bucket_when_few_repeated_values() {
let rows = vec![
vec![s("region")],
vec![s("us")],
vec![s("eu")],
vec![s("apac")],
vec![s("us")],
vec![s("eu")],
vec![s("apac")],
vec![s("us")],
vec![s("eu")],
vec![s("apac")],
vec![s("us")],
vec![s("eu")],
vec![s("apac")],
];
let schema = infer_sheet_schema(&sheet_with("t", rows));
let col = &schema.columns[0];
assert_eq!(col.unique_count, 3);
assert_eq!(col.cardinality, Cardinality::Categorical);
assert_eq!(col.sample_values.len(), 3);
}
// 21 distinct values, each appearing twice: one more distinct value than
// the categorical limit allows, so the column is high-cardinality.
#[test]
fn high_cardinality_when_distinct_count_too_high_for_categorical() {
let rows: Vec<Vec<Cell>> = std::iter::once(vec![s("x")])
.chain((0..21).map(|i| vec![s(&format!("v{i}"))]))
.chain((0..21).map(|i| vec![s(&format!("v{i}"))]))
.collect();
let schema = infer_sheet_schema(&sheet_with("t", rows));
let col = &schema.columns[0];
assert_eq!(col.unique_count, 21);
assert_eq!(col.cardinality, Cardinality::HighCardinality);
}
// Column "b" has one explicit empty cell and one row too short to reach it;
// both count as nulls, leaving a single real value.
#[test]
fn null_count_handles_short_rows_and_empty_cells() {
let rows = vec![
vec![s("a"), s("b")],
vec![i(1), empty()],
vec![i(2)], vec![i(3), i(4)],
];
let schema = infer_sheet_schema(&sheet_with("t", rows));
let b = &schema.columns[1];
assert_eq!(b.null_count, 2);
assert_eq!(b.unique_count, 1);
}
// The leading empty cell is skipped; the format comes from the first
// non-empty cell.
#[test]
fn currency_format_locked_from_first_non_empty_cell() {
let rows = vec![
vec![s("revenue")],
vec![empty()],
vec![currency_f(1500.0)],
vec![currency_f(2500.0)],
];
let schema = infer_sheet_schema(&sheet_with("t", rows));
let col = &schema.columns[0];
assert_eq!(col.format_category, FormatCategory::Currency);
assert_eq!(col.inferred_type, InferredType::Float);
}
// Exactly UNIQUE_CAP distinct values followed only by repeats: the cap is
// reached but never exceeded, so the column must not be flagged as capped.
#[test]
fn at_cap_then_repeats_stays_uncapped() {
let mut rows: Vec<Vec<Cell>> = vec![vec![s("id")]];
for n in 0..(UNIQUE_CAP as i64) {
rows.push(vec![i(n)]);
}
for n in 0..50 {
rows.push(vec![i(n)]);
}
let schema = infer_sheet_schema(&sheet_with("t", rows));
let col = &schema.columns[0];
assert_eq!(col.unique_count, UNIQUE_CAP);
assert!(
!col.unique_capped,
"exact-at-cap with only repeats should stay uncapped"
);
}
// One distinct value beyond UNIQUE_CAP flips the capped flag and forces
// the high-cardinality bucket.
#[test]
fn one_past_cap_flips_capped() {
let mut rows: Vec<Vec<Cell>> = vec![vec![s("id")]];
for n in 0..((UNIQUE_CAP + 1) as i64) {
rows.push(vec![i(n)]);
}
let schema = infer_sheet_schema(&sheet_with("t", rows));
let col = &schema.columns[0];
assert!(col.unique_capped);
assert_eq!(col.cardinality, Cardinality::HighCardinality);
}
// A column of only empty cells: Empty type, Empty cardinality, all nulls.
#[test]
fn empty_column_classifies_as_empty() {
let rows = vec![vec![s("a")], vec![empty()], vec![empty()]];
let schema = infer_sheet_schema(&sheet_with("t", rows));
let col = &schema.columns[0];
assert_eq!(col.inferred_type, InferredType::Empty);
assert_eq!(col.cardinality, Cardinality::Empty);
assert_eq!(col.null_count, 2);
}
}