Skip to main content

matten_data/
schema.rs

1//! Schema summary and column-kind inference (RFC-034 §4.3, RFC-035 §5).
2//!
3//! `schema_summary()` helps a user decide what to select and convert. It reports
4//! row/column counts, per-column names, missing counts, and a simple inferred
5//! [`ColumnKind`]. It does not perform expensive dataframe analysis.
6
7use std::fmt;
8
9use crate::table::{CellValue, Table};
10
/// Coarse classification of a column's contents (RFC-035 §5).
///
/// The kind is decided purely by which cell variants occur in the column.
/// Missing cells are counted separately and only determine the kind when the
/// column contains nothing else.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[non_exhaustive]
pub enum ColumnKind {
    /// Every non-missing cell is an integer.
    Integer,
    /// Every non-missing cell is numeric and at least one of them is a float
    /// (integers may be mixed in).
    Float,
    /// Every non-missing cell is a boolean.
    Boolean,
    /// Every non-missing cell is text.
    Text,
    /// Cells from incompatible categories occur together (for example text
    /// next to numbers, or booleans next to text).
    Mixed,
    /// The column has no non-missing cell at all; this is also what a column
    /// of a table without rows is reported as.
    MissingOnly,
}
28
29impl fmt::Display for ColumnKind {
30    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
31        let s = match self {
32            ColumnKind::Integer => "integer",
33            ColumnKind::Float => "float",
34            ColumnKind::Boolean => "boolean",
35            ColumnKind::Text => "text",
36            ColumnKind::Mixed => "mixed",
37            ColumnKind::MissingOnly => "missing-only",
38        };
39        f.write_str(s)
40    }
41}
42
43/// Per-column entry in a [`SchemaSummary`].
44#[derive(Debug, Clone)]
45pub struct ColumnSummary {
46    /// Column name.
47    pub name: String,
48    /// Inferred simple kind.
49    pub kind: ColumnKind,
50    /// Number of missing cells in the column.
51    pub missing: usize,
52}
53
54/// A small, displayable description of a [`Table`]'s columns.
55#[derive(Debug, Clone)]
56pub struct SchemaSummary {
57    /// Number of data rows.
58    pub rows: usize,
59    /// Number of columns.
60    pub columns: usize,
61    per_column: Vec<ColumnSummary>,
62}
63
64impl SchemaSummary {
65    /// Per-column summaries, in column order.
66    pub fn column_summaries(&self) -> &[ColumnSummary] {
67        &self.per_column
68    }
69}
70
71impl fmt::Display for SchemaSummary {
72    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
73        writeln!(f, "Table: {} rows x {} columns", self.rows, self.columns)?;
74        for col in &self.per_column {
75            writeln!(
76                f,
77                "  - {} ({}, {} missing)",
78                col.name, col.kind, col.missing
79            )?;
80        }
81        Ok(())
82    }
83}
84
85/// Compute a [`SchemaSummary`] for a table.
86pub(crate) fn summarize(table: &Table) -> SchemaSummary {
87    let headers = table.headers();
88    let rows = table.rows();
89
90    let per_column = headers
91        .iter()
92        .enumerate()
93        .map(|(c, name)| {
94            let mut missing = 0usize;
95            let mut has_int = false;
96            let mut has_float = false;
97            let mut has_bool = false;
98            let mut has_text = false;
99
100            for row in rows {
101                match &row[c] {
102                    CellValue::Missing => missing += 1,
103                    CellValue::Int(_) => has_int = true,
104                    CellValue::Float(_) => has_float = true,
105                    CellValue::Bool(_) => has_bool = true,
106                    CellValue::Text(_) => has_text = true,
107                }
108            }
109
110            let kind = infer_kind(has_int, has_float, has_bool, has_text);
111            ColumnSummary {
112                name: name.clone(),
113                kind,
114                missing,
115            }
116        })
117        .collect();
118
119    SchemaSummary {
120        rows: rows.len(),
121        columns: headers.len(),
122        per_column,
123    }
124}
125
126fn infer_kind(has_int: bool, has_float: bool, has_bool: bool, has_text: bool) -> ColumnKind {
127    let numeric = has_int || has_float;
128    let categories = [numeric, has_bool, has_text].iter().filter(|&&b| b).count();
129
130    match categories {
131        0 => ColumnKind::MissingOnly,
132        1 => {
133            if has_bool {
134                ColumnKind::Boolean
135            } else if has_text {
136                ColumnKind::Text
137            } else if has_float {
138                ColumnKind::Float
139            } else {
140                ColumnKind::Integer
141            }
142        }
143        _ => ColumnKind::Mixed,
144    }
145}