Skip to main content

entrenar/config/infer/
stats.rs

1//! Column statistics for type inference
2
/// Per-column summary consumed by type inference.
///
/// All fields are public and the type derives `Default`, so a value can be
/// built with struct-update syntax and filled in field by field.
#[derive(Debug, Clone, Default)]
// The flags below are kept as plain `bool` fields, which trips this pedantic lint.
#[allow(clippy::struct_excessive_bools)]
pub struct ColumnStats {
    /// Name of the column.
    pub name: String,
    /// Row count.
    pub count: usize,
    /// How many distinct values there are.
    pub unique_count: usize,
    /// How many values are null or missing.
    pub null_count: usize,
    /// `true` when every value is an integer.
    pub all_integers: bool,
    /// `true` when every value is numeric.
    pub all_numeric: bool,
    /// Shortest string length, for text columns.
    pub min_str_len: Option<usize>,
    /// Longest string length, for text columns.
    pub max_str_len: Option<usize>,
    /// Mean string length, for text columns.
    pub avg_str_len: Option<f32>,
    /// `true` when the values look like timestamps.
    pub looks_like_datetime: bool,
    /// `true` when the values are arrays/lists.
    pub is_array: bool,
    /// Number of elements per array, for array columns.
    pub array_len: Option<usize>,
    /// Sampled values kept for heuristic analysis.
    pub sample_values: Vec<String>,
}
34
35impl ColumnStats {
36    /// Create stats for a column
37    pub fn new(name: impl Into<String>) -> Self {
38        Self { name: name.into(), ..Default::default() }
39    }
40
41    /// Cardinality ratio: unique_count / count
42    pub fn cardinality_ratio(&self) -> f32 {
43        if self.count == 0 {
44            0.0
45        } else {
46            self.unique_count as f32 / self.count as f32
47        }
48    }
49
50    /// Null ratio: null_count / count
51    pub fn null_ratio(&self) -> f32 {
52        if self.count == 0 {
53            0.0
54        } else {
55            self.null_count as f32 / self.count as f32
56        }
57    }
58}