// anomalyx_core/record.rs

//! The normalized columnar record model — the single shape every input format
//! collapses into, and the only thing detectors ever see.
//!
//! Keeping this engine-independent (no Polars/Arrow types leak in) is what lets
//! the *contract* stay stable while the normalization backend underneath it
//! changes. `ax-normalize` owns the Polars dependency and converts down to this.

use crate::value::{ColType, Value};
use serde::{Deserialize, Serialize};

11/// One named column with an inferred type and its cells in row order.
12#[derive(Debug, Clone, Serialize, Deserialize)]
13pub struct Column {
14    pub name: String,
15    pub ty: ColType,
16    pub cells: Vec<Value>,
17}
18
19impl Column {
20    /// Builds a column from `name` and `cells`, inferring `ty` by folding each
21    /// cell's contributed type through [`ColType::unify`].
22    pub fn new(name: impl Into<String>, cells: Vec<Value>) -> Self {
23        let ty = cells
24            .iter()
25            .fold(ColType::Unknown, |acc, v| acc.unify(v.col_type()));
26        Column {
27            name: name.into(),
28            ty,
29            cells,
30        }
31    }
32
33    /// The finite numeric projection of this column (nulls and non-numeric
34    /// cells dropped). Empty for non-numeric columns — honest absence, not zeros.
35    pub fn numeric(&self) -> Vec<f64> {
36        self.cells
37            .iter()
38            .filter_map(Value::as_f64)
39            .filter(|x| x.is_finite())
40            .collect()
41    }
42
43    /// Count of null cells.
44    pub fn null_count(&self) -> usize {
45        self.cells.iter().filter(|v| v.is_null()).count()
46    }
47
48    pub fn len(&self) -> usize {
49        self.cells.len()
50    }
51
52    pub fn is_empty(&self) -> bool {
53        self.cells.is_empty()
54    }
55}
56
57/// A normalized corpus: named columns of equal length, plus provenance about
58/// where it came from. This is the universal input to every detector.
59#[derive(Debug, Clone, Serialize, Deserialize)]
60pub struct RecordSet {
61    /// Logical source identifier (path, URL, or `"-"` for stdin).
62    pub source: String,
63    /// The format the normalizer recognized (e.g. `"csv"`, `"ndjson"`).
64    pub format: String,
65    pub columns: Vec<Column>,
66}
67
68impl RecordSet {
69    /// Creates a record set, panicking only via debug-assert if columns are
70    /// ragged. Construction is the normalizer's responsibility; detectors may
71    /// rely on rectangularity.
72    pub fn new(source: impl Into<String>, format: impl Into<String>, columns: Vec<Column>) -> Self {
73        debug_assert!(
74            columns.windows(2).all(|w| w[0].len() == w[1].len()),
75            "RecordSet columns must be equal length"
76        );
77        RecordSet {
78            source: source.into(),
79            format: format.into(),
80            columns,
81        }
82    }
83
84    /// Number of rows (length of the first column, or 0 if columnless).
85    pub fn rows(&self) -> usize {
86        self.columns.first().map_or(0, Column::len)
87    }
88
89    pub fn width(&self) -> usize {
90        self.columns.len()
91    }
92
93    pub fn column(&self, name: &str) -> Option<&Column> {
94        self.columns.iter().find(|c| c.name == name)
95    }
96}
97
#[cfg(test)]
mod tests {
    use super::*;

    /// `numeric` keeps only numeric cells; a column mixing ints, floats and
    /// strings infers as `Mixed`.
    #[test]
    fn numeric_skips_nulls_and_strings() {
        let col = Column::new(
            "x",
            vec![
                Value::Int(1),
                Value::Null,
                Value::Str("nope".into()),
                Value::Float(2.5),
            ],
        );
        assert_eq!(col.numeric(), vec![1.0, 2.5]);
        assert_eq!(col.ty, ColType::Mixed);
        assert_eq!(col.null_count(), 1);
    }

    /// NaN and both infinities must never reach the numeric projection.
    #[test]
    fn numeric_drops_non_finite_floats() {
        let col = Column::new(
            "x",
            vec![
                Value::Float(f64::NAN),
                Value::Float(1.5),
                Value::Float(f64::INFINITY),
                Value::Float(f64::NEG_INFINITY),
            ],
        );
        assert_eq!(col.numeric(), vec![1.5]);
    }

    #[test]
    fn null_count_is_exact() {
        assert_eq!(
            Column::new("a", vec![Value::Int(1), Value::Int(2)]).null_count(),
            0
        );
        assert_eq!(
            Column::new("b", vec![Value::Null, Value::Int(1), Value::Null]).null_count(),
            2
        );
    }

    #[test]
    fn empty_and_nonempty_columns() {
        assert!(Column::new("e", vec![]).is_empty());
        assert!(!Column::new("f", vec![Value::Int(1)]).is_empty());
    }

    /// `len` counts every cell, nulls included.
    #[test]
    fn len_counts_every_cell_including_nulls() {
        assert_eq!(Column::new("a", vec![Value::Null, Value::Int(1)]).len(), 2);
        assert_eq!(Column::new("e", vec![]).len(), 0);
    }

    #[test]
    fn rows_and_width() {
        let rs = RecordSet::new(
            "-",
            "csv",
            vec![
                Column::new("a", vec![Value::Int(1), Value::Int(2)]),
                Column::new("b", vec![Value::Int(3), Value::Int(4)]),
            ],
        );
        assert_eq!(rs.rows(), 2);
        assert_eq!(rs.width(), 2);
        assert!(rs.column("a").is_some());
        assert!(rs.column("z").is_none());
    }

    /// A record set with no columns reports zero rows and zero width.
    #[test]
    fn columnless_record_set_is_zero_by_zero() {
        let rs = RecordSet::new("-", "csv", vec![]);
        assert_eq!(rs.rows(), 0);
        assert_eq!(rs.width(), 0);
        assert!(rs.column("a").is_none());
    }
}
151}