Skip to main content

anomalyx_core/
record.rs

1//! The normalized columnar record model — the single shape every input format
2//! collapses into, and the only thing detectors ever see.
3//!
4//! Keeping this engine-independent (no Polars/Arrow types leak in) is what lets
5//! the *contract* stay stable while the normalization backend underneath it
6//! changes. `ax-normalize` owns the Polars dependency and converts down to this.
7
8use crate::value::{ColType, Value};
9use serde::{Deserialize, Serialize};
10use std::collections::BTreeSet;
11
12/// One named column with an inferred type and its cells in row order.
13#[derive(Debug, Clone, Serialize, Deserialize)]
14pub struct Column {
15    pub name: String,
16    pub ty: ColType,
17    pub cells: Vec<Value>,
18}
19
20impl Column {
21    /// Builds a column from `name` and `cells`, inferring `ty` by folding each
22    /// cell's contributed type through [`ColType::unify`].
23    pub fn new(name: impl Into<String>, cells: Vec<Value>) -> Self {
24        let ty = cells
25            .iter()
26            .fold(ColType::Unknown, |acc, v| acc.unify(v.col_type()));
27        Column {
28            name: name.into(),
29            ty,
30            cells,
31        }
32    }
33
34    /// The finite numeric projection of this column (nulls and non-numeric
35    /// cells dropped). Empty for non-numeric columns — honest absence, not zeros.
36    pub fn numeric(&self) -> Vec<f64> {
37        self.cells
38            .iter()
39            .filter_map(Value::as_f64)
40            .filter(|x| x.is_finite())
41            .collect()
42    }
43
44    /// Count of null cells.
45    pub fn null_count(&self) -> usize {
46        self.cells.iter().filter(|v| v.is_null()).count()
47    }
48
49    pub fn len(&self) -> usize {
50        self.cells.len()
51    }
52
53    pub fn is_empty(&self) -> bool {
54        self.cells.is_empty()
55    }
56}
57
58/// A normalized corpus: named columns of equal length, plus provenance about
59/// where it came from. This is the universal input to every detector.
60#[derive(Debug, Clone, Serialize, Deserialize)]
61pub struct RecordSet {
62    /// Logical source identifier (path, URL, or `"-"` for stdin).
63    pub source: String,
64    /// The format the normalizer recognized (e.g. `"csv"`, `"ndjson"`).
65    pub format: String,
66    pub columns: Vec<Column>,
67}
68
69impl RecordSet {
70    /// Creates a record set, panicking only via debug-assert if columns are
71    /// ragged. Construction is the normalizer's responsibility; detectors may
72    /// rely on rectangularity.
73    pub fn new(source: impl Into<String>, format: impl Into<String>, columns: Vec<Column>) -> Self {
74        debug_assert!(
75            columns.windows(2).all(|w| w[0].len() == w[1].len()),
76            "RecordSet columns must be equal length"
77        );
78        RecordSet {
79            source: source.into(),
80            format: format.into(),
81            columns,
82        }
83    }
84
85    /// Number of rows (length of the first column, or 0 if columnless).
86    pub fn rows(&self) -> usize {
87        self.columns.first().map_or(0, Column::len)
88    }
89
90    pub fn width(&self) -> usize {
91        self.columns.len()
92    }
93
94    pub fn column(&self, name: &str) -> Option<&Column> {
95        self.columns.iter().find(|c| c.name == name)
96    }
97
98    /// A copy keeping only the columns named in `names`, in their original
99    /// column order (not the order given). Names with no matching column are
100    /// silently skipped — callers that must reject an unknown name should
101    /// validate with [`Self::column`] first. Provenance is preserved.
102    ///
103    /// This is the column-scoping primitive behind `scan --columns`: it lets a
104    /// caller focus detection on a handful of meaningful columns in a wide
105    /// corpus (e.g. journald's dozens of identifier/counter fields).
106    pub fn select(&self, names: &[String]) -> RecordSet {
107        let keep: BTreeSet<&str> = names.iter().map(String::as_str).collect();
108        self.retain(|name| keep.contains(name))
109    }
110
111    /// A copy dropping the columns named in `names`, preserving the order and
112    /// provenance of the rest. The complement of [`Self::select`], behind
113    /// `scan --exclude`.
114    pub fn without(&self, names: &[String]) -> RecordSet {
115        let drop: BTreeSet<&str> = names.iter().map(String::as_str).collect();
116        self.retain(|name| !drop.contains(name))
117    }
118
119    /// Shared projection: a copy keeping columns whose name satisfies `keep`.
120    fn retain(&self, keep: impl Fn(&str) -> bool) -> RecordSet {
121        RecordSet {
122            source: self.source.clone(),
123            format: self.format.clone(),
124            columns: self
125                .columns
126                .iter()
127                .filter(|c| keep(&c.name))
128                .cloned()
129                .collect(),
130        }
131    }
132}
133
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixture: three single-row integer columns `a`, `b`, `c` with csv
    /// provenance.
    fn three_wide() -> RecordSet {
        let columns = vec![
            Column::new("a", vec![Value::Int(1)]),
            Column::new("b", vec![Value::Int(2)]),
            Column::new("c", vec![Value::Int(3)]),
        ];
        RecordSet::new("src.csv", "csv", columns)
    }

    /// Column names of `rs`, in stored order.
    fn names_of(rs: &RecordSet) -> Vec<&str> {
        rs.columns.iter().map(|col| col.name.as_str()).collect()
    }

    /// Owned copies of `names`, the argument shape `select`/`without` take.
    fn owned(names: &[&str]) -> Vec<String> {
        names.iter().map(|&n| String::from(n)).collect()
    }

    #[test]
    fn numeric_skips_nulls_and_strings() {
        let cells = vec![
            Value::Int(1),
            Value::Null,
            Value::Str("nope".into()),
            Value::Float(2.5),
        ];
        let col = Column::new("x", cells);
        assert_eq!(col.numeric(), vec![1.0, 2.5]);
        assert_eq!(col.ty, ColType::Mixed);
        assert_eq!(col.null_count(), 1);
    }

    #[test]
    fn null_count_is_exact() {
        let no_nulls = Column::new("a", vec![Value::Int(1), Value::Int(2)]);
        assert_eq!(no_nulls.null_count(), 0);

        let two_nulls = Column::new("b", vec![Value::Null, Value::Int(1), Value::Null]);
        assert_eq!(two_nulls.null_count(), 2);
    }

    #[test]
    fn empty_and_nonempty_columns() {
        let empty = Column::new("e", vec![]);
        let one_cell = Column::new("f", vec![Value::Int(1)]);
        assert!(empty.is_empty());
        assert!(!one_cell.is_empty());
    }

    #[test]
    fn rows_and_width() {
        let columns = vec![
            Column::new("a", vec![Value::Int(1), Value::Int(2)]),
            Column::new("b", vec![Value::Int(3), Value::Int(4)]),
        ];
        let rs = RecordSet::new("-", "csv", columns);
        assert_eq!(rs.rows(), 2);
        assert_eq!(rs.width(), 2);
        assert!(rs.column("a").is_some());
        assert!(rs.column("z").is_none());
    }

    #[test]
    fn select_keeps_named_columns_in_original_order() {
        // Asked for "c" then "a": the result follows the corpus's own order
        // (a, c), leaves out the unrequested "b", and keeps the provenance.
        let rs = three_wide().select(&owned(&["c", "a"]));
        assert_eq!(names_of(&rs), ["a", "c"]);
        assert_eq!(rs.source, "src.csv");
        assert_eq!(rs.format, "csv");
        assert_eq!(rs.rows(), 1);
    }

    #[test]
    fn select_skips_unknown_names() {
        // "nope" matches nothing, so only the existing "a" comes back.
        let rs = three_wide().select(&owned(&["a", "nope"]));
        assert_eq!(names_of(&rs), ["a"]);
    }

    #[test]
    fn select_empty_yields_no_columns() {
        let rs = three_wide().select(&[]);
        assert_eq!(rs.width(), 0);
    }

    #[test]
    fn without_drops_named_columns_and_keeps_the_rest() {
        let rs = three_wide().without(&owned(&["b"]));
        assert_eq!(names_of(&rs), ["a", "c"]);
        assert_eq!(rs.source, "src.csv");
        assert_eq!(rs.format, "csv");
    }

    #[test]
    fn without_empty_keeps_everything() {
        let rs = three_wide().without(&[]);
        assert_eq!(rs.width(), 3);
    }

    #[test]
    fn without_unknown_name_is_a_noop() {
        let rs = three_wide().without(&owned(&["zzz"]));
        assert_eq!(rs.width(), 3);
    }
}