Skip to main content

anomalyx_normalize/
table.rs

1//! Shared row-oriented table builder for the JSON-family parsers.
2//!
3//! Accumulates record-shaped JSON values into columns with a stable, sorted
4//! key-union order. Keys missing from a given record fill with
5//! [`ax_core::Value::Null`], so every column ends equal length — absence is
6//! explicit, never a guess.
7
8use crate::infer;
9use ax_core::{Column, Value};
10use std::collections::BTreeMap;
11
/// Name of the synthetic column that receives records which are not JSON
/// objects (scalars, arrays).
pub const VALUE_COL: &str = "value";
14
/// Row-at-a-time accumulator that grows a set of equal-length columns.
///
/// Columns are created lazily the first time a key is seen and are padded
/// with `Value::Null` wherever a row does not supply that key.
#[derive(Default)]
pub struct TableBuilder {
    /// Column names, in the order the columns were created.
    order: Vec<String>,
    /// Column name -> position of that column in `order` and `cols`.
    index: BTreeMap<String, usize>,
    /// Cell storage: one `Vec` per column, parallel to `order`.
    cols: Vec<Vec<Value>>,
    /// Number of rows pushed so far; also the back-fill length for new columns.
    rows: usize,
}
22
23impl TableBuilder {
24    pub fn new() -> Self {
25        TableBuilder::default()
26    }
27
28    /// Ensures a column exists, back-filling it with `Null` for prior rows.
29    fn ensure(&mut self, name: &str) -> usize {
30        if let Some(&i) = self.index.get(name) {
31            return i;
32        }
33        let i = self.order.len();
34        self.order.push(name.to_string());
35        self.index.insert(name.to_string(), i);
36        self.cols.push(vec![Value::Null; self.rows]);
37        i
38    }
39
40    /// Adds one record from already-typed cells, keyed by column name. Columns
41    /// new to this record are created (back-filled with `Null`); columns absent
42    /// from it receive `Null`. Used by record-shaped parsers (NDJSON, logfmt).
43    pub fn push_row(&mut self, mut row: BTreeMap<String, Value>) {
44        for k in row.keys() {
45            self.ensure(k);
46        }
47        for (name, &i) in &self.index {
48            let cell = row.remove(name).unwrap_or(Value::Null);
49            self.cols[i].push(cell);
50        }
51        self.rows += 1;
52    }
53
54    /// Adds one JSON record. Objects contribute their fields; anything else goes
55    /// to the synthetic [`VALUE_COL`] column.
56    pub fn push_value(&mut self, val: serde_json::Value) {
57        let mut row: BTreeMap<String, Value> = BTreeMap::new();
58        match val {
59            serde_json::Value::Object(map) => {
60                for (k, v) in map {
61                    row.insert(k, infer::json_to_value(&v));
62                }
63            }
64            other => {
65                row.insert(VALUE_COL.to_string(), infer::json_to_value(&other));
66            }
67        }
68        self.push_row(row);
69    }
70
71    pub fn finish(self) -> Vec<Column> {
72        self.order
73            .into_iter()
74            .zip(self.cols)
75            .map(|(name, cells)| Column::new(name, cells))
76            .collect()
77    }
78}
79
#[cfg(test)]
mod tests {
    use super::*;

    /// A key that first appears on a later record yields a column whose
    /// earlier rows are null.
    #[test]
    fn key_union_pads_missing_with_null() {
        let records = vec![
            serde_json::json!({"a": 1}),
            serde_json::json!({"a": 2, "b": 9}),
        ];
        let mut builder = TableBuilder::new();
        for record in records {
            builder.push_value(record);
        }

        let columns = builder.finish();
        assert_eq!(columns.len(), 2);

        // Only the first row lacked `b`, so exactly one null is expected.
        let b_column = columns.iter().find(|col| col.name == "b").unwrap();
        assert_eq!(b_column.null_count(), 1);
    }

    /// Bare scalars all land in the single synthetic value column.
    #[test]
    fn non_object_goes_to_value_column() {
        let mut builder = TableBuilder::new();
        for scalar in vec![serde_json::json!(7), serde_json::json!(8)] {
            builder.push_value(scalar);
        }

        let columns = builder.finish();
        assert_eq!(columns.len(), 1);

        let only = &columns[0];
        assert_eq!(only.name, VALUE_COL);
        assert_eq!(only.numeric(), vec![7.0, 8.0]);
    }
}
105}