Skip to main content

anomalyx_normalize/
lib.rs

//! # ax-normalize — any corpus → one [`RecordSet`]
//!
//! The article's normalization promise: *"given any corpus of information
//! regardless of its format, we'll normalize it."* This crate maps every
//! recognized format onto the engine-independent [`RecordSet`] from `ax-core`,
//! so detectors never see the difference between a CSV and a Parquet file.
//!
//! Formats are **plugins**: each is an independent [`FormatParser`] (one file
//! under [`parsers`]), resolved by a [`ParserRegistry`] via file extension then
//! content sniff. Adding a format is a new file plus one registration line —
//! see [`parsers::default_registry`]. Binary columnar formats (Parquet, Arrow
//! IPC) live behind the default-on `polars` feature.
//!
//! Normalization is deterministic: column order is stable (header order for
//! tabular input, sorted key-union for JSON), and absence is explicit — a key
//! missing from one JSON row becomes [`ax_core::Value::Null`], never a guess.
17
18use ax_core::{AxError, RecordSet};
19
20pub mod infer;
21pub mod parser;
22pub mod parsers;
23pub mod table;
24
25pub use parser::{Confidence, FormatParser, ParserRegistry};
26
27/// Normalizes `bytes` from logical `source` into a [`RecordSet`], resolving the
28/// format by extension then content sniff against the default parser registry.
29pub fn normalize(source: &str, bytes: &[u8]) -> Result<RecordSet, AxError> {
30    ParserRegistry::default().normalize(source, bytes)
31}
32
33/// Normalizes with an explicitly chosen format `id` (skips detection).
34pub fn normalize_with(id: &str, source: &str, bytes: &[u8]) -> Result<RecordSet, AxError> {
35    ParserRegistry::default().normalize_with(id, source, bytes)
36}
37
/// End-to-end checks that drive the public `normalize` / `normalize_with`
/// entry points through the default parser registry.
#[cfg(test)]
mod tests {
    use super::*;
    use ax_core::{ColType, Value};

    /// A `.csv` source resolves to the CSV parser; the empty cell in column
    /// `b` is counted as a null.
    #[test]
    fn csv_end_to_end() {
        let rs = normalize("t.csv", b"a,b\n1,x\n2,\n3,z").unwrap();
        assert_eq!(rs.format, "csv");
        assert_eq!(rs.width(), 2);
        assert_eq!(rs.rows(), 3);
        assert_eq!(rs.column("a").unwrap().ty, ColType::Int);
        assert_eq!(rs.column("b").unwrap().null_count(), 1);
    }

    /// The source name `-` carries no extension, so the format has to come
    /// from the content; the key `b` missing from the first row is a null.
    #[test]
    fn ndjson_end_to_end() {
        let rs = normalize("-", b"{\"a\":1}\n{\"a\":2,\"b\":9}\n").unwrap();
        assert_eq!(rs.format, "ndjson");
        assert_eq!(rs.rows(), 2);
        assert_eq!(rs.column("b").unwrap().null_count(), 1);
    }

    /// A `.json` array of objects yields one row per element.
    #[test]
    fn json_end_to_end() {
        let rs = normalize("d.json", br#"[{"x":10},{"x":20},{"x":30}]"#).unwrap();
        assert_eq!(rs.format, "json");
        assert_eq!(rs.rows(), 3);
        assert_eq!(rs.column("x").unwrap().ty, ColType::Int);
    }

    /// Tab-separated content with no extension hint is recognized as TSV.
    #[test]
    fn tsv_sniffed_from_content() {
        let rs = normalize("-", b"a\tb\n1\t2\n3\t4").unwrap();
        assert_eq!(rs.format, "tsv");
        assert_eq!(rs.width(), 2);
    }

    /// Rows shorter than the header are padded with nulls; rows longer than
    /// the header lose their extra cells.
    #[test]
    fn ragged_csv_pads_and_truncates() {
        let rs = normalize("t.csv", b"a,b\n1\n2,3,4").unwrap();
        assert_eq!(rs.rows(), 2);
        // Truncation: the extra cell in `2,3,4` must not create a third column.
        assert_eq!(rs.width(), 2);
        // Padding: the short row `1` has no value for `b`.
        assert_eq!(rs.column("b").unwrap().cells[0], Value::Null);
    }

    /// Bytes that no parser recognizes surface as `AxError::UnknownFormat`.
    #[test]
    fn unknown_format_errors() {
        assert!(matches!(
            normalize("-", &[0x00, 0x01, 0x02, 0xff]),
            Err(AxError::UnknownFormat(_))
        ));
    }

    /// An explicit id picks the parser directly, and an unregistered id fails.
    #[test]
    fn normalize_with_explicit_id() {
        // Request the CSV parser by id; the source name `x` has no extension.
        let rs = normalize_with("csv", "x", b"a,b\n1,2").unwrap();
        assert_eq!(rs.format, "csv");
        assert!(normalize_with("nonesuch", "x", b"a,b").is_err());
    }

    /// A Parquet buffer written by polars round-trips through `normalize`.
    #[cfg(feature = "polars")]
    #[test]
    fn parquet_routes_through_the_registry() {
        use polars::prelude::*;
        let mut df = df!["a" => [1i64, 2, 3], "b" => [4i64, 5, 6]].unwrap();
        let mut buf = Vec::new();
        ParquetWriter::new(&mut buf).finish(&mut df).unwrap();
        let rs = normalize("t.parquet", &buf).unwrap();
        assert_eq!(rs.format, "parquet");
        assert_eq!(rs.width(), 2);
        assert_eq!(rs.rows(), 3);
    }
}