anomalyx_normalize/
lib.rs1use ax_core::{AxError, RecordSet};
19
20pub mod infer;
21pub mod parser;
22pub mod parsers;
23pub mod table;
24
25pub use parser::{Confidence, FormatParser, ParserRegistry};
26
27pub fn normalize(source: &str, bytes: &[u8]) -> Result<RecordSet, AxError> {
30 ParserRegistry::default().normalize(source, bytes)
31}
32
33pub fn normalize_with(id: &str, source: &str, bytes: &[u8]) -> Result<RecordSet, AxError> {
35 ParserRegistry::default().normalize_with(id, source, bytes)
36}
37
#[cfg(test)]
mod tests {
    // End-to-end checks that drive the public `normalize` / `normalize_with` entry points
    // with small in-memory payloads.

    use super::*;
    use ax_core::{ColType, Value};

    /// Small CSV document: checks the format tag, the shape, the type of column `a`,
    /// and that the empty trailing field on the second data row counts as one null in `b`.
    #[test]
    fn csv_end_to_end() {
        let rs = normalize("t.csv", b"a,b\n1,x\n2,\n3,z").unwrap();
        assert_eq!(rs.format, "csv");
        assert_eq!(rs.width(), 2);
        assert_eq!(rs.rows(), 3);
        assert_eq!(rs.column("a").unwrap().ty, ColType::Int);
        assert_eq!(rs.column("b").unwrap().null_count(), 1);
    }

    /// Newline-delimited JSON where only the second record carries key `b`,
    /// so column `b` is expected to hold exactly one null.
    #[test]
    fn ndjson_end_to_end() {
        let rs = normalize("-", b"{\"a\":1}\n{\"a\":2,\"b\":9}\n").unwrap();
        assert_eq!(rs.format, "ndjson");
        assert_eq!(rs.rows(), 2);
        assert_eq!(rs.column("b").unwrap().null_count(), 1);
    }

    /// A JSON array of three objects becomes three rows with an integer column `x`.
    #[test]
    fn json_end_to_end() {
        let rs = normalize("d.json", br#"[{"x":10},{"x":20},{"x":30}]"#).unwrap();
        assert_eq!(rs.format, "json");
        assert_eq!(rs.rows(), 3);
        assert_eq!(rs.column("x").unwrap().ty, ColType::Int);
    }

    /// Tab-separated content with the source name `"-"` is still reported as `tsv`.
    #[test]
    fn tsv_sniffed_from_content() {
        let rs = normalize("-", b"a\tb\n1\t2\n3\t4").unwrap();
        assert_eq!(rs.format, "tsv");
        assert_eq!(rs.width(), 2);
    }

    /// Ragged rows: the first data row is one field short, the second has one field too many.
    #[test]
    fn ragged_csv_pads_and_truncates() {
        let rs = normalize("t.csv", b"a,b\n1\n2,3,4").unwrap();
        assert_eq!(rs.rows(), 2);
        // Truncation: the surplus field on the last row must not add a third column.
        assert_eq!(rs.width(), 2);
        // Padding: the short first row has no value for `b`.
        assert_eq!(rs.column("b").unwrap().cells[0], Value::Null);
    }

    /// A short binary-looking payload must be rejected with `AxError::UnknownFormat`.
    #[test]
    fn unknown_format_errors() {
        assert!(matches!(
            normalize("-", &[0x00, 0x01, 0x02, 0xff]),
            Err(AxError::UnknownFormat(_))
        ));
    }

    /// `normalize_with` accepts the id `"csv"` and returns an error for the id `"nonesuch"`.
    #[test]
    fn normalize_with_explicit_id() {
        let rs = normalize_with("csv", "x", b"a,b\n1,2").unwrap();
        assert_eq!(rs.format, "csv");
        assert!(normalize_with("nonesuch", "x", b"a,b").is_err());
    }

    /// Only built with the `polars` feature: writes a 3x2 frame to an in-memory Parquet
    /// buffer and checks that `normalize` reports it as `parquet` with the same shape.
    #[cfg(feature = "polars")]
    #[test]
    fn parquet_routes_through_the_registry() {
        use polars::prelude::*;
        let mut df = df!["a" => [1i64, 2, 3], "b" => [4i64, 5, 6]].unwrap();
        let mut buf = Vec::new();
        ParquetWriter::new(&mut buf).finish(&mut df).unwrap();
        let rs = normalize("t.parquet", &buf).unwrap();
        assert_eq!(rs.format, "parquet");
        assert_eq!(rs.width(), 2);
        assert_eq!(rs.rows(), 3);
    }
}