anomalyx_normalize/
format.rs1use ax_core::AxError;
6
/// A tabular or record-oriented data format this module can identify.
///
/// Each variant has a canonical lowercase token (see `Format::token`) and can
/// be detected from a file extension or from the leading bytes of the data.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Format {
    /// Comma-delimited text (`csv`).
    Csv,
    /// Tab-delimited text (`tsv`).
    Tsv,
    /// JSON with one object per line (`ndjson`).
    Ndjson,
    /// A single JSON document (`json`).
    Json,
    /// Parquet data, recognised by the `PAR1` magic prefix (`parquet`).
    Parquet,
    /// Arrow data, recognised by the `ARROW1` magic prefix (`arrow`).
    Arrow,
}
21
22impl Format {
23 pub fn token(self) -> &'static str {
25 match self {
26 Format::Csv => "csv",
27 Format::Tsv => "tsv",
28 Format::Ndjson => "ndjson",
29 Format::Json => "json",
30 Format::Parquet => "parquet",
31 Format::Arrow => "arrow",
32 }
33 }
34
35 pub fn is_binary(self) -> bool {
37 matches!(self, Format::Parquet | Format::Arrow)
38 }
39
40 pub fn from_extension(path: &str) -> Option<Format> {
42 let ext = path.rsplit('.').next()?.to_ascii_lowercase();
43 match ext.as_str() {
44 "csv" => Some(Format::Csv),
45 "tsv" | "tab" => Some(Format::Tsv),
46 "ndjson" | "jsonl" => Some(Format::Ndjson),
47 "json" => Some(Format::Json),
48 "parquet" | "pq" => Some(Format::Parquet),
49 "arrow" | "ipc" | "feather" => Some(Format::Arrow),
50 _ => None,
51 }
52 }
53
54 pub fn sniff(bytes: &[u8]) -> Option<Format> {
58 if bytes.starts_with(b"PAR1") {
60 return Some(Format::Parquet);
61 }
62 if bytes.starts_with(b"ARROW1") {
64 return Some(Format::Arrow);
65 }
66 let text = std::str::from_utf8(bytes).ok()?;
67 let trimmed = text.trim_start();
68 let first = trimmed.chars().next()?;
69 match first {
70 '[' => Some(Format::Json),
71 '{' => {
72 let object_lines = trimmed
74 .lines()
75 .filter(|l| !l.trim().is_empty())
76 .take(3)
77 .filter(|l| l.trim_start().starts_with('{'))
78 .count();
79 if object_lines >= 2 {
80 Some(Format::Ndjson)
81 } else {
82 Some(Format::Json)
83 }
84 }
85 _ => {
86 let line = trimmed.lines().next()?;
88 match (line.find('\t'), line.find(',')) {
91 (Some(t), Some(c)) if t < c => Some(Format::Tsv),
92 (Some(_), None) => Some(Format::Tsv),
93 _ => Some(Format::Csv), }
95 }
96 }
97 }
98
99 pub fn resolve(source: &str, bytes: &[u8]) -> Result<Format, AxError> {
101 if let Some(f) = Format::from_extension(source) {
102 return Ok(f);
103 }
104 Format::sniff(bytes).ok_or_else(|| AxError::UnknownFormat(source.to_string()))
105 }
106}
107
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that sniffing `input` yields exactly `expected`.
    fn assert_sniffs(input: &[u8], expected: Format) {
        assert_eq!(Format::sniff(input), Some(expected));
    }

    /// Asserts that extension detection on `path` yields `expected`.
    fn assert_extension(path: &str, expected: Option<Format>) {
        assert_eq!(Format::from_extension(path), expected);
    }

    #[test]
    fn format_tokens_are_exact() {
        let cases = [
            (Format::Csv, "csv"),
            (Format::Tsv, "tsv"),
            (Format::Ndjson, "ndjson"),
            (Format::Json, "json"),
            (Format::Parquet, "parquet"),
            (Format::Arrow, "arrow"),
        ];
        for &(format, expected) in &cases {
            assert_eq!(format.token(), expected);
        }
    }

    #[test]
    fn binary_classification() {
        for format in &[Format::Parquet, Format::Arrow] {
            assert!(format.is_binary());
        }
        for format in &[Format::Csv, Format::Json] {
            assert!(!format.is_binary());
        }
    }

    #[test]
    fn binary_extensions_and_magic() {
        assert_extension("x.parquet", Some(Format::Parquet));
        assert_extension("x.feather", Some(Format::Arrow));
        assert_extension("x.ipc", Some(Format::Arrow));

        assert_sniffs(b"PAR1\x00\x01rest", Format::Parquet);
        assert_sniffs(b"ARROW1\x00\x00rest", Format::Arrow);
        // The magic only counts at the very start of the data.
        assert_sniffs(b"a,b\nPAR1,2", Format::Csv);
    }

    #[test]
    fn extension_detection() {
        let cases = [
            ("a/b.csv", Some(Format::Csv)),
            ("x.tsv", Some(Format::Tsv)),
            ("x.tab", Some(Format::Tsv)),
            ("x.json", Some(Format::Json)),
            ("x.JSONL", Some(Format::Ndjson)),
            ("x.xlsx", None),
            ("noext", None),
        ];
        for &(path, expected) in &cases {
            assert_extension(path, expected);
        }
    }

    #[test]
    fn sniff_uses_delimiter_order_when_both_present() {
        // Tab before comma on the first line.
        assert_sniffs(b"a\tb,c\n1\t2,3", Format::Tsv);
        // Comma before tab on the first line.
        assert_sniffs(b"a,b\tc\n1,2\t3", Format::Csv);
    }

    #[test]
    fn sniff_json_vs_ndjson() {
        assert_sniffs(b"[{\"a\":1}]", Format::Json);
        assert_sniffs(b"{\"a\":1}\n{\"a\":2}\n", Format::Ndjson);
        assert_sniffs(b"{\"a\":1}", Format::Json);
    }

    #[test]
    fn sniff_csv_vs_tsv() {
        assert_sniffs(b"a,b,c\n1,2,3", Format::Csv);
        assert_sniffs(b"a\tb\tc\n1\t2\t3", Format::Tsv);
    }

    #[test]
    fn resolve_prefers_extension_then_sniff() {
        // A recognised extension wins even when the content looks like JSON.
        let by_extension = Format::resolve("data.csv", b"{\"a\":1}");
        assert_eq!(by_extension.unwrap(), Format::Csv);

        // No extension: fall back to the content.
        let by_content = Format::resolve("-", b"a,b\n1,2");
        assert_eq!(by_content.unwrap(), Format::Csv);

        // No extension and undecodable content: nothing left to try.
        let unknown = Format::resolve("-", &[0xff, 0xfe, 0x00]);
        assert!(unknown.is_err());
    }
}