Skip to main content

anomalyx_normalize/
format.rs

1//! Format identification: by extension when we have a path, by content sniff
2//! for stdin. Detection is conservative — an unrecognized stream is an
3//! [`AxError::UnknownFormat`], never a silent guess.
4
5use ax_core::AxError;
6
/// Every input encoding the text normalizer is able to identify.
///
/// Variants are plain, field-less tags, so values are freely copied and
/// compared.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Format {
    /// Comma-separated values.
    Csv,
    /// Tab-separated values.
    Tsv,
    /// Newline-delimited JSON: each line carries one JSON value.
    Ndjson,
    /// One JSON document — an array of objects, a single object, or an
    /// array of scalars.
    Json,
    /// Apache Parquet, a binary columnar format. Reading it requires the
    /// `polars` feature.
    Parquet,
    /// Apache Arrow IPC / Feather file, likewise binary columnar. Requires
    /// `polars`.
    Arrow,
}
21
22impl Format {
23    /// Stable token recorded in the envelope's `format` field.
24    pub fn token(self) -> &'static str {
25        match self {
26            Format::Csv => "csv",
27            Format::Tsv => "tsv",
28            Format::Ndjson => "ndjson",
29            Format::Json => "json",
30            Format::Parquet => "parquet",
31            Format::Arrow => "arrow",
32        }
33    }
34
35    /// Whether this format is binary columnar (read via the Polars backbone).
36    pub fn is_binary(self) -> bool {
37        matches!(self, Format::Parquet | Format::Arrow)
38    }
39
40    /// Picks a format from a file extension, if recognized.
41    pub fn from_extension(path: &str) -> Option<Format> {
42        let ext = path.rsplit('.').next()?.to_ascii_lowercase();
43        match ext.as_str() {
44            "csv" => Some(Format::Csv),
45            "tsv" | "tab" => Some(Format::Tsv),
46            "ndjson" | "jsonl" => Some(Format::Ndjson),
47            "json" => Some(Format::Json),
48            "parquet" | "pq" => Some(Format::Parquet),
49            "arrow" | "ipc" | "feather" => Some(Format::Arrow),
50            _ => None,
51        }
52    }
53
54    /// Sniffs a format from leading content. Binary magic numbers are checked
55    /// first (they are not valid UTF-8); then textual sniffing. `None` if
56    /// nothing matches.
57    pub fn sniff(bytes: &[u8]) -> Option<Format> {
58        // Parquet files begin (and end) with the 4-byte magic "PAR1".
59        if bytes.starts_with(b"PAR1") {
60            return Some(Format::Parquet);
61        }
62        // Arrow IPC files begin with "ARROW1".
63        if bytes.starts_with(b"ARROW1") {
64            return Some(Format::Arrow);
65        }
66        let text = std::str::from_utf8(bytes).ok()?;
67        let trimmed = text.trim_start();
68        let first = trimmed.chars().next()?;
69        match first {
70            '[' => Some(Format::Json),
71            '{' => {
72                // One object → json; multiple object-lines → ndjson.
73                let object_lines = trimmed
74                    .lines()
75                    .filter(|l| !l.trim().is_empty())
76                    .take(3)
77                    .filter(|l| l.trim_start().starts_with('{'))
78                    .count();
79                if object_lines >= 2 {
80                    Some(Format::Ndjson)
81                } else {
82                    Some(Format::Json)
83                }
84            }
85            _ => {
86                // Tabular: prefer TSV if a tab appears before any comma on line 1.
87                let line = trimmed.lines().next()?;
88                // Indices of '\t' and ',' are positions of distinct characters,
89                // so they are never equal; `<` is the only meaningful test.
90                match (line.find('\t'), line.find(',')) {
91                    (Some(t), Some(c)) if t < c => Some(Format::Tsv),
92                    (Some(_), None) => Some(Format::Tsv),
93                    _ => Some(Format::Csv), // comma-first, or single comma-free column
94                }
95            }
96        }
97    }
98
99    /// Resolves the format for `source`/`bytes`: extension first, then sniff.
100    pub fn resolve(source: &str, bytes: &[u8]) -> Result<Format, AxError> {
101        if let Some(f) = Format::from_extension(source) {
102            return Ok(f);
103        }
104        Format::sniff(bytes).ok_or_else(|| AxError::UnknownFormat(source.to_string()))
105    }
106}
107
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokens end up in the envelope, so each one is pinned literally.
    #[test]
    fn format_tokens_are_exact() {
        let expected = [
            (Format::Csv, "csv"),
            (Format::Tsv, "tsv"),
            (Format::Ndjson, "ndjson"),
            (Format::Json, "json"),
            (Format::Parquet, "parquet"),
            (Format::Arrow, "arrow"),
        ];
        for &(format, token) in &expected {
            assert_eq!(format.token(), token, "token for {:?}", format);
        }
    }

    /// Only the two columnar formats are binary; every text format is not.
    #[test]
    fn binary_classification() {
        for format in &[Format::Parquet, Format::Arrow] {
            assert!(format.is_binary(), "{:?} should be binary", format);
        }
        for format in &[Format::Csv, Format::Tsv, Format::Ndjson, Format::Json] {
            assert!(!format.is_binary(), "{:?} should be text", format);
        }
    }

    #[test]
    fn binary_extensions_and_magic() {
        let by_extension = [
            ("x.parquet", Format::Parquet),
            ("x.pq", Format::Parquet),
            ("x.feather", Format::Arrow),
            ("x.ipc", Format::Arrow),
            ("x.arrow", Format::Arrow),
        ];
        for &(path, format) in &by_extension {
            assert_eq!(Format::from_extension(path), Some(format), "{}", path);
        }
        // magic numbers win for extensionless input
        assert_eq!(Format::sniff(b"PAR1\x00\x01rest"), Some(Format::Parquet));
        assert_eq!(Format::sniff(b"ARROW1\x00\x00rest"), Some(Format::Arrow));
        // a CSV that merely mentions PAR1 later is still CSV
        assert_eq!(Format::sniff(b"a,b\nPAR1,2"), Some(Format::Csv));
    }

    #[test]
    fn extension_detection() {
        let cases = [
            ("a/b.csv", Some(Format::Csv)),
            ("DATA.CSV", Some(Format::Csv)),
            ("x.tsv", Some(Format::Tsv)),
            ("x.tab", Some(Format::Tsv)),
            ("x.json", Some(Format::Json)),
            ("x.ndjson", Some(Format::Ndjson)),
            ("x.JSONL", Some(Format::Ndjson)),
            ("x.xlsx", None),
            ("archive.tar.gz", None),
            ("noext", None),
        ];
        for &(path, expected) in &cases {
            assert_eq!(Format::from_extension(path), expected, "{}", path);
        }
    }

    #[test]
    fn sniff_uses_delimiter_order_when_both_present() {
        // tab before comma → TSV; comma before tab → CSV.
        assert_eq!(Format::sniff(b"a\tb,c\n1\t2,3"), Some(Format::Tsv));
        assert_eq!(Format::sniff(b"a,b\tc\n1,2\t3"), Some(Format::Csv));
    }

    #[test]
    fn sniff_json_vs_ndjson() {
        assert_eq!(Format::sniff(b"[{\"a\":1}]"), Some(Format::Json));
        assert_eq!(
            Format::sniff(b"{\"a\":1}\n{\"a\":2}\n"),
            Some(Format::Ndjson)
        );
        assert_eq!(Format::sniff(b"{\"a\":1}"), Some(Format::Json));
        // blank lines between records do not hide the second object
        assert_eq!(
            Format::sniff(b"{\"a\":1}\n\n{\"a\":2}\n"),
            Some(Format::Ndjson)
        );
        // a pretty-printed single object has only one line opening with `{`
        assert_eq!(Format::sniff(b"{\n  \"a\": 1\n}\n"), Some(Format::Json));
        // leading whitespace is skipped before the first character is read
        assert_eq!(Format::sniff(b"  \n[1,2]"), Some(Format::Json));
    }

    #[test]
    fn sniff_csv_vs_tsv() {
        assert_eq!(Format::sniff(b"a,b,c\n1,2,3"), Some(Format::Csv));
        assert_eq!(Format::sniff(b"a\tb\tc\n1\t2\t3"), Some(Format::Tsv));
        // a single column without any delimiter falls back to CSV
        assert_eq!(Format::sniff(b"value\n1\n2"), Some(Format::Csv));
    }

    /// Nothing to look at means no guess at all.
    #[test]
    fn sniff_rejects_empty_and_blank_input() {
        assert_eq!(Format::sniff(b""), None);
        assert_eq!(Format::sniff(b" \n\t "), None);
        assert_eq!(Format::sniff(&[0xff, 0xfe, 0x00]), None);
    }

    #[test]
    fn resolve_prefers_extension_then_sniff() {
        // extension wins even if content looks like something else
        assert_eq!(
            Format::resolve("data.csv", b"{\"a\":1}").unwrap(),
            Format::Csv
        );
        // no extension → sniff
        assert_eq!(Format::resolve("-", b"a,b\n1,2").unwrap(), Format::Csv);
        assert!(Format::resolve("-", &[0xff, 0xfe, 0x00]).is_err());
        // unrecognized input surfaces as the dedicated error variant
        assert!(matches!(
            Format::resolve("-", b""),
            Err(AxError::UnknownFormat(_))
        ));
    }
}