use ax_core::AxError;
/// Data formats that can be identified from a source name or from leading
/// content bytes.
///
/// Each variant has a canonical lowercase token (see `Format::token`) and is
/// classified as either textual or binary (see `Format::is_binary`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Format {
    /// Comma-separated values (`.csv`).
    Csv,
    /// Tab-separated values (`.tsv`, `.tab`).
    Tsv,
    /// Newline-delimited JSON (`.ndjson`, `.jsonl`).
    Ndjson,
    /// A single JSON document (`.json`).
    Json,
    /// Parquet, a binary format (`.parquet`, `.pq`).
    Parquet,
    /// Arrow, a binary format (`.arrow`, `.ipc`, `.feather`).
    Arrow,
}
impl Format {
    /// Returns the canonical lowercase token for this format
    /// (for example `"csv"` or `"parquet"`).
    pub fn token(self) -> &'static str {
        match self {
            Format::Csv => "csv",
            Format::Tsv => "tsv",
            Format::Ndjson => "ndjson",
            Format::Json => "json",
            Format::Parquet => "parquet",
            Format::Arrow => "arrow",
        }
    }

    /// Returns `true` for the binary formats (`Parquet` and `Arrow`) and
    /// `false` for the text formats.
    pub fn is_binary(self) -> bool {
        matches!(self, Format::Parquet | Format::Arrow)
    }

    /// Guesses the format from the text after the last `.` in `path`,
    /// compared case-insensitively (ASCII).
    ///
    /// Returns `None` when `path` contains no `.` at all or when the
    /// extension is not recognised. A bare name such as `"csv"` therefore has
    /// no extension, while `".csv"` and `"dir/data.CSV"` both map to
    /// [`Format::Csv`].
    pub fn from_extension(path: &str) -> Option<Format> {
        // Require a real '.' separator. Taking the last '.'-delimited segment
        // unconditionally would treat an extension-less name like "json" as
        // its own extension and bypass content sniffing in `resolve`.
        let (_, ext) = path.rsplit_once('.')?;
        match ext.to_ascii_lowercase().as_str() {
            "csv" => Some(Format::Csv),
            "tsv" | "tab" => Some(Format::Tsv),
            "ndjson" | "jsonl" => Some(Format::Ndjson),
            "json" => Some(Format::Json),
            "parquet" | "pq" => Some(Format::Parquet),
            "arrow" | "ipc" | "feather" => Some(Format::Arrow),
            _ => None,
        }
    }

    /// Guesses the format from the leading bytes of the content.
    ///
    /// Checks, in order:
    /// 1. the byte prefixes `PAR1` (Parquet) and `ARROW1` (Arrow);
    /// 2. for valid UTF-8 text, the first non-whitespace character
    ///    (after an optional UTF-8 byte-order mark):
    ///    * `[` means JSON;
    ///    * `{` means NDJSON when at least two of the first three non-blank
    ///      lines start with `{`, otherwise JSON;
    ///    * anything else is classified by the first line's delimiters: TSV
    ///      when a tab occurs and no comma precedes it, otherwise CSV.
    ///
    /// Returns `None` when the bytes are not valid UTF-8 (and carry no known
    /// magic prefix) or contain nothing but whitespace.
    pub fn sniff(bytes: &[u8]) -> Option<Format> {
        if bytes.starts_with(b"PAR1") {
            return Some(Format::Parquet);
        }
        if bytes.starts_with(b"ARROW1") {
            return Some(Format::Arrow);
        }

        let text = std::str::from_utf8(bytes).ok()?;
        // U+FEFF is not whitespace, so `trim_start` would leave a byte-order
        // mark in place and BOM-prefixed JSON would be mistaken for CSV.
        let text = text.strip_prefix('\u{feff}').unwrap_or(text);
        let trimmed = text.trim_start();
        let first = trimmed.chars().next()?;

        match first {
            '[' => Some(Format::Json),
            '{' => {
                // Look at up to three non-blank lines; several of them opening
                // an object indicates one JSON value per line.
                let object_lines = trimmed
                    .lines()
                    .filter(|l| !l.trim().is_empty())
                    .take(3)
                    .filter(|l| l.trim_start().starts_with('{'))
                    .count();
                if object_lines >= 2 {
                    Some(Format::Ndjson)
                } else {
                    Some(Format::Json)
                }
            }
            _ => {
                let line = trimmed.lines().next()?;
                // Whichever delimiter appears first on the header line wins;
                // a line with neither delimiter defaults to CSV.
                match (line.find('\t'), line.find(',')) {
                    (Some(tab), Some(comma)) if tab < comma => Some(Format::Tsv),
                    (Some(_), None) => Some(Format::Tsv),
                    _ => Some(Format::Csv),
                }
            }
        }
    }

    /// Determines the format of `source`, preferring its extension and
    /// falling back to sniffing `bytes`.
    ///
    /// # Errors
    ///
    /// Returns `AxError::UnknownFormat` carrying `source` when neither the
    /// extension nor the content identifies a format.
    pub fn resolve(source: &str, bytes: &[u8]) -> Result<Format, AxError> {
        if let Some(f) = Format::from_extension(source) {
            return Ok(f);
        }
        Format::sniff(bytes).ok_or_else(|| AxError::UnknownFormat(source.to_string()))
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `path` resolves to `want` by extension alone.
    #[track_caller]
    fn check_ext(path: &str, want: Option<Format>) {
        assert_eq!(Format::from_extension(path), want);
    }

    /// Asserts that sniffing `input` yields exactly `want`.
    #[track_caller]
    fn check_sniff(input: &[u8], want: Format) {
        assert_eq!(Format::sniff(input), Some(want));
    }

    /// Every variant reports its exact lowercase token.
    #[test]
    fn format_tokens_are_exact() {
        let table = [
            (Format::Csv, "csv"),
            (Format::Tsv, "tsv"),
            (Format::Ndjson, "ndjson"),
            (Format::Json, "json"),
            (Format::Parquet, "parquet"),
            (Format::Arrow, "arrow"),
        ];
        for &(format, token) in table.iter() {
            assert_eq!(format.token(), token);
        }
    }

    /// Parquet and Arrow are binary; the sampled text formats are not.
    #[test]
    fn binary_classification() {
        for &binary in [Format::Parquet, Format::Arrow].iter() {
            assert!(binary.is_binary());
        }
        for &textual in [Format::Csv, Format::Json].iter() {
            assert!(!textual.is_binary());
        }
    }

    /// Binary formats are found by extension and by magic prefix, and a magic
    /// string that is not at the very start is ignored.
    #[test]
    fn binary_extensions_and_magic() {
        check_ext("x.parquet", Some(Format::Parquet));
        check_ext("x.feather", Some(Format::Arrow));
        check_ext("x.ipc", Some(Format::Arrow));
        check_sniff(b"PAR1\x00\x01rest", Format::Parquet);
        check_sniff(b"ARROW1\x00\x00rest", Format::Arrow);
        check_sniff(b"a,b\nPAR1,2", Format::Csv);
    }

    /// Text extensions are matched case-insensitively; unknown or missing
    /// extensions give `None`.
    #[test]
    fn extension_detection() {
        let known = [
            ("a/b.csv", Format::Csv),
            ("x.tsv", Format::Tsv),
            ("x.tab", Format::Tsv),
            ("x.json", Format::Json),
            ("x.JSONL", Format::Ndjson),
        ];
        for &(path, format) in known.iter() {
            check_ext(path, Some(format));
        }
        check_ext("x.xlsx", None);
        check_ext("noext", None);
    }

    /// When a header line holds both delimiters, the earlier one decides.
    #[test]
    fn sniff_uses_delimiter_order_when_both_present() {
        check_sniff(b"a\tb,c\n1\t2,3", Format::Tsv);
        check_sniff(b"a,b\tc\n1,2\t3", Format::Csv);
    }

    /// Arrays and single objects are JSON; several object lines are NDJSON.
    #[test]
    fn sniff_json_vs_ndjson() {
        check_sniff(b"[{\"a\":1}]", Format::Json);
        check_sniff(b"{\"a\":1}\n{\"a\":2}\n", Format::Ndjson);
        check_sniff(b"{\"a\":1}", Format::Json);
    }

    /// Pure comma or pure tab content is classified by its delimiter.
    #[test]
    fn sniff_csv_vs_tsv() {
        check_sniff(b"a,b,c\n1,2,3", Format::Csv);
        check_sniff(b"a\tb\tc\n1\t2\t3", Format::Tsv);
    }

    /// The extension wins over content; without one the content is sniffed,
    /// and undecodable content is an error.
    #[test]
    fn resolve_prefers_extension_then_sniff() {
        let by_extension = Format::resolve("data.csv", b"{\"a\":1}");
        assert_eq!(by_extension.unwrap(), Format::Csv);

        let by_content = Format::resolve("-", b"a,b\n1,2");
        assert_eq!(by_content.unwrap(), Format::Csv);

        let undecodable = Format::resolve("-", &[0xff, 0xfe, 0x00]);
        assert!(undecodable.is_err());
    }
}