zarr_datafusion/reader/
dtype.rs

1//! Data type parsing and conversion utilities for Zarr arrays
2//!
3//! Handles conversion between Zarr dtype strings and Arrow DataTypes.
4
5use arrow::datatypes::DataType;
6
/// Parse a Zarr v2 (NumPy-style) dtype string into a normalized type name.
///
/// The accepted layout is `[<>|]<type_char><byte_size>`:
///
/// * an optional byte-order marker (`<` little-endian, `>` big-endian,
///   `|` not applicable), which is ignored,
/// * a one-character type code: `i` (signed int), `u` (unsigned int),
///   `f` (float) or `b` (bool),
/// * the item size in bytes.
///
/// Examples: `"<i8"` -> `"int64"`, `"<f4"` -> `"float32"`, `"|b1"` -> `"bool"`.
///
/// This function never fails and never panics; malformed input degrades as
/// follows:
///
/// * no size suffix after the type code (including the empty string)
///   -> `"float64"`,
/// * a size suffix that is not a valid `u32` is treated as `8`,
/// * an unsupported size selects the 64-bit variant of the family,
/// * any other type code (including `S` and `U`) -> `"float64"`.
pub fn parse_v2_dtype(dtype: &str) -> String {
    const FALLBACK: &str = "float64";

    // Walk the input by `char` instead of slicing at fixed byte offsets: the
    // string comes from file metadata, and a multi-byte character in the type
    // position would otherwise put the slice index inside a code point.
    let mut rest = dtype.chars();

    let first = match rest.next() {
        Some(c) => c,
        None => return FALLBACK.to_string(),
    };

    // Skip the byte-order marker if present; the next char is the type code.
    let type_char = if matches!(first, '<' | '>' | '|') {
        match rest.next() {
            Some(c) => c,
            None => return FALLBACK.to_string(),
        }
    } else {
        first
    };

    // Everything after the type code is the byte size; it must be non-empty.
    let size_str = rest.as_str();
    if size_str.is_empty() {
        return FALLBACK.to_string();
    }

    // An unparsable size is deliberately treated as 8 bytes (lenient parsing).
    let size: u32 = size_str.parse().unwrap_or(8);

    let name = match (type_char, size) {
        ('i', 1) => "int8",
        ('i', 2) => "int16",
        ('i', 4) => "int32",
        ('i', _) => "int64",
        ('u', 1) => "uint8",
        ('u', 2) => "uint16",
        ('u', 4) => "uint32",
        ('u', _) => "uint64",
        ('f', 2) => "float16",
        ('f', 4) => "float32",
        ('f', _) => "float64",
        // The size is not consulted for booleans.
        ('b', _) => "bool",
        _ => FALLBACK,
    };
    name.to_string()
}
57
58/// Convert Zarr dtype string to Arrow DataType
59pub fn zarr_dtype_to_arrow(dtype: &str) -> DataType {
60    match dtype {
61        "int8" => DataType::Int8,
62        "int16" => DataType::Int16,
63        "int32" => DataType::Int32,
64        "int64" => DataType::Int64,
65        "uint8" => DataType::UInt8,
66        "uint16" => DataType::UInt16,
67        "uint32" => DataType::UInt32,
68        "uint64" => DataType::UInt64,
69        "float16" => DataType::Float16,
70        "float32" => DataType::Float32,
71        "float64" => DataType::Float64,
72        "bool" => DataType::Boolean,
73        _ => DataType::Utf8,
74    }
75}
76
77/// Convert Zarr dtype to Arrow Dictionary type for coordinates
78/// Uses Int16 keys (supports up to 32K unique values) with the value type from Zarr
79pub fn zarr_dtype_to_arrow_dictionary(dtype: &str) -> DataType {
80    let value_type = zarr_dtype_to_arrow(dtype);
81    DataType::Dictionary(Box::new(DataType::Int16), Box::new(value_type))
82}
83
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that each raw v2 dtype string parses to the expected name.
    fn check_parse(cases: &[(&str, &str)]) {
        for &(raw, expected) in cases {
            assert_eq!(parse_v2_dtype(raw), expected, "dtype {:?}", raw);
        }
    }

    // Every supported type code / size combination, little-endian or `|`.
    #[test]
    fn test_parse_v2_dtype_all_types() {
        check_parse(&[
            ("<i1", "int8"),
            ("<i2", "int16"),
            ("<i4", "int32"),
            ("<i8", "int64"),
            ("<u1", "uint8"),
            ("<u2", "uint16"),
            ("<u4", "uint32"),
            ("<u8", "uint64"),
            ("<f2", "float16"),
            ("<f4", "float32"),
            ("<f8", "float64"),
            ("|b1", "bool"),
        ]);
    }

    // The byte-order marker must not influence the resulting type name.
    #[test]
    fn test_parse_v2_dtype_big_endian() {
        check_parse(&[(">i4", "int32"), (">f8", "float64")]);
    }

    // Inputs too short to carry a size fall back to "float64".
    #[test]
    fn test_parse_v2_dtype_edge_cases() {
        check_parse(&[
            ("", "float64"),
            ("x", "float64"),
            ("<", "float64"),
            ("<i", "float64"),
        ]);
    }

    // Each normalized name maps to its Arrow type; unknown names become Utf8.
    #[test]
    fn test_zarr_dtype_to_arrow_all_types() {
        let cases = [
            ("int8", DataType::Int8),
            ("int16", DataType::Int16),
            ("int32", DataType::Int32),
            ("int64", DataType::Int64),
            ("uint8", DataType::UInt8),
            ("uint16", DataType::UInt16),
            ("uint32", DataType::UInt32),
            ("uint64", DataType::UInt64),
            ("float16", DataType::Float16),
            ("float32", DataType::Float32),
            ("float64", DataType::Float64),
            ("bool", DataType::Boolean),
            ("unknown", DataType::Utf8),
        ];
        for (name, expected) in cases.iter() {
            assert_eq!(zarr_dtype_to_arrow(name), *expected, "dtype {:?}", name);
        }
    }
}