// array_format/array.rs

//! The [`ArrayElement`] trait: the element types that can be stored.
//!
//! A single generic code path handles both fixed-width primitives and
//! variable-length values by going through this trait, which provides
//! chunk-level encode/decode and fill-value generation.
6
7use zerocopy::IntoBytes;
8
9use crate::dtype::DType;
10use crate::layout::FillValue;
11use crate::timestamp::TimestampNs;
12
13// ── ArrayElement trait ──────────────────────────────────────────────
14
15/// Unified element type for all array operations.
16///
17/// Implemented by all fixed-width numeric primitives and variable-length
18/// types (`String`, `Vec<u8>`). The trait provides chunk-level encode/decode
19/// and fill-value generation so that a single generic code path handles both.
20pub trait ArrayElement: Clone + Send + Sync + 'static {
21    /// The [`DType`] this Rust type maps to on disk.
22    const DTYPE: DType;
23    /// Encodes a chunk's worth of values into the on-disk byte representation.
24    fn encode_chunk(values: &[Self]) -> Vec<u8>;
25    /// Decodes bytes produced by [`encode_chunk`](Self::encode_chunk) back into
26    /// values.
27    fn decode_chunk(bytes: &[u8]) -> Vec<Self>;
28    /// Returns the element used for unwritten positions, derived from the
29    /// array's `fill` value (or this type's natural default when `fill` is
30    /// `None` or not applicable).
31    fn fill_element(fill: Option<&FillValue>) -> Self;
32}
33
34// ── Shared helpers for fixed-width types ─────────────────────────────
35
/// Serializes a slice of plain fixed-width values by copying its in-memory
/// representation byte for byte.
///
/// The result is `size_of::<T>() * values.len()` bytes long and uses the
/// host's native byte order. An empty slice yields an empty vector.
///
/// `T` must be plain data without padding bytes. The `Copy` bound keeps
/// owning types out; the absence of padding cannot be expressed as a bound
/// and is upheld by the call sites in this file, which only use numeric
/// primitives.
fn encode_copy<T: Copy>(values: &[T]) -> Vec<u8> {
    let byte_len = std::mem::size_of_val(values);
    // SAFETY: `values` is a live, initialized slice occupying exactly
    // `byte_len` bytes, `u8` has alignment 1, and the shared borrow is held
    // for the whole lifetime of the byte view. The padding-free element types
    // used by callers guarantee every viewed byte is initialized.
    let raw = unsafe { std::slice::from_raw_parts(values.as_ptr().cast::<u8>(), byte_len) };
    raw.to_vec()
}
40
/// Rebuilds fixed-width values from bytes produced by `encode_copy`.
///
/// Decodes `bytes.len() / size_of::<T>()` elements in native byte order.
/// Trailing bytes that do not make up a whole element are ignored, and an
/// empty input (or a zero-sized `T`) yields an empty vector.
///
/// `T` must be plain data for which every bit pattern is a valid value. The
/// `Copy` bound keeps owning types (anything with a destructor) out; bit
/// validity cannot be expressed as a bound and is upheld by the call sites in
/// this file, which only use numeric primitives.
fn decode_copy<T: Copy>(bytes: &[u8]) -> Vec<T> {
    let elem_size = std::mem::size_of::<T>();
    if bytes.is_empty() || elem_size == 0 {
        return Vec::new();
    }
    let count = bytes.len() / elem_size;
    let mut out: Vec<T> = Vec::with_capacity(count);
    // SAFETY:
    // * `bytes` holds at least `count * elem_size` readable bytes because
    //   `count` is the floor of `bytes.len() / elem_size`.
    // * `out` owns a freshly allocated, `T`-aligned buffer with room for
    //   `count` elements, so the destination is writable for the same number
    //   of bytes and cannot overlap the borrowed input.
    // * After the copy the first `count` elements are fully initialized, and
    //   callers only use element types for which any bit pattern is valid,
    //   so `set_len(count)` exposes only valid values.
    unsafe {
        std::ptr::copy_nonoverlapping(
            bytes.as_ptr(),
            out.as_mut_ptr().cast::<u8>(),
            count * elem_size,
        );
        out.set_len(count);
    }
    out
}
55
56// ── Numeric implementations ──────────────────────────────────────────
57
// Implements `ArrayElement` for an unsigned integer type.
//
// `$t` is the Rust type and `$dtype` the `DType` expression it maps to.
// Chunks are encoded and decoded as raw fixed-width copies. The fill element
// is taken from a `UInt`, `Int`, `Float`, or `Bool` fill value using `as`
// conversion, and is `0` for a missing or any other fill value.
macro_rules! impl_element_uint {
    ($t:ty, $dtype:expr) => {
        impl ArrayElement for $t {
            const DTYPE: DType = $dtype;

            fn encode_chunk(values: &[Self]) -> Vec<u8> {
                encode_copy(values)
            }

            fn decode_chunk(bytes: &[u8]) -> Vec<Self> {
                decode_copy(bytes)
            }

            fn fill_element(fill: Option<&FillValue>) -> Self {
                match fill {
                    None => 0,
                    Some(value) => match value {
                        FillValue::UInt(v) => *v as $t,
                        FillValue::Int(v) => *v as $t,
                        FillValue::Float(v) => *v as $t,
                        // `true` becomes 1 and `false` becomes 0.
                        FillValue::Bool(v) => *v as u8 as $t,
                        _ => 0,
                    },
                }
            }
        }
    };
}
80
// Implements `ArrayElement` for a signed integer type.
//
// `$t` is the Rust type and `$dtype` the `DType` expression it maps to.
// Chunks are encoded and decoded as raw fixed-width copies. The fill element
// is taken from an `Int`, `UInt`, or `Float` fill value using `as`
// conversion, and is `0` for a missing or any other fill value.
macro_rules! impl_element_int {
    ($t:ty, $dtype:expr) => {
        impl ArrayElement for $t {
            const DTYPE: DType = $dtype;

            fn encode_chunk(values: &[Self]) -> Vec<u8> {
                encode_copy(values)
            }

            fn decode_chunk(bytes: &[u8]) -> Vec<Self> {
                decode_copy(bytes)
            }

            fn fill_element(fill: Option<&FillValue>) -> Self {
                match fill {
                    None => 0,
                    Some(value) => match value {
                        FillValue::Int(v) => *v as $t,
                        FillValue::UInt(v) => *v as $t,
                        FillValue::Float(v) => *v as $t,
                        _ => 0,
                    },
                }
            }
        }
    };
}
102
// Implements `ArrayElement` for a floating-point type.
//
// `$t` is the Rust type and `$dtype` the `DType` expression it maps to.
// Chunks are encoded and decoded as raw fixed-width copies. The fill element
// is taken from a `Float`, `Int`, or `UInt` fill value using `as`
// conversion, and is `0.0` for a missing or any other fill value.
macro_rules! impl_element_float {
    ($t:ty, $dtype:expr) => {
        impl ArrayElement for $t {
            const DTYPE: DType = $dtype;

            fn encode_chunk(values: &[Self]) -> Vec<u8> {
                encode_copy(values)
            }

            fn decode_chunk(bytes: &[u8]) -> Vec<Self> {
                decode_copy(bytes)
            }

            fn fill_element(fill: Option<&FillValue>) -> Self {
                match fill {
                    None => 0.0,
                    Some(value) => match value {
                        FillValue::Float(v) => *v as $t,
                        FillValue::Int(v) => *v as $t,
                        FillValue::UInt(v) => *v as $t,
                        _ => 0.0,
                    },
                }
            }
        }
    };
}
124
125impl_element_uint!(u8, DType::UInt8);
126impl_element_uint!(u16, DType::UInt16);
127impl_element_uint!(u32, DType::UInt32);
128impl_element_uint!(u64, DType::UInt64);
129impl_element_int!(i8, DType::Int8);
130impl_element_int!(i16, DType::Int16);
131impl_element_int!(i32, DType::Int32);
132impl_element_int!(i64, DType::Int64);
133impl_element_float!(f32, DType::Float32);
134impl_element_float!(f64, DType::Float64);
135
136// ── TimestampNs (wrapper around i64, zerocopy-backed) ────────────────
137
138impl ArrayElement for TimestampNs {
139    const DTYPE: DType = DType::TimestampNs;
140
141    fn encode_chunk(values: &[Self]) -> Vec<u8> {
142        values.as_bytes().to_vec()
143    }
144
145    fn decode_chunk(bytes: &[u8]) -> Vec<Self> {
146        let elem = std::mem::size_of::<Self>();
147        let n = bytes.len() / elem;
148        let mut out = vec![Self(0); n];
149        if n > 0 {
150            out.as_mut_bytes().copy_from_slice(&bytes[..n * elem]);
151        }
152        out
153    }
154
155    fn fill_element(fill: Option<&FillValue>) -> Self {
156        match fill {
157            Some(FillValue::TimestampNs(v)) => Self(*v),
158            Some(FillValue::Int(v)) => Self(*v),
159            _ => Self(0),
160        }
161    }
162}
163
164// ── Variable-length helpers ──────────────────────────────────────────
165
/// Encodes a sequence of byte slices into the offset-buffer format:
/// `[N+1 u32 LE offsets][concatenated values]`.
///
/// Offset `i` is the position, relative to the start of the value region,
/// at which value `i` begins; the final offset is the total payload length.
/// An empty sequence therefore encodes to a single zero offset.
///
/// # Panics
///
/// Panics if the combined length of all values does not fit in a `u32`.
/// Offsets are stored as `u32`, so a larger payload cannot be represented;
/// truncating the offsets would silently write a corrupt chunk.
fn encode_offsets<'a>(slices: impl Iterator<Item = &'a [u8]>) -> Vec<u8> {
    let slices: Vec<&[u8]> = slices.collect();

    // Validate the total size before allocating or writing anything.
    let payload_len = slices
        .iter()
        .try_fold(0u32, |total, s| {
            total.checked_add(u32::try_from(s.len()).ok()?)
        })
        .expect("variable-length chunk payload exceeds u32::MAX bytes");

    let header_len = (slices.len() + 1) * 4;
    let mut data = Vec::with_capacity(header_len + payload_len as usize);

    // Offset table: a leading 0 followed by the running end of each value.
    let mut end = 0u32;
    data.extend_from_slice(&end.to_le_bytes());
    for s in &slices {
        // Cannot wrap: the running total was checked against `u32` above.
        end += s.len() as u32;
        data.extend_from_slice(&end.to_le_bytes());
    }

    // Value region: every slice back to back, in input order.
    for s in &slices {
        data.extend_from_slice(s);
    }
    data
}
184
/// Decodes an offset-buffer chunk back to a `Vec<Vec<u8>>`.
///
/// The buffer layout is `[N+1 u32 LE offsets][concatenated values]`. The
/// value count `N` is not stored; it is recovered by scanning the offset
/// table for the first entry at which "header length so far + that offset"
/// equals the buffer length, i.e. the entry holding the total payload size.
///
/// Returns an empty vector when `bytes` is not a well-formed chunk: too short
/// to hold two offsets, no terminating offset found, offsets that decrease,
/// or offsets that point past the payload. No input causes a panic.
fn decode_offsets(bytes: &[u8]) -> Vec<Vec<u8>> {
    /// Size in bytes of one stored offset.
    const OFFSET_WIDTH: usize = 4;

    /// Reads the `index`-th little-endian `u32` from the front of `bytes`,
    /// or `None` if it lies beyond the end of the buffer.
    fn read_offset(bytes: &[u8], index: usize) -> Option<usize> {
        let start = index.checked_mul(OFFSET_WIDTH)?;
        let raw = bytes.get(start..start.checked_add(OFFSET_WIDTH)?)?;
        Some(u32::from_le_bytes([raw[0], raw[1], raw[2], raw[3]]) as usize)
    }

    /// Strict decoder: `None` for any structural inconsistency.
    fn parse(bytes: &[u8]) -> Option<Vec<Vec<u8>>> {
        // Find the value count by locating the terminating offset. Running
        // off the end of the buffer means there is no terminator.
        let mut count = 1usize;
        let payload = loop {
            let total = read_offset(bytes, count)?;
            // Cannot overflow: the read above proved this many bytes exist.
            let header_len = (count + 1) * OFFSET_WIDTH;
            if header_len.checked_add(total) == Some(bytes.len()) {
                break &bytes[header_len..];
            }
            count += 1;
        };

        // Slice out each value; `get` rejects reversed or out-of-range spans.
        let mut values = Vec::with_capacity(count);
        let mut start = read_offset(bytes, 0)?;
        for index in 1..=count {
            let end = read_offset(bytes, index)?;
            values.push(payload.get(start..end)?.to_vec());
            start = end;
        }
        Some(values)
    }

    parse(bytes).unwrap_or_default()
}
216
217// ── String ───────────────────────────────────────────────────────────
218
219impl ArrayElement for String {
220    const DTYPE: DType = DType::String;
221
222    fn encode_chunk(values: &[Self]) -> Vec<u8> {
223        encode_offsets(values.iter().map(|s| s.as_bytes()))
224    }
225
226    fn decode_chunk(bytes: &[u8]) -> Vec<Self> {
227        decode_offsets(bytes)
228            .into_iter()
229            .map(|b| String::from_utf8_lossy(&b).into_owned())
230            .collect()
231    }
232
233    fn fill_element(_fill: Option<&FillValue>) -> Self {
234        String::new()
235    }
236}
237
238// ── Vec<u8> ──────────────────────────────────────────────────────────
239
240impl ArrayElement for Vec<u8> {
241    const DTYPE: DType = DType::Binary;
242
243    fn encode_chunk(values: &[Self]) -> Vec<u8> {
244        encode_offsets(values.iter().map(|v| v.as_slice()))
245    }
246
247    fn decode_chunk(bytes: &[u8]) -> Vec<Self> {
248        decode_offsets(bytes)
249    }
250
251    fn fill_element(_fill: Option<&FillValue>) -> Self {
252        Vec::new()
253    }
254}
255
#[cfg(test)]
mod tests {
    use super::*;

    // `f32` values survive an encode/decode cycle at 4 bytes per element.
    #[test]
    fn numeric_roundtrip_f32() {
        let original = vec![1.0f32, 2.5, 3.5];
        let encoded = f32::encode_chunk(&original);
        assert_eq!(encoded.len(), 12);
        assert_eq!(f32::decode_chunk(&encoded), original);
    }

    // Signed values, including negatives and the maximum, round-trip.
    #[test]
    fn numeric_roundtrip_i32() {
        let original = vec![-1i32, 0, 42, i32::MAX];
        let encoded = i32::encode_chunk(&original);
        assert_eq!(i32::decode_chunk(&encoded), original);
    }

    // Strings round-trip, including an empty one in the middle.
    #[test]
    fn string_roundtrip() {
        let original: Vec<String> = ["hello", "", "world!"]
            .iter()
            .map(|s| s.to_string())
            .collect();
        let encoded = String::encode_chunk(&original);
        assert_eq!(String::decode_chunk(&encoded), original);
    }

    // Byte vectors round-trip, including an empty one in the middle.
    #[test]
    fn binary_roundtrip() {
        let original = vec![vec![1u8, 2, 3], vec![], vec![255]];
        let encoded = Vec::<u8>::encode_chunk(&original);
        assert_eq!(Vec::<u8>::decode_chunk(&encoded), original);
    }

    // Numeric fill elements follow the fill value, or default to zero.
    #[test]
    fn fill_element_numeric() {
        let int_fill = FillValue::Int(-7);
        assert_eq!(i32::fill_element(Some(&int_fill)), -7i32);

        let float_fill = FillValue::Float(1.5);
        assert_eq!(f64::fill_element(Some(&float_fill)), 1.5f64);

        assert_eq!(u8::fill_element(None), 0u8);
    }

    // Variable-length types always fill with an empty value.
    #[test]
    fn fill_element_vlen_ignores_fill() {
        let unrelated = FillValue::Int(99);
        assert_eq!(String::fill_element(Some(&unrelated)), "");
        assert_eq!(Vec::<u8>::fill_element(None), Vec::<u8>::new());
    }

    // Empty input decodes to an empty vector for both element kinds.
    #[test]
    fn decode_empty() {
        let no_bytes: &[u8] = &[];
        assert_eq!(i32::decode_chunk(no_bytes), Vec::<i32>::new());
        assert_eq!(String::decode_chunk(no_bytes), Vec::<String>::new());
    }

    // Timestamps round-trip at 8 bytes each across the full `i64` range.
    #[test]
    fn timestamp_roundtrip() {
        let original = vec![
            TimestampNs(0),
            TimestampNs(1_700_000_000_000_000_000),
            TimestampNs(-1),
            TimestampNs(i64::MAX),
            TimestampNs(i64::MIN),
        ];
        let encoded = TimestampNs::encode_chunk(&original);
        assert_eq!(encoded.len(), original.len() * 8);
        assert_eq!(TimestampNs::decode_chunk(&encoded), original);
    }

    // Timestamp fill accepts `TimestampNs` and `Int`, else defaults to zero.
    #[test]
    fn timestamp_fill_element() {
        let from_timestamp = FillValue::TimestampNs(123);
        assert_eq!(
            TimestampNs::fill_element(Some(&from_timestamp)),
            TimestampNs(123)
        );

        let from_int = FillValue::Int(7);
        assert_eq!(TimestampNs::fill_element(Some(&from_int)), TimestampNs(7));

        assert_eq!(TimestampNs::fill_element(None), TimestampNs(0));
    }
}