Skip to main content

array_format/
layout.rs

1//! Array layout definitions and array metadata.
2//!
3//! Each array stored in the file has an [`ArrayMeta`] entry in the footer
4//! describing its type, dimensions, and how its data is laid out across blocks.
5
6use rkyv::{Archive, Deserialize, Serialize};
7
8use crate::address::ChunkAddress;
9use crate::dtype::DType;
10
11/// A scalar value for a per-array key-value attribute.
12#[derive(Debug, Clone, Archive, Serialize, Deserialize)]
13pub enum AttributeValue {
14    /// Boolean value.
15    Bool(bool),
16    /// Signed 8-bit integer.
17    Int8(i8),
18    /// Signed 16-bit integer.
19    Int16(i16),
20    /// Signed 32-bit integer.
21    Int32(i32),
22    /// Signed 64-bit integer.
23    Int64(i64),
24    /// Unsigned 8-bit integer.
25    UInt8(u8),
26    /// Unsigned 16-bit integer.
27    UInt16(u16),
28    /// Unsigned 32-bit integer.
29    UInt32(u32),
30    /// Unsigned 64-bit integer.
31    UInt64(u64),
32    /// 32-bit floating point.
33    Float32(f32),
34    /// 64-bit floating point.
35    Float64(f64),
36    /// UTF-8 string.
37    String(String),
38}
39
40impl PartialEq for AttributeValue {
41    fn eq(&self, other: &Self) -> bool {
42        match (self, other) {
43            (Self::Bool(a), Self::Bool(b)) => a == b,
44            (Self::Int8(a), Self::Int8(b)) => a == b,
45            (Self::Int16(a), Self::Int16(b)) => a == b,
46            (Self::Int32(a), Self::Int32(b)) => a == b,
47            (Self::Int64(a), Self::Int64(b)) => a == b,
48            (Self::UInt8(a), Self::UInt8(b)) => a == b,
49            (Self::UInt16(a), Self::UInt16(b)) => a == b,
50            (Self::UInt32(a), Self::UInt32(b)) => a == b,
51            (Self::UInt64(a), Self::UInt64(b)) => a == b,
52            (Self::Float32(a), Self::Float32(b)) => a.to_bits() == b.to_bits(),
53            (Self::Float64(a), Self::Float64(b)) => a.to_bits() == b.to_bits(),
54            (Self::String(a), Self::String(b)) => a == b,
55            _ => false,
56        }
57    }
58}
59
60/// Controls the integer width used for attribute key/value dictionary indices.
61#[derive(Debug, Clone, Copy, PartialEq, Default, Archive, Serialize, Deserialize)]
62pub enum AttrIndexKind {
63    /// 16-bit dictionary indices (the default).
64    #[default]
65    U16,
66    /// 32-bit dictionary indices.
67    U32,
68    /// 64-bit dictionary indices.
69    U64,
70}
71
72/// Per-array attribute entries as pairs of dictionary indices.
73///
74/// The variant determines the storage width per index. All entries are sorted
75/// by key index to allow O(log n) binary search. Arrays in the same file may
76/// use different variants; each is self-describing.
77#[derive(Debug, Clone, PartialEq, Archive, Serialize, Deserialize)]
78pub enum Attributes {
79    /// `(key_index, value_index)` pairs stored as 16-bit indices.
80    U16(Vec<(u16, u16)>),
81    /// `(key_index, value_index)` pairs stored as 32-bit indices.
82    U32(Vec<(u32, u32)>),
83    /// `(key_index, value_index)` pairs stored as 64-bit indices.
84    U64(Vec<(u64, u64)>),
85}
86
87impl Attributes {
88    /// Returns an empty attribute set for the given index kind.
89    pub fn empty(kind: AttrIndexKind) -> Self {
90        match kind {
91            AttrIndexKind::U16 => Self::U16(Vec::new()),
92            AttrIndexKind::U32 => Self::U32(Vec::new()),
93            AttrIndexKind::U64 => Self::U64(Vec::new()),
94        }
95    }
96
97    /// Returns the value index for `key_idx`, or `None` if absent.
98    pub fn get(&self, key_idx: usize) -> Option<usize> {
99        match self {
100            Self::U16(v) => v
101                .binary_search_by_key(&(key_idx as u16), |(k, _)| *k)
102                .ok()
103                .map(|pos| v[pos].1 as usize),
104            Self::U32(v) => v
105                .binary_search_by_key(&(key_idx as u32), |(k, _)| *k)
106                .ok()
107                .map(|pos| v[pos].1 as usize),
108            Self::U64(v) => v
109                .binary_search_by_key(&(key_idx as u64), |(k, _)| *k)
110                .ok()
111                .map(|pos| v[pos].1 as usize),
112        }
113    }
114
115    /// Inserts or replaces the value index for `key_idx`, keeping entries sorted.
116    pub fn upsert(&mut self, key_idx: usize, val_idx: usize) {
117        match self {
118            Self::U16(v) => {
119                let (ki, vi) = (key_idx as u16, val_idx as u16);
120                match v.binary_search_by_key(&ki, |(k, _)| *k) {
121                    Ok(pos) => v[pos].1 = vi,
122                    Err(pos) => v.insert(pos, (ki, vi)),
123                }
124            }
125            Self::U32(v) => {
126                let (ki, vi) = (key_idx as u32, val_idx as u32);
127                match v.binary_search_by_key(&ki, |(k, _)| *k) {
128                    Ok(pos) => v[pos].1 = vi,
129                    Err(pos) => v.insert(pos, (ki, vi)),
130                }
131            }
132            Self::U64(v) => {
133                let (ki, vi) = (key_idx as u64, val_idx as u64);
134                match v.binary_search_by_key(&ki, |(k, _)| *k) {
135                    Ok(pos) => v[pos].1 = vi,
136                    Err(pos) => v.insert(pos, (ki, vi)),
137                }
138            }
139        }
140    }
141
142    /// Iterates over all `(key_index, value_index)` pairs as `usize`.
143    pub fn iter_entries(&self) -> Box<dyn Iterator<Item = (usize, usize)> + '_> {
144        match self {
145            Self::U16(v) => Box::new(v.iter().map(|&(k, v)| (k as usize, v as usize))),
146            Self::U32(v) => Box::new(v.iter().map(|&(k, v)| (k as usize, v as usize))),
147            Self::U64(v) => Box::new(v.iter().map(|&(k, v)| (k as usize, v as usize))),
148        }
149    }
150
151    /// Returns the maximum index value that fits in this variant.
152    pub fn max_index(&self) -> usize {
153        match self {
154            Self::U16(_) => u16::MAX as usize,
155            Self::U32(_) => u32::MAX as usize,
156            Self::U64(_) => usize::MAX,
157        }
158    }
159}
160
161/// Describes how an array's data is distributed across blocks.
162///
163/// Carries the array's global shape and dimension names so callers always
164/// know the full n-dimensional structure, independent of how the data is
165/// stored internally.
166#[derive(Debug, Clone, PartialEq, Archive, Serialize, Deserialize)]
167pub struct ArrayLayout {
168    /// Size of the array in each dimension (number of elements).
169    pub shape: Vec<u32>,
170    /// Name of each dimension (e.g. `["x", "y", "time"]`).
171    pub dimension_names: Vec<String>,
172    /// Storage strategy for the array data.
173    pub storage: StorageLayout,
174}
175
176/// A single entry in the chunk table: a coordinate vector paired with its
177/// storage address.
178///
179/// `coord` identifies the chunk within the array's grid (one index per
180/// dimension). `address` locates the chunk's bytes within a compressed block.
181#[derive(Debug, Clone, PartialEq, Archive, Serialize, Deserialize)]
182pub struct ChunkEntry {
183    /// Chunk coordinate within the array's grid (one index per dimension).
184    pub coord: Vec<u32>,
185    /// Location of the chunk's bytes within a compressed block.
186    pub address: ChunkAddress,
187}
188
189/// Describes the physical storage strategy for array data.
190///
191/// All arrays are stored as a grid of chunks. A "flat" array is simply one
192/// where `chunk_shape == shape`, yielding a single chunk at the origin.
193#[derive(Debug, Clone, PartialEq, Archive, Serialize, Deserialize)]
194pub struct StorageLayout {
195    /// The shape of each chunk in number of elements per dimension.
196    pub chunk_shape: Vec<u32>,
197    /// Mapping from chunk coordinates to their storage addresses.
198    ///
199    /// A layer sidecar carries only the chunks that changed; unchanged chunks
200    /// fall through to lower layers.
201    pub chunks: Vec<ChunkEntry>,
202}
203
204/// A scalar fill value for an array.
205///
206/// Represents the value used for unwritten or missing elements. Supports
207/// numeric types and strings; complex types (Binary, List, FixedSizeList)
208/// are not representable here.
209#[derive(Debug, Clone, Archive, Serialize, Deserialize)]
210pub enum FillValue {
211    /// Boolean fill value.
212    Bool(bool),
213    /// Signed integer fill value (for the signed integer dtypes).
214    Int(i64),
215    /// Unsigned integer fill value (for the unsigned integer dtypes).
216    UInt(u64),
217    /// Floating-point fill value (for `Float32`/`Float64`).
218    Float(f64),
219    /// String fill value.
220    String(String),
221    /// Fill value for [`DType::TimestampNs`] arrays — interpreted as `i64`
222    /// nanoseconds since the Unix epoch.
223    TimestampNs(i64),
224}
225
226impl PartialEq for FillValue {
227    fn eq(&self, other: &Self) -> bool {
228        match (self, other) {
229            (Self::Bool(a), Self::Bool(b)) => a == b,
230            (Self::Int(a), Self::Int(b)) => a == b,
231            (Self::UInt(a), Self::UInt(b)) => a == b,
232            // Compare by bit pattern so NaN == NaN.
233            (Self::Float(a), Self::Float(b)) => a.to_bits() == b.to_bits(),
234            (Self::String(a), Self::String(b)) => a == b,
235            (Self::TimestampNs(a), Self::TimestampNs(b)) => a == b,
236            _ => false,
237        }
238    }
239}
240
241/// Metadata for a single array stored in the file.
242///
243/// Stored in the footer's array table. Contains the schema, layout
244/// information, and a logical deletion flag.
245#[derive(Debug, Clone, PartialEq, Archive, Serialize, Deserialize)]
246pub struct ArrayMeta {
247    /// Unique name of the array within the file.
248    pub name: String,
249    /// Element data type.
250    pub dtype: DType,
251    /// Shape, dimension names, and physical chunk layout.
252    pub layout: ArrayLayout,
253    /// Fill value for unwritten or missing elements. `None` means zero/empty.
254    pub fill_value: Option<FillValue>,
255    /// Whether this array has been logically deleted.
256    ///
257    /// Deleted arrays are ignored during reads and removed during compaction.
258    pub deleted: bool,
259    /// User-defined key-value attributes.
260    ///
261    /// Each entry is `(key_index, value_index)` referencing `Footer::attr_keys`
262    /// and `Footer::attr_values`. Entries are sorted by key index for O(log n)
263    /// lookup. The `Attributes` variant controls the storage width per index.
264    pub attributes: Attributes,
265}
266
267impl ArrayLayout {
268    /// Looks up the [`ChunkAddress`] for a given chunk coordinate.
269    pub fn get_chunk(&self, coord: &[u32]) -> Option<&ChunkAddress> {
270        self.storage
271            .chunks
272            .iter()
273            .find(|e| e.coord.as_slice() == coord)
274            .map(|e| &e.address)
275    }
276
277    /// Returns all [`ChunkAddress`]es for this layout.
278    pub fn all_addresses(&self) -> Vec<&ChunkAddress> {
279        self.storage.chunks.iter().map(|e| &e.address).collect()
280    }
281}
282
283#[cfg(test)]
284mod tests {
285    use super::*;
286    use crate::address::BlockId;
287
288    fn sample_addr(block: u32, offset: u32, size: u32) -> ChunkAddress {
289        ChunkAddress {
290            block_id: BlockId(block),
291            offset,
292            size,
293        }
294    }
295
296    fn chunk(coord: Vec<u32>, block: u32, offset: u32, size: u32) -> ChunkEntry {
297        ChunkEntry {
298            coord,
299            address: sample_addr(block, offset, size),
300        }
301    }
302
303    #[test]
304    fn flat_layout_addresses() {
305        let layout = ArrayLayout {
306            shape: vec![100],
307            dimension_names: vec!["x".into()],
308            storage: StorageLayout {
309                chunk_shape: vec![100],
310                chunks: vec![chunk(vec![0], 0, 0, 400)],
311            },
312        };
313        assert_eq!(layout.all_addresses().len(), 1);
314        assert!(layout.get_chunk(&[0]).is_some());
315        assert!(layout.get_chunk(&[1]).is_none());
316    }
317
318    #[test]
319    fn chunked_layout_lookup() {
320        let layout = ArrayLayout {
321            shape: vec![128, 128],
322            dimension_names: vec!["x".into(), "y".into()],
323            storage: StorageLayout {
324                chunk_shape: vec![64, 64],
325                chunks: vec![
326                    chunk(vec![0, 0], 3, 0, 1000),
327                    chunk(vec![0, 1], 7, 2000, 1000),
328                ],
329            },
330        };
331        let addr = layout.get_chunk(&[0, 1]).unwrap();
332        assert_eq!(addr.block_id, BlockId(7));
333        assert_eq!(addr.offset, 2000);
334
335        assert!(layout.get_chunk(&[1, 0]).is_none());
336    }
337
338    #[test]
339    fn chunked_all_addresses() {
340        let layout = ArrayLayout {
341            shape: vec![30],
342            dimension_names: vec!["t".into()],
343            storage: StorageLayout {
344                chunk_shape: vec![10],
345                chunks: vec![
346                    chunk(vec![0], 0, 0, 500),
347                    chunk(vec![1], 0, 500, 500),
348                    chunk(vec![2], 1, 0, 500),
349                ],
350            },
351        };
352        assert_eq!(layout.all_addresses().len(), 3);
353    }
354}