Skip to main content

array_format/
layout.rs

1//! Array layout definitions and array metadata.
2//!
3//! Each array stored in the file has an [`ArrayMeta`] entry in the footer
4//! describing its type, dimensions, and how its data is laid out across blocks.
5
6use rkyv::{Archive, Deserialize, Serialize};
7
8use crate::address::ChunkAddress;
9use crate::dtype::DType;
10
11/// A scalar value for a per-array key-value attribute.
12#[derive(Debug, Clone, Archive, Serialize, Deserialize)]
13pub enum AttributeValue {
14    Bool(bool),
15    Int8(i8),
16    Int16(i16),
17    Int32(i32),
18    Int64(i64),
19    UInt8(u8),
20    UInt16(u16),
21    UInt32(u32),
22    UInt64(u64),
23    Float32(f32),
24    Float64(f64),
25    String(String),
26}
27
28impl PartialEq for AttributeValue {
29    fn eq(&self, other: &Self) -> bool {
30        match (self, other) {
31            (Self::Bool(a), Self::Bool(b)) => a == b,
32            (Self::Int8(a), Self::Int8(b)) => a == b,
33            (Self::Int16(a), Self::Int16(b)) => a == b,
34            (Self::Int32(a), Self::Int32(b)) => a == b,
35            (Self::Int64(a), Self::Int64(b)) => a == b,
36            (Self::UInt8(a), Self::UInt8(b)) => a == b,
37            (Self::UInt16(a), Self::UInt16(b)) => a == b,
38            (Self::UInt32(a), Self::UInt32(b)) => a == b,
39            (Self::UInt64(a), Self::UInt64(b)) => a == b,
40            (Self::Float32(a), Self::Float32(b)) => a.to_bits() == b.to_bits(),
41            (Self::Float64(a), Self::Float64(b)) => a.to_bits() == b.to_bits(),
42            (Self::String(a), Self::String(b)) => a == b,
43            _ => false,
44        }
45    }
46}
47
48/// Controls the integer width used for attribute key/value dictionary indices.
49#[derive(Debug, Clone, Copy, PartialEq, Default, Archive, Serialize, Deserialize)]
50pub enum AttrIndexKind {
51    #[default]
52    U16,
53    U32,
54    U64,
55}
56
57/// Per-array attribute entries as pairs of dictionary indices.
58///
59/// The variant determines the storage width per index. All entries are sorted
60/// by key index to allow O(log n) binary search. Arrays in the same file may
61/// use different variants; each is self-describing.
62#[derive(Debug, Clone, PartialEq, Archive, Serialize, Deserialize)]
63pub enum Attributes {
64    U16(Vec<(u16, u16)>),
65    U32(Vec<(u32, u32)>),
66    U64(Vec<(u64, u64)>),
67}
68
69impl Attributes {
70    /// Returns an empty attribute set for the given index kind.
71    pub fn empty(kind: AttrIndexKind) -> Self {
72        match kind {
73            AttrIndexKind::U16 => Self::U16(Vec::new()),
74            AttrIndexKind::U32 => Self::U32(Vec::new()),
75            AttrIndexKind::U64 => Self::U64(Vec::new()),
76        }
77    }
78
79    /// Returns the value index for `key_idx`, or `None` if absent.
80    pub fn get(&self, key_idx: usize) -> Option<usize> {
81        match self {
82            Self::U16(v) => v
83                .binary_search_by_key(&(key_idx as u16), |(k, _)| *k)
84                .ok()
85                .map(|pos| v[pos].1 as usize),
86            Self::U32(v) => v
87                .binary_search_by_key(&(key_idx as u32), |(k, _)| *k)
88                .ok()
89                .map(|pos| v[pos].1 as usize),
90            Self::U64(v) => v
91                .binary_search_by_key(&(key_idx as u64), |(k, _)| *k)
92                .ok()
93                .map(|pos| v[pos].1 as usize),
94        }
95    }
96
97    /// Inserts or replaces the value index for `key_idx`, keeping entries sorted.
98    pub fn upsert(&mut self, key_idx: usize, val_idx: usize) {
99        match self {
100            Self::U16(v) => {
101                let (ki, vi) = (key_idx as u16, val_idx as u16);
102                match v.binary_search_by_key(&ki, |(k, _)| *k) {
103                    Ok(pos) => v[pos].1 = vi,
104                    Err(pos) => v.insert(pos, (ki, vi)),
105                }
106            }
107            Self::U32(v) => {
108                let (ki, vi) = (key_idx as u32, val_idx as u32);
109                match v.binary_search_by_key(&ki, |(k, _)| *k) {
110                    Ok(pos) => v[pos].1 = vi,
111                    Err(pos) => v.insert(pos, (ki, vi)),
112                }
113            }
114            Self::U64(v) => {
115                let (ki, vi) = (key_idx as u64, val_idx as u64);
116                match v.binary_search_by_key(&ki, |(k, _)| *k) {
117                    Ok(pos) => v[pos].1 = vi,
118                    Err(pos) => v.insert(pos, (ki, vi)),
119                }
120            }
121        }
122    }
123
124    /// Iterates over all `(key_index, value_index)` pairs as `usize`.
125    pub fn iter_entries(&self) -> Box<dyn Iterator<Item = (usize, usize)> + '_> {
126        match self {
127            Self::U16(v) => Box::new(v.iter().map(|&(k, v)| (k as usize, v as usize))),
128            Self::U32(v) => Box::new(v.iter().map(|&(k, v)| (k as usize, v as usize))),
129            Self::U64(v) => Box::new(v.iter().map(|&(k, v)| (k as usize, v as usize))),
130        }
131    }
132
133    /// Returns the maximum index value that fits in this variant.
134    pub fn max_index(&self) -> usize {
135        match self {
136            Self::U16(_) => u16::MAX as usize,
137            Self::U32(_) => u32::MAX as usize,
138            Self::U64(_) => usize::MAX,
139        }
140    }
141}
142
143/// Describes how an array's data is distributed across blocks.
144///
145/// Carries the array's global shape and dimension names so callers always
146/// know the full n-dimensional structure, independent of how the data is
147/// stored internally.
148#[derive(Debug, Clone, PartialEq, Archive, Serialize, Deserialize)]
149pub struct ArrayLayout {
150    /// Size of the array in each dimension (number of elements).
151    pub shape: Vec<u32>,
152    /// Name of each dimension (e.g. `["x", "y", "time"]`).
153    pub dimension_names: Vec<String>,
154    /// Storage strategy for the array data.
155    pub storage: StorageLayout,
156}
157
158/// A single entry in the chunk table: a coordinate vector paired with its
159/// storage address.
160///
161/// `coord` identifies the chunk within the array's grid (one index per
162/// dimension). `address` locates the chunk's bytes within a compressed block.
163#[derive(Debug, Clone, PartialEq, Archive, Serialize, Deserialize)]
164pub struct ChunkEntry {
165    pub coord: Vec<u32>,
166    pub address: ChunkAddress,
167}
168
169/// Describes the physical storage strategy for array data.
170///
171/// All arrays are stored as a grid of chunks. A "flat" array is simply one
172/// where `chunk_shape == shape`, yielding a single chunk at the origin.
173#[derive(Debug, Clone, PartialEq, Archive, Serialize, Deserialize)]
174pub struct StorageLayout {
175    /// The shape of each chunk in number of elements per dimension.
176    pub chunk_shape: Vec<u32>,
177    /// Mapping from chunk coordinates to their storage addresses.
178    ///
179    /// A layer sidecar carries only the chunks that changed; unchanged chunks
180    /// fall through to lower layers.
181    pub chunks: Vec<ChunkEntry>,
182}
183
184/// A scalar fill value for an array.
185///
186/// Represents the value used for unwritten or missing elements. Supports
187/// numeric types and strings; complex types (Binary, List, FixedSizeList)
188/// are not representable here.
189#[derive(Debug, Clone, Archive, Serialize, Deserialize)]
190pub enum FillValue {
191    Bool(bool),
192    Int(i64),
193    UInt(u64),
194    Float(f64),
195    String(String),
196    /// Fill value for [`DType::TimestampNs`] arrays — interpreted as `i64`
197    /// nanoseconds since the Unix epoch.
198    TimestampNs(i64),
199}
200
201impl PartialEq for FillValue {
202    fn eq(&self, other: &Self) -> bool {
203        match (self, other) {
204            (Self::Bool(a), Self::Bool(b)) => a == b,
205            (Self::Int(a), Self::Int(b)) => a == b,
206            (Self::UInt(a), Self::UInt(b)) => a == b,
207            // Compare by bit pattern so NaN == NaN.
208            (Self::Float(a), Self::Float(b)) => a.to_bits() == b.to_bits(),
209            (Self::String(a), Self::String(b)) => a == b,
210            (Self::TimestampNs(a), Self::TimestampNs(b)) => a == b,
211            _ => false,
212        }
213    }
214}
215
216/// Metadata for a single array stored in the file.
217///
218/// Stored in the footer's array table. Contains the schema, layout
219/// information, and a logical deletion flag.
220#[derive(Debug, Clone, PartialEq, Archive, Serialize, Deserialize)]
221pub struct ArrayMeta {
222    /// Unique name of the array within the file.
223    pub name: String,
224    /// Element data type.
225    pub dtype: DType,
226    /// Shape, dimension names, and physical chunk layout.
227    pub layout: ArrayLayout,
228    /// Fill value for unwritten or missing elements. `None` means zero/empty.
229    pub fill_value: Option<FillValue>,
230    /// Whether this array has been logically deleted.
231    ///
232    /// Deleted arrays are ignored during reads and removed during compaction.
233    pub deleted: bool,
234    /// User-defined key-value attributes.
235    ///
236    /// Each entry is `(key_index, value_index)` referencing `Footer::attr_keys`
237    /// and `Footer::attr_values`. Entries are sorted by key index for O(log n)
238    /// lookup. The `Attributes` variant controls the storage width per index.
239    pub attributes: Attributes,
240}
241
242impl ArrayLayout {
243    /// Looks up the [`ChunkAddress`] for a given chunk coordinate.
244    pub fn get_chunk(&self, coord: &[u32]) -> Option<&ChunkAddress> {
245        self.storage
246            .chunks
247            .iter()
248            .find(|e| e.coord.as_slice() == coord)
249            .map(|e| &e.address)
250    }
251
252    /// Returns all [`ChunkAddress`]es for this layout.
253    pub fn all_addresses(&self) -> Vec<&ChunkAddress> {
254        self.storage.chunks.iter().map(|e| &e.address).collect()
255    }
256}
257
258#[cfg(test)]
259mod tests {
260    use super::*;
261    use crate::address::BlockId;
262
263    fn sample_addr(block: u32, offset: u32, size: u32) -> ChunkAddress {
264        ChunkAddress {
265            block_id: BlockId(block),
266            offset,
267            size,
268        }
269    }
270
271    fn chunk(coord: Vec<u32>, block: u32, offset: u32, size: u32) -> ChunkEntry {
272        ChunkEntry {
273            coord,
274            address: sample_addr(block, offset, size),
275        }
276    }
277
278    #[test]
279    fn flat_layout_addresses() {
280        let layout = ArrayLayout {
281            shape: vec![100],
282            dimension_names: vec!["x".into()],
283            storage: StorageLayout {
284                chunk_shape: vec![100],
285                chunks: vec![chunk(vec![0], 0, 0, 400)],
286            },
287        };
288        assert_eq!(layout.all_addresses().len(), 1);
289        assert!(layout.get_chunk(&[0]).is_some());
290        assert!(layout.get_chunk(&[1]).is_none());
291    }
292
293    #[test]
294    fn chunked_layout_lookup() {
295        let layout = ArrayLayout {
296            shape: vec![128, 128],
297            dimension_names: vec!["x".into(), "y".into()],
298            storage: StorageLayout {
299                chunk_shape: vec![64, 64],
300                chunks: vec![
301                    chunk(vec![0, 0], 3, 0, 1000),
302                    chunk(vec![0, 1], 7, 2000, 1000),
303                ],
304            },
305        };
306        let addr = layout.get_chunk(&[0, 1]).unwrap();
307        assert_eq!(addr.block_id, BlockId(7));
308        assert_eq!(addr.offset, 2000);
309
310        assert!(layout.get_chunk(&[1, 0]).is_none());
311    }
312
313    #[test]
314    fn chunked_all_addresses() {
315        let layout = ArrayLayout {
316            shape: vec![30],
317            dimension_names: vec!["t".into()],
318            storage: StorageLayout {
319                chunk_shape: vec![10],
320                chunks: vec![
321                    chunk(vec![0], 0, 0, 500),
322                    chunk(vec![1], 0, 500, 500),
323                    chunk(vec![2], 1, 0, 500),
324                ],
325            },
326        };
327        assert_eq!(layout.all_addresses().len(), 3);
328    }
329}