// liquid_cache_parquet/cache/id.rs

1use std::{
2    ops::Deref,
3    path::{Path, PathBuf},
4};
5
6use liquid_cache_storage::cache::EntryID;
7
8/// This is a unique identifier for a row in a parquet file.
9#[repr(C, align(8))]
10#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd)]
11pub struct ParquetArrayID {
12    file_id: u16,
13    rg_id: u16,
14    col_id: u16,
15    batch_id: BatchID,
16}
17
18impl From<ParquetArrayID> for usize {
19    fn from(id: ParquetArrayID) -> Self {
20        (id.file_id as usize) << 48
21            | (id.rg_id as usize) << 32
22            | (id.col_id as usize) << 16
23            | (id.batch_id.v as usize)
24    }
25}
26
27impl From<usize> for ParquetArrayID {
28    fn from(value: usize) -> Self {
29        Self {
30            file_id: (value >> 48) as u16,
31            rg_id: ((value >> 32) & 0xFFFF) as u16,
32            col_id: ((value >> 16) & 0xFFFF) as u16,
33            batch_id: BatchID::from_raw((value & 0xFFFF) as u16),
34        }
35    }
36}
37
// NOTE(review): this inherent impl is empty and contributes nothing; it looks
// like a leftover and can probably be deleted.
impl ParquetArrayID {}
39
40impl From<ParquetArrayID> for EntryID {
41    fn from(id: ParquetArrayID) -> Self {
42        EntryID::from(usize::from(id))
43    }
44}
45
46impl From<EntryID> for ParquetArrayID {
47    fn from(id: EntryID) -> Self {
48        ParquetArrayID::from(usize::from(id))
49    }
50}
51
52const _: () = assert!(std::mem::size_of::<ParquetArrayID>() == 8);
53const _: () = assert!(std::mem::align_of::<ParquetArrayID>() == 8);
54
55impl ParquetArrayID {
56    /// Creates a new CacheEntryID.
57    pub fn new(file_id: u64, row_group_id: u64, column_id: u64, batch_id: BatchID) -> Self {
58        debug_assert!(file_id <= u16::MAX as u64);
59        debug_assert!(row_group_id <= u16::MAX as u64);
60        debug_assert!(column_id <= u16::MAX as u64);
61        Self {
62            file_id: file_id as u16,
63            rg_id: row_group_id as u16,
64            col_id: column_id as u16,
65            batch_id,
66        }
67    }
68
69    /// Get the batch id.
70    pub fn batch_id_inner(&self) -> u64 {
71        self.batch_id.v as u64
72    }
73
74    /// Get the file id.
75    pub fn file_id_inner(&self) -> u64 {
76        self.file_id as u64
77    }
78
79    /// Get the row group id.
80    pub fn row_group_id_inner(&self) -> u64 {
81        self.rg_id as u64
82    }
83
84    /// Get the column id.
85    pub fn column_id_inner(&self) -> u64 {
86        self.col_id as u64
87    }
88
89    /// Get the on-disk path.
90    pub fn on_disk_path(&self, cache_root_dir: &Path) -> PathBuf {
91        let batch_id = self.batch_id_inner();
92        cache_root_dir
93            .join(format!("file_{}", self.file_id_inner()))
94            .join(format!("rg_{}", self.row_group_id_inner()))
95            .join(format!("col_{}", self.column_id_inner()))
96            .join(format!("batch_{batch_id}.liquid"))
97    }
98
99    /// Get the on-disk arrow path.
100    pub fn on_disk_arrow_path(&self, cache_root_dir: &Path) -> PathBuf {
101        let batch_id = self.batch_id_inner();
102        cache_root_dir
103            .join(format!("file_{}", self.file_id_inner()))
104            .join(format!("rg_{}", self.row_group_id_inner()))
105            .join(format!("col_{}", self.column_id_inner()))
106            .join(format!("batch_{batch_id}.arrow"))
107    }
108}
109
/// Identifies a batch of rows: the row id divided by the batch size.
//
// A batch id is easily mistaken for a row id, so it gets its own type (the new type idiom):
// https://doc.rust-lang.org/rust-by-example/generics/new_types.html
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
#[repr(C, align(2))]
pub struct BatchID {
    /// The batch index.
    v: u16,
}
120
121impl BatchID {
122    /// Creates a new BatchID from a row id and a batch size.
123    /// The row id is at the boundary of the batch.
124    pub fn from_row_id(row_id: usize, batch_size: usize) -> Self {
125        Self {
126            v: (row_id / batch_size) as u16,
127        }
128    }
129
130    /// Creates a new BatchID from a raw value.
131    pub fn from_raw(v: u16) -> Self {
132        Self { v }
133    }
134
135    /// Increment the batch id.
136    pub fn inc(&mut self) {
137        debug_assert!(self.v < u16::MAX);
138        self.v += 1;
139    }
140}
141
142impl Deref for BatchID {
143    type Target = u16;
144
145    fn deref(&self) -> &Self::Target {
146        &self.v
147    }
148}
149
#[cfg(test)]
mod tests {
    use tempfile::tempdir;

    use super::*;

    /// Every getter returns the value the ID was constructed with.
    #[test]
    fn test_cache_entry_id_new_and_getters() {
        let (file_id, row_group_id, column_id) = (10u64, 20u64, 30u64);
        let batch_id = BatchID::from_raw(40);

        let id = ParquetArrayID::new(file_id, row_group_id, column_id, batch_id);

        assert_eq!(id.file_id_inner(), file_id);
        assert_eq!(id.row_group_id_inner(), row_group_id);
        assert_eq!(id.column_id_inner(), column_id);
        assert_eq!(id.batch_id_inner(), *batch_id as u64);
    }

    /// The largest and smallest representable components survive unchanged.
    #[test]
    fn test_cache_entry_id_boundaries() {
        let largest = u16::MAX as u64;
        let (file_id, row_group_id, column_id) = (largest, 0u64, largest);
        let batch_id = BatchID::from_raw(0);

        let id = ParquetArrayID::new(file_id, row_group_id, column_id, batch_id);

        assert_eq!(id.file_id_inner(), file_id);
        assert_eq!(id.row_group_id_inner(), row_group_id);
        assert_eq!(id.column_id_inner(), column_id);
        assert_eq!(id.batch_id_inner(), *batch_id as u64);
    }

    /// A file id one past `u16::MAX` is rejected.
    #[test]
    #[should_panic]
    fn test_cache_entry_id_new_panic_file_id() {
        let too_large = (u16::MAX as u64) + 1;
        ParquetArrayID::new(too_large, 0, 0, BatchID::from_raw(0));
    }

    /// A row group id one past `u16::MAX` is rejected.
    #[test]
    #[should_panic]
    fn test_cache_entry_id_new_panic_row_group_id() {
        let too_large = (u16::MAX as u64) + 1;
        ParquetArrayID::new(0, too_large, 0, BatchID::from_raw(0));
    }

    /// A column id one past `u16::MAX` is rejected.
    #[test]
    #[should_panic]
    fn test_cache_entry_id_new_panic_column_id() {
        let too_large = (u16::MAX as u64) + 1;
        ParquetArrayID::new(0, 0, too_large, BatchID::from_raw(0));
    }

    /// The `.liquid` path nests file, row group, column and batch under the root.
    #[test]
    fn test_cache_entry_id_on_disk_path() {
        let temp_dir = tempdir().unwrap();
        let cache_root = temp_dir.path();
        let id = ParquetArrayID::new(1, 2, 3, BatchID::from_raw(4));

        let mut expected_path = cache_root.to_path_buf();
        for component in ["file_1", "rg_2", "col_3", "batch_4.liquid"] {
            expected_path.push(component);
        }

        assert_eq!(id.on_disk_path(cache_root), expected_path);
    }

    /// Row 256 with 128 rows per batch lands in batch 2.
    #[test]
    fn test_batch_id_from_row_id() {
        assert_eq!(BatchID::from_row_id(256, 128).v, 2);
    }

    /// `from_raw` stores the value as given.
    #[test]
    fn test_batch_id_from_raw() {
        assert_eq!(BatchID::from_raw(5).v, 5);
    }

    /// `inc` advances the id by exactly one.
    #[test]
    fn test_batch_id_inc() {
        let mut id = BatchID::from_raw(10);
        id.inc();
        assert_eq!(id.v, 11);
    }

    /// Incrementing past `u16::MAX` must panic.
    #[test]
    #[should_panic]
    fn test_batch_id_inc_overflow() {
        let mut id = BatchID::from_raw(u16::MAX);
        id.inc();
    }

    /// Dereferencing yields the raw `u16`.
    #[test]
    fn test_batch_id_deref() {
        let id = BatchID::from_raw(15);
        assert_eq!(*id, 15);
    }
}