// vibesql_storage/columnar/data.rs

//! Column data storage types.
//!
//! This module provides the `ColumnData` enum for storing typed column data
//! with a NULL bitmap.
//!
//! ## Zero-Copy Design
//!
//! Column data uses `Arc<Vec<T>>` for all arrays, enabling:
//! - Zero-copy sharing between storage and executor layers
//! - O(1) clone operations (reference count bump instead of data copy)
//! - Cache-friendly columnar data that can be shared across query executions
12
13use std::sync::Arc;
14
15use vibesql_types::{Date, Interval, SqlValue, Time, Timestamp};
16
17/// Typed column data with NULL bitmap
18///
19/// Each variant stores a vector of non-NULL values and a separate bitmap
20/// indicating which positions are NULL. This design:
21/// - Avoids Option<T> overhead (16 bytes vs 8 bytes for f64)
22/// - Enables direct SIMD operations on value vectors
23/// - Provides O(1) NULL checks via bitmap
24/// - Uses Arc for zero-copy sharing with executor layer
25/// - String columns use Arc<str> for O(1) cloning
26#[derive(Debug, Clone)]
27pub enum ColumnData {
28    /// 64-bit signed integers
29    Int64 { values: Arc<Vec<i64>>, nulls: Arc<Vec<bool>> },
30    /// 64-bit floating point
31    Float64 { values: Arc<Vec<f64>>, nulls: Arc<Vec<bool>> },
32    /// Variable-length strings (using Arc<str> for O(1) cloning)
33    String { values: Arc<Vec<Arc<str>>>, nulls: Arc<Vec<bool>> },
34    /// Boolean values
35    Bool { values: Arc<Vec<bool>>, nulls: Arc<Vec<bool>> },
36    /// Date values
37    Date { values: Arc<Vec<Date>>, nulls: Arc<Vec<bool>> },
38    /// Time values
39    Time { values: Arc<Vec<Time>>, nulls: Arc<Vec<bool>> },
40    /// Timestamp values
41    Timestamp { values: Arc<Vec<Timestamp>>, nulls: Arc<Vec<bool>> },
42    /// Interval values
43    Interval { values: Arc<Vec<Interval>>, nulls: Arc<Vec<bool>> },
44    /// Vector values (for AI/ML workloads)
45    Vector { values: Arc<Vec<Vec<f32>>>, nulls: Arc<Vec<bool>> },
46    /// Blob values (binary data)
47    Blob { values: Arc<Vec<Vec<u8>>>, nulls: Arc<Vec<bool>> },
48}
49
50#[allow(clippy::type_complexity)]
51impl ColumnData {
52    /// Get the number of values in this column (including NULLs)
53    pub fn len(&self) -> usize {
54        match self {
55            ColumnData::Int64 { nulls, .. } => nulls.len(),
56            ColumnData::Float64 { nulls, .. } => nulls.len(),
57            ColumnData::String { nulls, .. } => nulls.len(),
58            ColumnData::Bool { nulls, .. } => nulls.len(),
59            ColumnData::Date { nulls, .. } => nulls.len(),
60            ColumnData::Time { nulls, .. } => nulls.len(),
61            ColumnData::Timestamp { nulls, .. } => nulls.len(),
62            ColumnData::Interval { nulls, .. } => nulls.len(),
63            ColumnData::Vector { nulls, .. } => nulls.len(),
64            ColumnData::Blob { nulls, .. } => nulls.len(),
65        }
66    }
67
68    /// Check if the column is empty
69    pub fn is_empty(&self) -> bool {
70        self.len() == 0
71    }
72
73    /// Estimate the memory size of this column in bytes
74    ///
75    /// This is used for memory budgeting in the columnar cache.
76    /// The estimate includes:
77    /// - Value storage (type-specific size * element count)
78    /// - NULL bitmap (1 byte per element, not packed)
79    /// - Vec overhead (capacity, length, pointer)
80    pub fn size_in_bytes(&self) -> usize {
81        const VEC_OVERHEAD: usize = 3 * std::mem::size_of::<usize>(); // ptr, len, cap
82
83        match self {
84            ColumnData::Int64 { values, nulls } => {
85                VEC_OVERHEAD * 2
86                    + values.capacity() * std::mem::size_of::<i64>()
87                    + nulls.capacity() * std::mem::size_of::<bool>()
88            }
89            ColumnData::Float64 { values, nulls } => {
90                VEC_OVERHEAD * 2
91                    + values.capacity() * std::mem::size_of::<f64>()
92                    + nulls.capacity() * std::mem::size_of::<bool>()
93            }
94            ColumnData::String { values, nulls } => {
95                // For Arc<str>, we need to account for the Arc overhead
96                // plus the actual string data on the heap
97                let arc_overhead = std::mem::size_of::<Arc<str>>(); // ptr + refcount
98                let string_data: usize = values.iter().map(|s| s.len()).sum();
99                VEC_OVERHEAD * 2
100                    + values.capacity() * arc_overhead
101                    + string_data
102                    + nulls.capacity() * std::mem::size_of::<bool>()
103            }
104            ColumnData::Bool { values, nulls } => {
105                VEC_OVERHEAD * 2
106                    + values.capacity() * std::mem::size_of::<bool>()
107                    + nulls.capacity() * std::mem::size_of::<bool>()
108            }
109            ColumnData::Date { values, nulls } => {
110                VEC_OVERHEAD * 2
111                    + values.capacity() * std::mem::size_of::<Date>()
112                    + nulls.capacity() * std::mem::size_of::<bool>()
113            }
114            ColumnData::Time { values, nulls } => {
115                VEC_OVERHEAD * 2
116                    + values.capacity() * std::mem::size_of::<Time>()
117                    + nulls.capacity() * std::mem::size_of::<bool>()
118            }
119            ColumnData::Timestamp { values, nulls } => {
120                VEC_OVERHEAD * 2
121                    + values.capacity() * std::mem::size_of::<Timestamp>()
122                    + nulls.capacity() * std::mem::size_of::<bool>()
123            }
124            ColumnData::Interval { values, nulls } => {
125                // Interval contains a String, so we need to account for that
126                let interval_overhead = std::mem::size_of::<Interval>();
127                let string_data: usize = values.iter().map(|i| i.value.capacity()).sum();
128                VEC_OVERHEAD * 2
129                    + values.capacity() * interval_overhead
130                    + string_data
131                    + nulls.capacity() * std::mem::size_of::<bool>()
132            }
133            ColumnData::Vector { values, nulls } => {
134                // Vector contains Vec<f32>, so we need to account for each inner vector
135                let vec_overhead = std::mem::size_of::<Vec<f32>>();
136                let vector_data: usize =
137                    values.iter().map(|v| v.capacity() * std::mem::size_of::<f32>()).sum();
138                VEC_OVERHEAD * 2
139                    + values.capacity() * vec_overhead
140                    + vector_data
141                    + nulls.capacity() * std::mem::size_of::<bool>()
142            }
143            ColumnData::Blob { values, nulls } => {
144                // Blob contains Vec<u8>, so we need to account for each inner vector
145                let vec_overhead = std::mem::size_of::<Vec<u8>>();
146                let blob_data: usize = values.iter().map(|v| v.capacity()).sum();
147                VEC_OVERHEAD * 2
148                    + values.capacity() * vec_overhead
149                    + blob_data
150                    + nulls.capacity() * std::mem::size_of::<bool>()
151            }
152        }
153    }
154
155    /// Check if the value at the given index is NULL
156    pub fn is_null(&self, index: usize) -> bool {
157        match self {
158            ColumnData::Int64 { nulls, .. } => nulls[index],
159            ColumnData::Float64 { nulls, .. } => nulls[index],
160            ColumnData::String { nulls, .. } => nulls[index],
161            ColumnData::Bool { nulls, .. } => nulls[index],
162            ColumnData::Date { nulls, .. } => nulls[index],
163            ColumnData::Time { nulls, .. } => nulls[index],
164            ColumnData::Timestamp { nulls, .. } => nulls[index],
165            ColumnData::Interval { nulls, .. } => nulls[index],
166            ColumnData::Vector { nulls, .. } => nulls[index],
167            ColumnData::Blob { nulls, .. } => nulls[index],
168        }
169    }
170
171    /// Get the SQL value at the given index (converts back to SqlValue)
172    pub fn get(&self, index: usize) -> SqlValue {
173        if self.is_null(index) {
174            return SqlValue::Null;
175        }
176
177        match self {
178            ColumnData::Int64 { values, .. } => SqlValue::Integer(values[index]),
179            ColumnData::Float64 { values, .. } => SqlValue::Double(values[index]),
180            ColumnData::String { values, .. } => {
181                SqlValue::Varchar(arcstr::ArcStr::from(values[index].as_ref()))
182            }
183            ColumnData::Bool { values, .. } => SqlValue::Boolean(values[index]),
184            ColumnData::Date { values, .. } => SqlValue::Date(values[index]),
185            ColumnData::Time { values, .. } => SqlValue::Time(values[index]),
186            ColumnData::Timestamp { values, .. } => SqlValue::Timestamp(values[index]),
187            ColumnData::Interval { values, .. } => SqlValue::Interval(values[index].clone()),
188            ColumnData::Vector { values, .. } => SqlValue::Vector(values[index].clone()),
189            ColumnData::Blob { values, .. } => SqlValue::Blob(values[index].clone()),
190        }
191    }
192
193    /// Get the underlying Arc for i64 values (zero-copy sharing with executor)
194    pub fn as_i64_arc(&self) -> Option<(&Arc<Vec<i64>>, &Arc<Vec<bool>>)> {
195        match self {
196            ColumnData::Int64 { values, nulls } => Some((values, nulls)),
197            _ => None,
198        }
199    }
200
201    /// Get the underlying Arc for f64 values (zero-copy sharing with executor)
202    pub fn as_f64_arc(&self) -> Option<(&Arc<Vec<f64>>, &Arc<Vec<bool>>)> {
203        match self {
204            ColumnData::Float64 { values, nulls } => Some((values, nulls)),
205            _ => None,
206        }
207    }
208
209    /// Get the underlying Arc for string values (zero-copy sharing with executor)
210    pub fn as_string_arc(&self) -> Option<(&Arc<Vec<Arc<str>>>, &Arc<Vec<bool>>)> {
211        match self {
212            ColumnData::String { values, nulls } => Some((values, nulls)),
213            _ => None,
214        }
215    }
216
217    /// Get the underlying Arc for bool values (zero-copy sharing with executor)
218    pub fn as_bool_arc(&self) -> Option<(&Arc<Vec<bool>>, &Arc<Vec<bool>>)> {
219        match self {
220            ColumnData::Bool { values, nulls } => Some((values, nulls)),
221            _ => None,
222        }
223    }
224
225    /// Get the underlying Arc for date values (zero-copy sharing with executor)
226    pub fn as_date_arc(&self) -> Option<(&Arc<Vec<Date>>, &Arc<Vec<bool>>)> {
227        match self {
228            ColumnData::Date { values, nulls } => Some((values, nulls)),
229            _ => None,
230        }
231    }
232
233    /// Get the underlying Arc for timestamp values (zero-copy sharing with executor)
234    pub fn as_timestamp_arc(&self) -> Option<(&Arc<Vec<Timestamp>>, &Arc<Vec<bool>>)> {
235        match self {
236            ColumnData::Timestamp { values, nulls } => Some((values, nulls)),
237            _ => None,
238        }
239    }
240
241    /// Get the underlying Arc for time values (zero-copy sharing with executor)
242    pub fn as_time_arc(&self) -> Option<(&Arc<Vec<Time>>, &Arc<Vec<bool>>)> {
243        match self {
244            ColumnData::Time { values, nulls } => Some((values, nulls)),
245            _ => None,
246        }
247    }
248
249    /// Get the underlying Arc for interval values (zero-copy sharing with executor)
250    pub fn as_interval_arc(&self) -> Option<(&Arc<Vec<Interval>>, &Arc<Vec<bool>>)> {
251        match self {
252            ColumnData::Interval { values, nulls } => Some((values, nulls)),
253            _ => None,
254        }
255    }
256}