llkv_column_map/lib.rs
1//! Columnar storage engine for LLKV.
2//!
3//! This crate provides a low-level columnar storage layer that persists Apache Arrow
4//! [`RecordBatch`]es to disk and supports efficient scans, filters, and updates.
5//! It serves as the foundation for [`llkv-table`] and higher-level query execution.
6//!
7//! # Architecture
8//!
9//! The storage engine is organized into several key components:
10//!
11//! - **[`ColumnStore`]**: Primary interface for storing and retrieving columnar data.
12//! Manages column descriptors, metadata catalogs, and coordinates with the pager
13//! for persistent storage.
14//!
15//! - **[`LogicalFieldId`](types::LogicalFieldId)**: Namespaced identifier for columns.
16//! Combines a namespace (user data, row ID shadow, MVCC metadata), table ID, and
17//! field ID into a single 64-bit value to prevent collisions.
18//!
19//! - **[`ScanBuilder`]**: Builder pattern for constructing column scans with various
20//! options (filters, ordering, row ID inclusion).
21//!
22//! - **Visitor Pattern**: Scans emit data through visitor callbacks rather than
23//! materializing entire columns in memory, enabling streaming and aggregation.
24//!
25//! # Storage Model
26//!
27//! Data is stored in columnar chunks:
28//! - Each column is identified by a `LogicalFieldId`
29//! - Columns are broken into chunks for incremental writes
30//! - Each chunk stores Arrow-serialized data plus metadata (row count, min/max values)
31//! - Shadow columns track row IDs separately from user data
32//! - MVCC columns (`created_by`, `deleted_by`) track transaction visibility
33//!
34//! # Namespaces
35//!
36//! Columns are organized into namespaces to prevent ID collisions:
37//! - `UserData`: Regular table columns
38//! - `RowIdShadow`: Internal row ID tracking for each column
39//! - `TxnCreatedBy`: MVCC transaction that created each row
40//! - `TxnDeletedBy`: MVCC transaction that deleted each row
41//!
42//! # Thread Safety
43//!
44//! `ColumnStore` is thread-safe (`Send + Sync`) with internal locking for
45//! catalog updates. Read operations can occur concurrently; writes are
46//! serialized through the catalog lock.
47//!
48//! [`RecordBatch`]: arrow::record_batch::RecordBatch
49//! [`llkv-table`]: https://docs.rs/llkv-table
50//! [`ColumnStore`]: store::ColumnStore
51//! [`ScanBuilder`]: scan::ScanBuilder
52//!
53//! # Macros and Type Dispatch
54//!
55//! This crate provides macros for efficient type-specific operations without runtime
56//! dispatch overhead. See [`with_integer_arrow_type!`] for details.
57
58// NOTE: rustfmt currently re-indents portions of macro_rules! blocks in this
59// file (observed when running `cargo fmt`). This produces noisy diffs and
60// churn because rustfmt will flip formatting between runs. The problematic
61// locations in this module are the macro_rules! dispatch macros declared
62// below. Until the underlying rustfmt bug is fixed, we intentionally opt out
63// of automatic formatting for those specific macros using `#[rustfmt::skip]`,
64// while keeping the rest of the module formatted normally.
65//
66// Reproduction / debugging tips for contributors:
67// - Run `rustup run stable rustfmt -- --version` to confirm the rustfmt
68// version, then `cargo fmt` to reproduce the behavior.
// - Narrow the change by running rustfmt on this file only:
//     rustfmt llkv-column-map/src/lib.rs
71// - If you can produce a minimal self-contained example that triggers the
72// re-indent, open an issue with rustfmt (include rustfmt version and the
73// minimal example) and link it here.
74//
75// NOTE: Once a minimal reproducer for the rustfmt regression exists, link the
76// upstream issue here and remove the `#[rustfmt::skip]` attributes so the file
77// can return to standard formatting. Progress is tracked at
78// https://github.com/rust-lang/rustfmt/issues/6629#issuecomment-3395446770.
79
/// Dispatches to type-specific code based on an Arrow `DataType`.
///
/// The macro compares the supplied `DataType` against every entry produced by
/// [`llkv_for_each_arrow_numeric!`]. For the entry that compares equal it
/// declares `$ty` as a local type alias for that entry's Arrow physical type
/// and evaluates `$body`. The comparison itself happens at run time; what the
/// macro saves is hand-writing one branch per type, and each branch is
/// compiled against a concrete type.
///
/// Despite the name, the set of dispatched types is whatever
/// [`llkv_for_each_arrow_numeric!`] enumerates, not only integer types.
///
/// # Parameters
///
/// - `$dtype` - Expression yielding a `DataType` or a reference to one
///   (anything whose `.borrow()` produces `&arrow::datatypes::DataType`).
///   It is evaluated exactly once.
/// - `$ty` - Identifier to bind the Arrow physical type to (e.g., `UInt64Type`)
/// - `$body` - Expression to evaluate with `$ty` bound to the matched type
/// - `$unsupported` - Fallback expression, evaluated only when no entry
///   matches. It is expanded inside a closure, so control flow such as
///   `return` or `?` inside it targets that closure rather than the calling
///   function.
///
/// # Requirements
///
/// The expansion names `arrow::datatypes::...` paths directly, so `arrow` must
/// be resolvable at the call site. The helper macro is reached through
/// `$crate`, so callers do not need to import it separately.
#[macro_export]
#[rustfmt::skip]
macro_rules! with_integer_arrow_type {
    ($dtype:expr, |$ty:ident| $body:expr, $unsupported:expr $(,)?) => {{
        use std::borrow::Borrow;

        // Bind the input first so `$dtype` is evaluated a single time, then
        // normalise owned and borrowed inputs to `&DataType`.
        let dtype_value = $dtype;
        let dtype_ref: &arrow::datatypes::DataType = dtype_value.borrow();
        let mut result: Option<_> = None;

        // Callback handed to the type list: only the physical type and the
        // `DataType` value are used here; the other arguments are accepted so
        // the callback matches the ten-argument shape the list emits.
        macro_rules! __llkv_dispatch_integer_arrow_type {
            (
                $base:ident,
                $chunk_fn:ident,
                $chunk_with_rids_fn:ident,
                $run_fn:ident,
                $run_with_rids_fn:ident,
                $array_ty:ty,
                $physical_ty:ty,
                $dtype_expr:expr,
                $native_ty:ty,
                $cast_expr:expr
            ) => {
                if dtype_ref == &$dtype_expr {
                    type $ty = $physical_ty;
                    result = Some($body);
                }
            };
        }

        // `$crate::` keeps this exported macro usable from crates that have
        // not brought the helper macro into scope themselves.
        $crate::llkv_for_each_arrow_numeric!(__llkv_dispatch_integer_arrow_type);

        result.unwrap_or_else(|| $unsupported)
    }};
}
132
/// Expands a callback macro once per supported Arrow numeric type.
///
/// The callback named by `$macro` is invoked twelve times, in this order:
/// `u64`, `u32`, `u16`, `u8`, `i64`, `i32`, `i16`, `i8`, `f64`, `f32`,
/// `date64`, `date32`.
///
/// # Arguments Passed to the Callback
///
/// Every invocation supplies ten comma-separated arguments:
/// 1. Base name (e.g., `u64`, `date32`)
/// 2. Chunk visitor method name (e.g., `u64_chunk`)
/// 3. Chunk-with-row-IDs visitor method name (e.g., `u64_chunk_with_rids`)
/// 4. Run visitor method name (e.g., `u64_run`)
/// 5. Run-with-row-IDs visitor method name (e.g., `u64_run_with_rids`)
/// 6. Arrow array type (e.g., `arrow::array::UInt64Array`)
/// 7. Arrow physical type (e.g., `arrow::datatypes::UInt64Type`)
/// 8. Arrow `DataType` variant (e.g., `arrow::datatypes::DataType::UInt64`)
/// 9. Native Rust type (e.g., `u64`; `i64` for `date64`, `i32` for `date32`)
/// 10. Closure converting the native value to `f64`
///
/// The `arrow::...` paths are emitted verbatim, so a callback that uses them
/// needs `arrow` to be resolvable where the expansion lands.
#[macro_export]
#[rustfmt::skip]
macro_rules! llkv_for_each_arrow_numeric {
    ($macro:ident) => {
        // Unsigned integers.
        $macro!(
            u64, u64_chunk, u64_chunk_with_rids, u64_run, u64_run_with_rids,
            arrow::array::UInt64Array, arrow::datatypes::UInt64Type,
            arrow::datatypes::DataType::UInt64, u64, |v: u64| v as f64
        );
        $macro!(
            u32, u32_chunk, u32_chunk_with_rids, u32_run, u32_run_with_rids,
            arrow::array::UInt32Array, arrow::datatypes::UInt32Type,
            arrow::datatypes::DataType::UInt32, u32, |v: u32| v as f64
        );
        $macro!(
            u16, u16_chunk, u16_chunk_with_rids, u16_run, u16_run_with_rids,
            arrow::array::UInt16Array, arrow::datatypes::UInt16Type,
            arrow::datatypes::DataType::UInt16, u16, |v: u16| v as f64
        );
        $macro!(
            u8, u8_chunk, u8_chunk_with_rids, u8_run, u8_run_with_rids,
            arrow::array::UInt8Array, arrow::datatypes::UInt8Type,
            arrow::datatypes::DataType::UInt8, u8, |v: u8| v as f64
        );

        // Signed integers.
        $macro!(
            i64, i64_chunk, i64_chunk_with_rids, i64_run, i64_run_with_rids,
            arrow::array::Int64Array, arrow::datatypes::Int64Type,
            arrow::datatypes::DataType::Int64, i64, |v: i64| v as f64
        );
        $macro!(
            i32, i32_chunk, i32_chunk_with_rids, i32_run, i32_run_with_rids,
            arrow::array::Int32Array, arrow::datatypes::Int32Type,
            arrow::datatypes::DataType::Int32, i32, |v: i32| v as f64
        );
        $macro!(
            i16, i16_chunk, i16_chunk_with_rids, i16_run, i16_run_with_rids,
            arrow::array::Int16Array, arrow::datatypes::Int16Type,
            arrow::datatypes::DataType::Int16, i16, |v: i16| v as f64
        );
        $macro!(
            i8, i8_chunk, i8_chunk_with_rids, i8_run, i8_run_with_rids,
            arrow::array::Int8Array, arrow::datatypes::Int8Type,
            arrow::datatypes::DataType::Int8, i8, |v: i8| v as f64
        );

        // Floating point.
        $macro!(
            f64, f64_chunk, f64_chunk_with_rids, f64_run, f64_run_with_rids,
            arrow::array::Float64Array, arrow::datatypes::Float64Type,
            arrow::datatypes::DataType::Float64, f64, |v: f64| v
        );
        $macro!(
            f32, f32_chunk, f32_chunk_with_rids, f32_run, f32_run_with_rids,
            arrow::array::Float32Array, arrow::datatypes::Float32Type,
            arrow::datatypes::DataType::Float32, f32, |v: f32| v as f64
        );

        // Dates, passed with integer native types.
        $macro!(
            date64, date64_chunk, date64_chunk_with_rids, date64_run, date64_run_with_rids,
            arrow::array::Date64Array, arrow::datatypes::Date64Type,
            arrow::datatypes::DataType::Date64, i64, |v: i64| v as f64
        );
        $macro!(
            date32, date32_chunk, date32_chunk_with_rids, date32_run, date32_run_with_rids,
            arrow::array::Date32Array, arrow::datatypes::Date32Type,
            arrow::datatypes::DataType::Date32, i32, |v: i32| v as f64
        );
    };
}
301
/// Expands a callback macro once for the Arrow boolean type.
///
/// The callback named by `$macro` receives the same ten-argument shape that
/// `llkv_for_each_arrow_numeric!` emits: base name, four visitor method names,
/// Arrow array type, Arrow physical type, `DataType` variant, native Rust
/// type, and a closure converting the native value to a float (`true` maps to
/// `1.0`, `false` to `0.0`).
#[macro_export]
#[rustfmt::skip]
macro_rules! llkv_for_each_arrow_boolean {
    ($macro:ident) => {
        $macro!(
            bool, bool_chunk, bool_chunk_with_rids, bool_run, bool_run_with_rids,
            arrow::array::BooleanArray, arrow::datatypes::BooleanType,
            arrow::datatypes::DataType::Boolean, bool,
            |v: bool| if v { 1.0 } else { 0.0 }
        );
    };
}
320
321pub fn is_supported_arrow_type(dtype: &arrow::datatypes::DataType) -> bool {
322 use arrow::datatypes::DataType;
323
324 if matches!(dtype, DataType::Utf8 | DataType::LargeUtf8) {
325 return true;
326 }
327
328 let mut matched = false;
329
330 macro_rules! __llkv_match_dtype {
331 (
332 $base:ident,
333 $chunk_fn:ident,
334 $chunk_with_rids_fn:ident,
335 $run_fn:ident,
336 $run_with_rids_fn:ident,
337 $array_ty:ty,
338 $physical_ty:ty,
339 $dtype_expr:expr,
340 $native_ty:ty,
341 $cast_expr:expr
342 ) => {
343 if dtype == &$dtype_expr {
344 matched = true;
345 }
346 };
347 }
348
349 llkv_for_each_arrow_numeric!(__llkv_match_dtype);
350 llkv_for_each_arrow_boolean!(__llkv_match_dtype);
351
352 matched
353}
354
355pub fn supported_arrow_types() -> Vec<arrow::datatypes::DataType> {
356 use arrow::datatypes::DataType;
357
358 let mut types = vec![DataType::Utf8, DataType::LargeUtf8];
359
360 macro_rules! __llkv_push_dtype {
361 (
362 $base:ident,
363 $chunk_fn:ident,
364 $chunk_with_rids_fn:ident,
365 $run_fn:ident,
366 $run_with_rids_fn:ident,
367 $array_ty:ty,
368 $physical_ty:ty,
369 $dtype_expr:expr,
370 $native_ty:ty,
371 $cast_expr:expr
372 ) => {
373 types.push($dtype_expr.clone());
374 };
375 }
376
377 llkv_for_each_arrow_numeric!(__llkv_push_dtype);
378 llkv_for_each_arrow_boolean!(__llkv_push_dtype);
379
380 types
381}
382
383pub fn ensure_supported_arrow_type(dtype: &arrow::datatypes::DataType) -> Result<()> {
384 if is_supported_arrow_type(dtype) {
385 return Ok(());
386 }
387
388 let mut supported = supported_arrow_types()
389 .into_iter()
390 .map(|dtype| format!("{dtype:?}"))
391 .collect::<Vec<_>>();
392 supported.sort();
393 supported.dedup();
394
395 Err(Error::InvalidArgumentError(format!(
396 "unsupported Arrow type {dtype:?}; supported types are {}",
397 supported.join(", ")
398 )))
399}
400
401pub mod gather;
402pub mod parallel;
403pub mod store;
404pub mod types;
405
406pub use llkv_result::{Error, Result};
407pub use store::{
408 ColumnStore, IndexKind, ROW_ID_COLUMN_NAME,
409 scan::{self, ScanBuilder},
410};
411
/// Glob re-export of every public item in `store::debug`, reachable as
/// `debug::*` from the crate root.
pub mod debug {
    pub use super::store::debug::*;
}