llkv_column_map/
lib.rs

1//! Columnar storage engine for LLKV.
2//!
3//! This crate provides a low-level columnar storage layer that persists Apache Arrow
4//! [`RecordBatch`]es to disk and supports efficient scans, filters, and updates.
5//! It serves as the foundation for [`llkv-table`] and higher-level query execution.
6//!
7//! # Architecture
8//!
9//! The storage engine is organized into several key components:
10//!
11//! - **[`ColumnStore`]**: Primary interface for storing and retrieving columnar data.
12//!   Manages column descriptors, metadata catalogs, and coordinates with the pager
13//!   for persistent storage.
14//!
15//! - **[`LogicalFieldId`](types::LogicalFieldId)**: Namespaced identifier for columns.
16//!   Combines a namespace (user data, row ID shadow, MVCC metadata), table ID, and
17//!   field ID into a single 64-bit value to prevent collisions.
18//!
19//! - **[`ScanBuilder`]**: Builder pattern for constructing column scans with various
20//!   options (filters, ordering, row ID inclusion).
21//!
22//! - **Visitor Pattern**: Scans emit data through visitor callbacks rather than
23//!   materializing entire columns in memory, enabling streaming and aggregation.
24//!
25//! # Storage Model
26//!
27//! Data is stored in columnar chunks:
28//! - Each column is identified by a `LogicalFieldId`
29//! - Columns are broken into chunks for incremental writes
30//! - Each chunk stores Arrow-serialized data plus metadata (row count, min/max values)
31//! - Shadow columns track row IDs separately from user data
32//! - MVCC columns (`created_by`, `deleted_by`) track transaction visibility
33//!
34//! # Namespaces
35//!
36//! Columns are organized into namespaces to prevent ID collisions:
37//! - `UserData`: Regular table columns
38//! - `RowIdShadow`: Internal row ID tracking for each column
39//! - `TxnCreatedBy`: MVCC transaction that created each row
40//! - `TxnDeletedBy`: MVCC transaction that deleted each row
41//!
42//! # Thread Safety
43//!
44//! `ColumnStore` is thread-safe (`Send + Sync`) with internal locking for
45//! catalog updates. Read operations can occur concurrently; writes are
46//! serialized through the catalog lock.
47//!
48//! [`RecordBatch`]: arrow::record_batch::RecordBatch
49//! [`llkv-table`]: https://docs.rs/llkv-table
50//! [`ColumnStore`]: store::ColumnStore
51//! [`ScanBuilder`]: scan::ScanBuilder
52//!
53//! # Macros and Type Dispatch
54//!
55//! This crate provides macros for efficient type-specific operations without runtime
56//! dispatch overhead. See [`with_integer_arrow_type!`] for details.
57
58// NOTE: rustfmt currently re-indents portions of macro_rules! blocks in this
59// file (observed when running `cargo fmt`). This produces noisy diffs and
60// churn because rustfmt will flip formatting between runs. The problematic
61// locations in this module are the macro_rules! dispatch macros declared
62// below. Until the underlying rustfmt bug is fixed, we intentionally opt out
63// of automatic formatting for those specific macros using `#[rustfmt::skip]`,
64// while keeping the rest of the module formatted normally.
65//
66// Reproduction / debugging tips for contributors:
67// - Run `rustup run stable rustfmt -- --version` to confirm the rustfmt
68//   version, then `cargo fmt` to reproduce the behavior.
69// - Narrow the change by running rustfmt on this file only:
//     rustfmt llkv-column-map/src/lib.rs
71// - If you can produce a minimal self-contained example that triggers the
72//   re-indent, open an issue with rustfmt (include rustfmt version and the
73//   minimal example) and link it here.
74//
75// NOTE: Once a minimal reproducer for the rustfmt regression exists, link the
76// upstream issue here and remove the `#[rustfmt::skip]` attributes so the file
77// can return to standard formatting. Progress is tracked at
78// https://github.com/rust-lang/rustfmt/issues/6629#issuecomment-3395446770.
79
/// Dispatches to type-specific code based on an Arrow `DataType`.
///
/// The supplied `DataType` is compared against every entry enumerated by
/// [`llkv_for_each_arrow_numeric!`]. For the entry that matches, `$body` is
/// evaluated with `$ty` aliased to that entry's Arrow primitive type (for
/// example `arrow::datatypes::UInt64Type`). Despite the macro's name, the
/// candidate list is whatever `llkv_for_each_arrow_numeric!` provides, which
/// includes floating-point and date types in addition to integers.
///
/// # Parameters
///
/// - `$dtype` - Expression whose value can be borrowed as
///   `&arrow::datatypes::DataType` through [`std::borrow::Borrow`]. It is
///   evaluated exactly once.
/// - `$ty` - Identifier bound (through a local `type` alias) to the matched
///   Arrow primitive type (e.g., `UInt64Type`)
/// - `$body` - Expression to evaluate with `$ty` bound to the matched type.
///   It is expanded once per candidate type, so it has to type-check for each
///   of them and every expansion must produce the same result type.
/// - `$unsupported` - Fallback expression, evaluated only if no candidate type
///   matches. It is expanded inside a closure, so a `return` written in it
///   leaves that closure rather than the calling function.
///
/// # Performance
///
/// `$body` is instantiated separately for every candidate type, so the code
/// that runs for the matching type is statically typed. The only runtime work
/// added by the dispatch itself is the sequence of `DataType` equality checks.
///
/// # Use from other crates
///
/// The type table is reached through `$crate::`, so importing this macro alone
/// is sufficient. The expansion still names `arrow::...` paths directly, so
/// `arrow` must be resolvable at the call site.
#[macro_export]
#[rustfmt::skip]
macro_rules! with_integer_arrow_type {
    ($dtype:expr, |$ty:ident| $body:expr, $unsupported:expr $(,)?) => {{
        use std::borrow::Borrow;

        // Bind the argument first so it is evaluated once and outlives the borrow.
        let dtype_value = $dtype;
        let dtype_ref: &arrow::datatypes::DataType = dtype_value.borrow();
        let mut result: Option<_> = None;

        // Callback handed to the type table: only the physical type and the
        // `DataType` expression of each entry are used here, the remaining
        // arguments are accepted so the arity matches the table.
        macro_rules! __llkv_dispatch_integer_arrow_type {
            (
                $base:ident,
                $chunk_fn:ident,
                $chunk_with_rids_fn:ident,
                $run_fn:ident,
                $run_with_rids_fn:ident,
                $array_ty:ty,
                $physical_ty:ty,
                $dtype_expr:expr,
                $native_ty:ty,
                $cast_expr:expr
            ) => {
                if dtype_ref == &$dtype_expr {
                    type $ty = $physical_ty;
                    result = Some($body);
                }
            };
        }

        // `$crate::` keeps the sibling macro resolvable even when the caller
        // has only imported `with_integer_arrow_type`.
        $crate::llkv_for_each_arrow_numeric!(__llkv_dispatch_integer_arrow_type);

        result.unwrap_or_else(|| $unsupported)
    }};
}
132
/// Invokes a macro for each supported Arrow numeric type.
///
/// This is a helper macro that generates repetitive type-specific code. It takes
/// the identifier of a callback macro and expands to one `callback!(...);`
/// invocation per table entry, in this order: `u64`, `u32`, `u16`, `u8`, `i64`,
/// `i32`, `i16`, `i8`, `f64`, `f32`, `date64`, `date32`.
///
/// # Macro Arguments Provided to Callback
///
/// For each type, the callback macro receives:
/// 1. Base type name (e.g., `u64`, `i32`, `f64`, `date64`)
/// 2. Chunk visitor method name (e.g., `u64_chunk`)
/// 3. Chunk with row IDs visitor method name (e.g., `u64_chunk_with_rids`)
/// 4. Run visitor method name (e.g., `u64_run`)
/// 5. Run with row IDs visitor method name (e.g., `u64_run_with_rids`)
/// 6. Arrow array type (e.g., `arrow::array::UInt64Array`)
/// 7. Arrow physical type (e.g., `arrow::datatypes::UInt64Type`)
/// 8. Arrow DataType enum variant (e.g., `arrow::datatypes::DataType::UInt64`)
/// 9. Native Rust type (e.g., `u64`); the date entries pass `i64` (`date64`)
///    and `i32` (`date32`) here, so the base name and the native type differ
/// 10. Closure expression converting a native value to `f64`
///     (e.g., `|v: u64| v as f64`)
///
/// The callback must accept all ten comma-separated arguments even if it only
/// uses some of them.
#[macro_export]
#[rustfmt::skip]
macro_rules! llkv_for_each_arrow_numeric {
    ($macro:ident) => {
        // Unsigned integers.
        $macro!(
            u64,
            u64_chunk,
            u64_chunk_with_rids,
            u64_run,
            u64_run_with_rids,
            arrow::array::UInt64Array,
            arrow::datatypes::UInt64Type,
            arrow::datatypes::DataType::UInt64,
            u64,
            |v: u64| v as f64
        );
        $macro!(
            u32,
            u32_chunk,
            u32_chunk_with_rids,
            u32_run,
            u32_run_with_rids,
            arrow::array::UInt32Array,
            arrow::datatypes::UInt32Type,
            arrow::datatypes::DataType::UInt32,
            u32,
            |v: u32| v as f64
        );
        $macro!(
            u16,
            u16_chunk,
            u16_chunk_with_rids,
            u16_run,
            u16_run_with_rids,
            arrow::array::UInt16Array,
            arrow::datatypes::UInt16Type,
            arrow::datatypes::DataType::UInt16,
            u16,
            |v: u16| v as f64
        );
        $macro!(
            u8,
            u8_chunk,
            u8_chunk_with_rids,
            u8_run,
            u8_run_with_rids,
            arrow::array::UInt8Array,
            arrow::datatypes::UInt8Type,
            arrow::datatypes::DataType::UInt8,
            u8,
            |v: u8| v as f64
        );
        // Signed integers.
        $macro!(
            i64,
            i64_chunk,
            i64_chunk_with_rids,
            i64_run,
            i64_run_with_rids,
            arrow::array::Int64Array,
            arrow::datatypes::Int64Type,
            arrow::datatypes::DataType::Int64,
            i64,
            |v: i64| v as f64
        );
        $macro!(
            i32,
            i32_chunk,
            i32_chunk_with_rids,
            i32_run,
            i32_run_with_rids,
            arrow::array::Int32Array,
            arrow::datatypes::Int32Type,
            arrow::datatypes::DataType::Int32,
            i32,
            |v: i32| v as f64
        );
        $macro!(
            i16,
            i16_chunk,
            i16_chunk_with_rids,
            i16_run,
            i16_run_with_rids,
            arrow::array::Int16Array,
            arrow::datatypes::Int16Type,
            arrow::datatypes::DataType::Int16,
            i16,
            |v: i16| v as f64
        );
        $macro!(
            i8,
            i8_chunk,
            i8_chunk_with_rids,
            i8_run,
            i8_run_with_rids,
            arrow::array::Int8Array,
            arrow::datatypes::Int8Type,
            arrow::datatypes::DataType::Int8,
            i8,
            |v: i8| v as f64
        );
        // Floating point; the `f64` cast closure is the identity.
        $macro!(
            f64,
            f64_chunk,
            f64_chunk_with_rids,
            f64_run,
            f64_run_with_rids,
            arrow::array::Float64Array,
            arrow::datatypes::Float64Type,
            arrow::datatypes::DataType::Float64,
            f64,
            |v: f64| v
        );
        $macro!(
            f32,
            f32_chunk,
            f32_chunk_with_rids,
            f32_run,
            f32_run_with_rids,
            arrow::array::Float32Array,
            arrow::datatypes::Float32Type,
            arrow::datatypes::DataType::Float32,
            f32,
            |v: f32| v as f64
        );
        // Dates: the native type passed is the integer (`i64` / `i32`), not
        // the base name.
        $macro!(
            date64,
            date64_chunk,
            date64_chunk_with_rids,
            date64_run,
            date64_run_with_rids,
            arrow::array::Date64Array,
            arrow::datatypes::Date64Type,
            arrow::datatypes::DataType::Date64,
            i64,
            |v: i64| v as f64
        );
        $macro!(
            date32,
            date32_chunk,
            date32_chunk_with_rids,
            date32_run,
            date32_run_with_rids,
            arrow::array::Date32Array,
            arrow::datatypes::Date32Type,
            arrow::datatypes::DataType::Date32,
            i32,
            |v: i32| v as f64
        );
    };
}
301
/// Invokes a macro once for the Arrow boolean type.
///
/// Boolean counterpart of `llkv_for_each_arrow_numeric!`: it takes the
/// identifier of a callback macro and expands to a single `callback!(...);`
/// invocation carrying the same ten positional arguments.
///
/// # Macro Arguments Provided to Callback
///
/// 1. Base type name (`bool`)
/// 2. Chunk visitor method name (`bool_chunk`)
/// 3. Chunk with row IDs visitor method name (`bool_chunk_with_rids`)
/// 4. Run visitor method name (`bool_run`)
/// 5. Run with row IDs visitor method name (`bool_run_with_rids`)
/// 6. Arrow array type (`arrow::array::BooleanArray`)
/// 7. Arrow type marker (`arrow::datatypes::BooleanType`)
/// 8. Arrow DataType enum variant (`arrow::datatypes::DataType::Boolean`)
/// 9. Native Rust type (`bool`)
/// 10. Closure expression mapping `true` to `1.0` and `false` to `0.0`
#[macro_export]
#[rustfmt::skip]
macro_rules! llkv_for_each_arrow_boolean {
    ($macro:ident) => {
        $macro!(
            bool,
            bool_chunk,
            bool_chunk_with_rids,
            bool_run,
            bool_run_with_rids,
            arrow::array::BooleanArray,
            arrow::datatypes::BooleanType,
            arrow::datatypes::DataType::Boolean,
            bool,
            |v: bool| if v { 1.0 } else { 0.0 }
        );
    };
}
320
321pub fn is_supported_arrow_type(dtype: &arrow::datatypes::DataType) -> bool {
322    use arrow::datatypes::DataType;
323
324    if matches!(dtype, DataType::Utf8 | DataType::LargeUtf8) {
325        return true;
326    }
327
328    let mut matched = false;
329
330    macro_rules! __llkv_match_dtype {
331        (
332            $base:ident,
333            $chunk_fn:ident,
334            $chunk_with_rids_fn:ident,
335            $run_fn:ident,
336            $run_with_rids_fn:ident,
337            $array_ty:ty,
338            $physical_ty:ty,
339            $dtype_expr:expr,
340            $native_ty:ty,
341            $cast_expr:expr
342        ) => {
343            if dtype == &$dtype_expr {
344                matched = true;
345            }
346        };
347    }
348
349    llkv_for_each_arrow_numeric!(__llkv_match_dtype);
350    llkv_for_each_arrow_boolean!(__llkv_match_dtype);
351
352    matched
353}
354
355pub fn supported_arrow_types() -> Vec<arrow::datatypes::DataType> {
356    use arrow::datatypes::DataType;
357
358    let mut types = vec![DataType::Utf8, DataType::LargeUtf8];
359
360    macro_rules! __llkv_push_dtype {
361        (
362            $base:ident,
363            $chunk_fn:ident,
364            $chunk_with_rids_fn:ident,
365            $run_fn:ident,
366            $run_with_rids_fn:ident,
367            $array_ty:ty,
368            $physical_ty:ty,
369            $dtype_expr:expr,
370            $native_ty:ty,
371            $cast_expr:expr
372        ) => {
373            types.push($dtype_expr.clone());
374        };
375    }
376
377    llkv_for_each_arrow_numeric!(__llkv_push_dtype);
378    llkv_for_each_arrow_boolean!(__llkv_push_dtype);
379
380    types
381}
382
383pub fn ensure_supported_arrow_type(dtype: &arrow::datatypes::DataType) -> Result<()> {
384    if is_supported_arrow_type(dtype) {
385        return Ok(());
386    }
387
388    let mut supported = supported_arrow_types()
389        .into_iter()
390        .map(|dtype| format!("{dtype:?}"))
391        .collect::<Vec<_>>();
392    supported.sort();
393    supported.dedup();
394
395    Err(Error::InvalidArgumentError(format!(
396        "unsupported Arrow type {dtype:?}; supported types are {}",
397        supported.join(", ")
398    )))
399}
400
401pub mod gather;
402pub mod parallel;
403pub mod store;
404pub mod types;
405
406pub use llkv_result::{Error, Result};
407pub use store::{
408    ColumnStore, IndexKind, ROW_ID_COLUMN_NAME,
409    scan::{self, ScanBuilder},
410};
411
/// Crate-level alias for the store's debugging items.
///
/// Glob re-exports everything public from `store::debug`, so those items are
/// also reachable under this crate's `debug` module.
pub mod debug {
    pub use super::store::debug::*;
}