velesdb_core/
column_store.rs

1//! Column-oriented storage for high-performance metadata filtering.
2//!
3//! This module provides a columnar storage format for frequently filtered fields,
4//! avoiding the overhead of JSON parsing during filter operations.
5//!
6//! # Performance Goals
7//!
8//! - Maintain 50M+ items/sec throughput at 100k items (vs 19M/s with JSON)
9//! - Cache-friendly sequential memory access
10//! - Support for common filter operations: Eq, Gt, Lt, In, Range
11//!
12//! # Architecture
13//!
14//! ```text
15//! ColumnStore
16//! ├── columns: HashMap<field_name, TypedColumn>
17//! │   ├── "category" -> StringColumn(Vec<Option<StringId>>)
18//! │   ├── "price"    -> IntColumn(Vec<Option<i64>>)
19//! │   └── "rating"   -> FloatColumn(Vec<Option<f64>>)
20//! └── string_table: StringTable (interning for strings)
21//! ```
22
23use roaring::RoaringBitmap;
24use rustc_hash::FxHashMap;
25use std::collections::HashMap;
26
27/// Interned string ID for fast equality comparisons.
28#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
29pub struct StringId(u32);
30
31/// String interning table for fast string comparisons.
32#[derive(Debug, Default)]
33pub struct StringTable {
34    /// String to ID mapping
35    string_to_id: FxHashMap<String, StringId>,
36    /// ID to string mapping (for retrieval)
37    id_to_string: Vec<String>,
38}
39
40impl StringTable {
41    /// Creates a new empty string table.
42    #[must_use]
43    pub fn new() -> Self {
44        Self::default()
45    }
46
47    /// Interns a string, returning its ID.
48    ///
49    /// If the string already exists, returns the existing ID.
50    pub fn intern(&mut self, s: &str) -> StringId {
51        if let Some(&id) = self.string_to_id.get(s) {
52            return id;
53        }
54
55        #[allow(clippy::cast_possible_truncation)]
56        let id = StringId(self.id_to_string.len() as u32);
57        self.id_to_string.push(s.to_string());
58        self.string_to_id.insert(s.to_string(), id);
59        id
60    }
61
62    /// Gets the string for an ID.
63    #[must_use]
64    pub fn get(&self, id: StringId) -> Option<&str> {
65        self.id_to_string.get(id.0 as usize).map(String::as_str)
66    }
67
68    /// Gets the ID for a string without interning.
69    #[must_use]
70    pub fn get_id(&self, s: &str) -> Option<StringId> {
71        self.string_to_id.get(s).copied()
72    }
73
74    /// Returns the number of interned strings.
75    #[must_use]
76    pub fn len(&self) -> usize {
77        self.id_to_string.len()
78    }
79
80    /// Returns true if the table is empty.
81    #[must_use]
82    pub fn is_empty(&self) -> bool {
83        self.id_to_string.is_empty()
84    }
85}
86
87/// A typed column storing values of a specific type.
88#[derive(Debug)]
89pub enum TypedColumn {
90    /// Integer column (i64)
91    Int(Vec<Option<i64>>),
92    /// Float column (f64)
93    Float(Vec<Option<f64>>),
94    /// String column (interned IDs)
95    String(Vec<Option<StringId>>),
96    /// Boolean column
97    Bool(Vec<Option<bool>>),
98}
99
100impl TypedColumn {
101    /// Creates a new integer column with the given capacity.
102    #[must_use]
103    pub fn new_int(capacity: usize) -> Self {
104        Self::Int(Vec::with_capacity(capacity))
105    }
106
107    /// Creates a new float column with the given capacity.
108    #[must_use]
109    pub fn new_float(capacity: usize) -> Self {
110        Self::Float(Vec::with_capacity(capacity))
111    }
112
113    /// Creates a new string column with the given capacity.
114    #[must_use]
115    pub fn new_string(capacity: usize) -> Self {
116        Self::String(Vec::with_capacity(capacity))
117    }
118
119    /// Creates a new boolean column with the given capacity.
120    #[must_use]
121    pub fn new_bool(capacity: usize) -> Self {
122        Self::Bool(Vec::with_capacity(capacity))
123    }
124
125    /// Returns the number of values in the column.
126    #[must_use]
127    pub fn len(&self) -> usize {
128        match self {
129            Self::Int(v) => v.len(),
130            Self::Float(v) => v.len(),
131            Self::String(v) => v.len(),
132            Self::Bool(v) => v.len(),
133        }
134    }
135
136    /// Returns true if the column is empty.
137    #[must_use]
138    pub fn is_empty(&self) -> bool {
139        self.len() == 0
140    }
141
142    /// Pushes a null value to the column.
143    pub fn push_null(&mut self) {
144        match self {
145            Self::Int(v) => v.push(None),
146            Self::Float(v) => v.push(None),
147            Self::String(v) => v.push(None),
148            Self::Bool(v) => v.push(None),
149        }
150    }
151}
152
153/// Column store for high-performance filtering.
154#[derive(Debug, Default)]
155pub struct ColumnStore {
156    /// Columns indexed by field name
157    columns: HashMap<String, TypedColumn>,
158    /// String interning table
159    string_table: StringTable,
160    /// Number of rows
161    row_count: usize,
162}
163
164impl ColumnStore {
165    /// Creates a new empty column store.
166    #[must_use]
167    pub fn new() -> Self {
168        Self::default()
169    }
170
171    /// Creates a column store with pre-defined indexed fields.
172    ///
173    /// # Arguments
174    ///
175    /// * `fields` - List of (`field_name`, `field_type`) tuples
176    #[must_use]
177    pub fn with_schema(fields: &[(&str, ColumnType)]) -> Self {
178        let mut store = Self::new();
179        for (name, col_type) in fields {
180            store.add_column(name, *col_type);
181        }
182        store
183    }
184
185    /// Adds a new column to the store.
186    pub fn add_column(&mut self, name: &str, col_type: ColumnType) {
187        let column = match col_type {
188            ColumnType::Int => TypedColumn::new_int(0),
189            ColumnType::Float => TypedColumn::new_float(0),
190            ColumnType::String => TypedColumn::new_string(0),
191            ColumnType::Bool => TypedColumn::new_bool(0),
192        };
193        self.columns.insert(name.to_string(), column);
194    }
195
196    /// Returns the number of rows in the store.
197    #[must_use]
198    pub fn row_count(&self) -> usize {
199        self.row_count
200    }
201
202    /// Returns the string table for string interning.
203    #[must_use]
204    pub fn string_table(&self) -> &StringTable {
205        &self.string_table
206    }
207
208    /// Returns a mutable reference to the string table.
209    pub fn string_table_mut(&mut self) -> &mut StringTable {
210        &mut self.string_table
211    }
212
213    /// Pushes values for a new row.
214    ///
215    /// Missing fields will be set to null.
216    pub fn push_row(&mut self, values: &[(&str, ColumnValue)]) {
217        // Build a map of provided values
218        let value_map: FxHashMap<&str, &ColumnValue> =
219            values.iter().map(|(k, v)| (*k, v)).collect();
220
221        // Update each column
222        for (name, column) in &mut self.columns {
223            if let Some(value) = value_map.get(name.as_str()) {
224                match value {
225                    ColumnValue::Null => column.push_null(),
226                    ColumnValue::Int(v) => {
227                        if let TypedColumn::Int(col) = column {
228                            col.push(Some(*v));
229                        } else {
230                            column.push_null();
231                        }
232                    }
233                    ColumnValue::Float(v) => {
234                        if let TypedColumn::Float(col) = column {
235                            col.push(Some(*v));
236                        } else {
237                            column.push_null();
238                        }
239                    }
240                    ColumnValue::String(id) => {
241                        if let TypedColumn::String(col) = column {
242                            col.push(Some(*id));
243                        } else {
244                            column.push_null();
245                        }
246                    }
247                    ColumnValue::Bool(v) => {
248                        if let TypedColumn::Bool(col) = column {
249                            col.push(Some(*v));
250                        } else {
251                            column.push_null();
252                        }
253                    }
254                }
255            } else {
256                column.push_null();
257            }
258        }
259
260        self.row_count += 1;
261    }
262
263    /// Gets a column by name.
264    #[must_use]
265    pub fn get_column(&self, name: &str) -> Option<&TypedColumn> {
266        self.columns.get(name)
267    }
268
269    /// Filters rows by equality on an integer column.
270    ///
271    /// Returns a vector of row indices that match.
272    #[must_use]
273    pub fn filter_eq_int(&self, column: &str, value: i64) -> Vec<usize> {
274        let Some(TypedColumn::Int(col)) = self.columns.get(column) else {
275            return Vec::new();
276        };
277
278        col.iter()
279            .enumerate()
280            .filter_map(|(idx, v)| if *v == Some(value) { Some(idx) } else { None })
281            .collect()
282    }
283
284    /// Filters rows by equality on a string column.
285    ///
286    /// Returns a vector of row indices that match.
287    #[must_use]
288    pub fn filter_eq_string(&self, column: &str, value: &str) -> Vec<usize> {
289        let Some(TypedColumn::String(col)) = self.columns.get(column) else {
290            return Vec::new();
291        };
292
293        let Some(string_id) = self.string_table.get_id(value) else {
294            return Vec::new(); // String not in table, no matches
295        };
296
297        col.iter()
298            .enumerate()
299            .filter_map(|(idx, v)| {
300                if *v == Some(string_id) {
301                    Some(idx)
302                } else {
303                    None
304                }
305            })
306            .collect()
307    }
308
309    /// Filters rows by range on an integer column (value > threshold).
310    ///
311    /// Returns a vector of row indices that match.
312    #[must_use]
313    pub fn filter_gt_int(&self, column: &str, threshold: i64) -> Vec<usize> {
314        let Some(TypedColumn::Int(col)) = self.columns.get(column) else {
315            return Vec::new();
316        };
317
318        col.iter()
319            .enumerate()
320            .filter_map(|(idx, v)| match v {
321                Some(val) if *val > threshold => Some(idx),
322                _ => None,
323            })
324            .collect()
325    }
326
327    /// Filters rows by range on an integer column (value < threshold).
328    #[must_use]
329    pub fn filter_lt_int(&self, column: &str, threshold: i64) -> Vec<usize> {
330        let Some(TypedColumn::Int(col)) = self.columns.get(column) else {
331            return Vec::new();
332        };
333
334        col.iter()
335            .enumerate()
336            .filter_map(|(idx, v)| match v {
337                Some(val) if *val < threshold => Some(idx),
338                _ => None,
339            })
340            .collect()
341    }
342
343    /// Filters rows by range on an integer column (low < value < high).
344    #[must_use]
345    pub fn filter_range_int(&self, column: &str, low: i64, high: i64) -> Vec<usize> {
346        let Some(TypedColumn::Int(col)) = self.columns.get(column) else {
347            return Vec::new();
348        };
349
350        col.iter()
351            .enumerate()
352            .filter_map(|(idx, v)| match v {
353                Some(val) if *val > low && *val < high => Some(idx),
354                _ => None,
355            })
356            .collect()
357    }
358
359    /// Filters rows by IN clause on a string column.
360    ///
361    /// Returns a vector of row indices that match any of the values.
362    #[must_use]
363    pub fn filter_in_string(&self, column: &str, values: &[&str]) -> Vec<usize> {
364        let Some(TypedColumn::String(col)) = self.columns.get(column) else {
365            return Vec::new();
366        };
367
368        // Convert string values to IDs
369        let ids: Vec<StringId> = values
370            .iter()
371            .filter_map(|s| self.string_table.get_id(s))
372            .collect();
373
374        if ids.is_empty() {
375            return Vec::new();
376        }
377
378        // Perf: Use HashSet only for large IN clauses (>16 values)
379        // Vec.contains() is faster for small arrays due to cache locality
380        if ids.len() > 16 {
381            let id_set: rustc_hash::FxHashSet<StringId> = ids.into_iter().collect();
382            col.iter()
383                .enumerate()
384                .filter_map(|(idx, v)| match v {
385                    Some(id) if id_set.contains(id) => Some(idx),
386                    _ => None,
387                })
388                .collect()
389        } else {
390            col.iter()
391                .enumerate()
392                .filter_map(|(idx, v)| match v {
393                    Some(id) if ids.contains(id) => Some(idx),
394                    _ => None,
395                })
396                .collect()
397        }
398    }
399
400    /// Counts rows matching equality on an integer column.
401    ///
402    /// More efficient than `filter_eq_int().len()` as it doesn't allocate.
403    #[must_use]
404    pub fn count_eq_int(&self, column: &str, value: i64) -> usize {
405        let Some(TypedColumn::Int(col)) = self.columns.get(column) else {
406            return 0;
407        };
408
409        col.iter().filter(|v| **v == Some(value)).count()
410    }
411
412    /// Counts rows matching equality on a string column.
413    #[must_use]
414    pub fn count_eq_string(&self, column: &str, value: &str) -> usize {
415        let Some(TypedColumn::String(col)) = self.columns.get(column) else {
416            return 0;
417        };
418
419        let Some(string_id) = self.string_table.get_id(value) else {
420            return 0;
421        };
422
423        col.iter().filter(|v| **v == Some(string_id)).count()
424    }
425
426    // =========================================================================
427    // Optimized Bitmap-based Filtering (for 100k+ items)
428    // =========================================================================
429
430    /// Filters rows by equality on an integer column, returning a bitmap.
431    ///
432    /// Uses `RoaringBitmap` for memory-efficient storage of matching indices.
433    /// Useful for combining multiple filters with AND/OR operations.
434    #[must_use]
435    #[allow(clippy::cast_possible_truncation)]
436    pub fn filter_eq_int_bitmap(&self, column: &str, value: i64) -> RoaringBitmap {
437        let Some(TypedColumn::Int(col)) = self.columns.get(column) else {
438            return RoaringBitmap::new();
439        };
440
441        col.iter()
442            .enumerate()
443            .filter_map(|(idx, v)| {
444                if *v == Some(value) {
445                    Some(idx as u32)
446                } else {
447                    None
448                }
449            })
450            .collect()
451    }
452
453    /// Filters rows by equality on a string column, returning a bitmap.
454    #[must_use]
455    #[allow(clippy::cast_possible_truncation)]
456    pub fn filter_eq_string_bitmap(&self, column: &str, value: &str) -> RoaringBitmap {
457        let Some(TypedColumn::String(col)) = self.columns.get(column) else {
458            return RoaringBitmap::new();
459        };
460
461        let Some(string_id) = self.string_table.get_id(value) else {
462            return RoaringBitmap::new();
463        };
464
465        col.iter()
466            .enumerate()
467            .filter_map(|(idx, v)| {
468                if *v == Some(string_id) {
469                    Some(idx as u32)
470                } else {
471                    None
472                }
473            })
474            .collect()
475    }
476
477    /// Filters rows by range on an integer column, returning a bitmap.
478    #[must_use]
479    #[allow(clippy::cast_possible_truncation)]
480    pub fn filter_range_int_bitmap(&self, column: &str, low: i64, high: i64) -> RoaringBitmap {
481        let Some(TypedColumn::Int(col)) = self.columns.get(column) else {
482            return RoaringBitmap::new();
483        };
484
485        col.iter()
486            .enumerate()
487            .filter_map(|(idx, v)| match v {
488                Some(val) if *val > low && *val < high => Some(idx as u32),
489                _ => None,
490            })
491            .collect()
492    }
493
494    /// Combines two filter results using AND.
495    ///
496    /// Returns indices that are in both bitmaps.
497    #[must_use]
498    pub fn bitmap_and(a: &RoaringBitmap, b: &RoaringBitmap) -> RoaringBitmap {
499        a & b
500    }
501
502    /// Combines two filter results using OR.
503    ///
504    /// Returns indices that are in either bitmap.
505    #[must_use]
506    pub fn bitmap_or(a: &RoaringBitmap, b: &RoaringBitmap) -> RoaringBitmap {
507        a | b
508    }
509}
510
511/// Column type for schema definition.
512#[derive(Debug, Clone, Copy, PartialEq, Eq)]
513pub enum ColumnType {
514    /// 64-bit signed integer
515    Int,
516    /// 64-bit floating point
517    Float,
518    /// Interned string
519    String,
520    /// Boolean
521    Bool,
522}
523
524/// A value that can be stored in a column.
525#[derive(Debug, Clone)]
526pub enum ColumnValue {
527    /// Integer value
528    Int(i64),
529    /// Float value
530    Float(f64),
531    /// String ID (must be interned first)
532    String(StringId),
533    /// Boolean value
534    Bool(bool),
535    /// Null value
536    Null,
537}