Skip to main content

sochdb_storage/
row_format.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// SochDB - LLM-Optimized Embedded Database
3// Copyright (C) 2026 Sushanth Reddy Vanagala (https://github.com/sushanthpy)
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU Affero General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU Affero General Public License for more details.
14//
15// You should have received a copy of the GNU Affero General Public License
16// along with this program. If not, see <https://www.gnu.org/licenses/>.
17
18//! Slot-Based Columnar Row Storage (PAX/Hybrid Format) - Recommendation 1
19//!
20//! ## Problem
21//!
22//! Current implementation stores each row as HashMap<String, SochValue>, which incurs:
23//! - 48 bytes minimum HashMap overhead per row
24//! - 24 bytes per String key (pointer + length + capacity)
25//! - 16-32 bytes per SochValue (enum tag + union)
26//! - Heap fragmentation from individual allocations
27//!
28//! For a 4-column row: 48 + 4×(24 + 24) = 240 bytes vs SQLite's ~50 bytes
29//!
30//! ## Solution
31//!
32//! Slot-based row format: fixed header + variable payload
33//! - Row ID (8 bytes)
34//! - Null bitmap (2 bytes for 16 columns)
35//! - Slot count (1 byte)
36//! - Flags (1 byte for deleted, MVCC markers)
37//! - MVCC timestamps (16 bytes)
38//! - Slot array: [offset: u16, len: u16] per column
39//! - Data follows slots contiguously
40//!
41//! ## Performance Analysis
42//!
43//! | Component    | Current   | Proposed       | Savings |
44//! |--------------|-----------|----------------|---------|
45//! | Header       | 48 (HashMap)| 28 (SlotRow)  | 42%     |
46//! | 4 String keys| 96        | 0 (schema-indexed)| 100% |
47//! | 4 SochValues | 128       | 32 (raw bytes) | 75%     |
48//! | Total        | 272 bytes | 60 bytes       | 78%     |
49//!
50//! Cache analysis: With 64KB pages and 60-byte rows, we get 1,092 rows per page
51//! vs current ~240 rows per page with HashMap approach.
52
53/// Minimum slot row header size (without slots)
54pub const SLOT_ROW_HEADER_SIZE: usize = 28;
55
56/// Maximum columns supported by null bitmap (u16)
57pub const MAX_SLOT_COLUMNS: usize = 16;
58
59/// Slot row flags
60#[derive(Debug, Clone, Copy, PartialEq, Eq)]
61#[repr(u8)]
62pub enum SlotRowFlags {
63    /// Normal row
64    Normal = 0,
65    /// Deleted (tombstone)
66    Deleted = 1,
67    /// Uncommitted (MVCC)
68    Uncommitted = 2,
69    /// Compressed
70    Compressed = 4,
71}
72
73impl SlotRowFlags {
74    pub fn is_deleted(&self) -> bool {
75        (*self as u8) & 1 != 0
76    }
77
78    pub fn is_uncommitted(&self) -> bool {
79        (*self as u8) & 2 != 0
80    }
81
82    pub fn is_compressed(&self) -> bool {
83        (*self as u8) & 4 != 0
84    }
85}
86
87/// Slot entry: offset and length for a column value
88#[derive(Debug, Clone, Copy)]
89#[repr(C, packed)]
90pub struct Slot {
91    /// Offset from data start (u16 allows 64KB max row size)
92    pub offset: u16,
93    /// Length of value in bytes
94    pub len: u16,
95}
96
97impl Slot {
98    pub const SIZE: usize = 4;
99
100    #[inline]
101    pub fn new(offset: u16, len: u16) -> Self {
102        Self { offset, len }
103    }
104
105    #[inline]
106    pub fn is_null(&self) -> bool {
107        self.len == 0 && self.offset == 0xFFFF
108    }
109
110    #[inline]
111    pub fn null() -> Self {
112        Self {
113            offset: 0xFFFF,
114            len: 0,
115        }
116    }
117}
118
119/// Slot-based row format with O(1) field access
120///
121/// ## Memory Layout
122///
123/// ```text
124/// ┌───────────────────────────────────────────────────────┐
125/// │ Header (28 bytes)                                      │
126/// │  row_id: u64           (8 bytes)                       │
127/// │  null_bitmap: u16      (2 bytes)                       │
128/// │  slot_count: u8        (1 byte)                        │
129/// │  flags: u8             (1 byte)                        │
130/// │  txn_start: u64        (8 bytes) MVCC                  │
131/// │  txn_end: u64          (8 bytes) MVCC                  │
132/// ├───────────────────────────────────────────────────────┤
133/// │ Slots: [offset: u16, len: u16] × slot_count           │
134/// ├───────────────────────────────────────────────────────┤
135/// │ Data: contiguous column values                         │
136/// └───────────────────────────────────────────────────────┘
137/// ```
138#[repr(C)]
139pub struct SlotRow {
140    /// Row identifier
141    row_id: u64,
142    /// Null bitmap (bit i = 1 means column i is NULL)
143    null_bitmap: u16,
144    /// Number of columns (slots)
145    slot_count: u8,
146    /// Row flags (deleted, uncommitted, etc.)
147    flags: u8,
148    /// MVCC: Transaction that created this version
149    txn_start: u64,
150    /// MVCC: Transaction that deleted this version (u64::MAX = not deleted)
151    txn_end: u64,
152    /// Raw storage for slots + data
153    /// Layout: [Slot; slot_count][data bytes...]
154    storage: Vec<u8>,
155}
156
157impl SlotRow {
158    /// Create a new slot row with specified column count
159    pub fn new(row_id: u64, slot_count: u8) -> Self {
160        assert!((slot_count as usize) <= MAX_SLOT_COLUMNS);
161        let slots_size = (slot_count as usize) * Slot::SIZE;
162        Self {
163            row_id,
164            null_bitmap: 0,
165            slot_count,
166            flags: SlotRowFlags::Normal as u8,
167            txn_start: 0,
168            txn_end: u64::MAX,
169            storage: vec![0u8; slots_size],
170        }
171    }
172
173    /// Create from raw values
174    pub fn from_values(row_id: u64, values: &[Option<&[u8]>]) -> Self {
175        let slot_count = values.len().min(MAX_SLOT_COLUMNS) as u8;
176        let slots_size = (slot_count as usize) * Slot::SIZE;
177
178        // Calculate total data size
179        let data_size: usize = values.iter().map(|v| v.map(|b| b.len()).unwrap_or(0)).sum();
180
181        let mut storage = vec![0u8; slots_size + data_size];
182        let mut null_bitmap = 0u16;
183        let mut data_offset = 0u16;
184
185        // Write slots and data
186        for (i, value) in values.iter().enumerate() {
187            let slot = match value {
188                Some(data) => {
189                    let slot = Slot::new(data_offset, data.len() as u16);
190                    // Write data
191                    let data_start = slots_size + data_offset as usize;
192                    storage[data_start..data_start + data.len()].copy_from_slice(data);
193                    data_offset += data.len() as u16;
194                    slot
195                }
196                None => {
197                    null_bitmap |= 1 << i;
198                    Slot::null()
199                }
200            };
201
202            // Write slot
203            let slot_start = i * Slot::SIZE;
204            storage[slot_start..slot_start + 2].copy_from_slice(&slot.offset.to_le_bytes());
205            storage[slot_start + 2..slot_start + 4].copy_from_slice(&slot.len.to_le_bytes());
206        }
207
208        Self {
209            row_id,
210            null_bitmap,
211            slot_count,
212            flags: SlotRowFlags::Normal as u8,
213            txn_start: 0,
214            txn_end: u64::MAX,
215            storage,
216        }
217    }
218
219    /// Get row ID
220    #[inline]
221    pub fn row_id(&self) -> u64 {
222        self.row_id
223    }
224
225    /// Get column count
226    #[inline]
227    pub fn column_count(&self) -> usize {
228        self.slot_count as usize
229    }
230
231    /// Check if column is NULL - O(1)
232    #[inline]
233    pub fn is_null(&self, column_idx: usize) -> bool {
234        if column_idx >= self.slot_count as usize {
235            return true;
236        }
237        (self.null_bitmap & (1 << column_idx)) != 0
238    }
239
240    /// Get slot for a column - O(1)
241    #[inline]
242    fn get_slot(&self, column_idx: usize) -> Option<Slot> {
243        if column_idx >= self.slot_count as usize {
244            return None;
245        }
246        let slot_start = column_idx * Slot::SIZE;
247        if slot_start + Slot::SIZE > self.storage.len() {
248            return None;
249        }
250
251        let offset = u16::from_le_bytes([self.storage[slot_start], self.storage[slot_start + 1]]);
252        let len = u16::from_le_bytes([self.storage[slot_start + 2], self.storage[slot_start + 3]]);
253
254        Some(Slot { offset, len })
255    }
256
257    /// Get column value as bytes - O(1)
258    ///
259    /// This is the key performance advantage: direct offset arithmetic
260    /// instead of HashMap lookup (~50ns → ~2ns)
261    #[inline]
262    pub fn get_bytes(&self, column_idx: usize) -> Option<&[u8]> {
263        if self.is_null(column_idx) {
264            return None;
265        }
266
267        let slot = self.get_slot(column_idx)?;
268        if slot.is_null() {
269            return None;
270        }
271
272        let slots_size = (self.slot_count as usize) * Slot::SIZE;
273        let data_start = slots_size + slot.offset as usize;
274        let data_end = data_start + slot.len as usize;
275
276        if data_end > self.storage.len() {
277            return None;
278        }
279
280        Some(&self.storage[data_start..data_end])
281    }
282
283    /// Get column as i64 - O(1)
284    #[inline]
285    pub fn get_i64(&self, column_idx: usize) -> Option<i64> {
286        let bytes = self.get_bytes(column_idx)?;
287        if bytes.len() != 8 {
288            return None;
289        }
290        Some(i64::from_le_bytes(bytes.try_into().ok()?))
291    }
292
293    /// Get column as u64 - O(1)
294    #[inline]
295    pub fn get_u64(&self, column_idx: usize) -> Option<u64> {
296        let bytes = self.get_bytes(column_idx)?;
297        if bytes.len() != 8 {
298            return None;
299        }
300        Some(u64::from_le_bytes(bytes.try_into().ok()?))
301    }
302
303    /// Get column as f64 - O(1)
304    #[inline]
305    pub fn get_f64(&self, column_idx: usize) -> Option<f64> {
306        let bytes = self.get_bytes(column_idx)?;
307        if bytes.len() != 8 {
308            return None;
309        }
310        Some(f64::from_le_bytes(bytes.try_into().ok()?))
311    }
312
313    /// Get column as bool - O(1)
314    #[inline]
315    pub fn get_bool(&self, column_idx: usize) -> Option<bool> {
316        let bytes = self.get_bytes(column_idx)?;
317        if bytes.is_empty() {
318            return None;
319        }
320        Some(bytes[0] != 0)
321    }
322
323    /// Get column as string - O(1)
324    #[inline]
325    pub fn get_str(&self, column_idx: usize) -> Option<&str> {
326        let bytes = self.get_bytes(column_idx)?;
327        std::str::from_utf8(bytes).ok()
328    }
329
330    /// Set MVCC timestamps
331    pub fn set_mvcc(&mut self, txn_start: u64, txn_end: u64) {
332        self.txn_start = txn_start;
333        self.txn_end = txn_end;
334    }
335
336    /// Get MVCC start timestamp
337    #[inline]
338    pub fn txn_start(&self) -> u64 {
339        self.txn_start
340    }
341
342    /// Get MVCC end timestamp
343    #[inline]
344    pub fn txn_end(&self) -> u64 {
345        self.txn_end
346    }
347
348    /// Check if visible at snapshot
349    #[inline]
350    pub fn is_visible_at(&self, snapshot_ts: u64) -> bool {
351        self.txn_start < snapshot_ts && snapshot_ts <= self.txn_end
352    }
353
354    /// Set deleted flag
355    pub fn set_deleted(&mut self) {
356        self.flags |= SlotRowFlags::Deleted as u8;
357    }
358
359    /// Check if deleted
360    #[inline]
361    pub fn is_deleted(&self) -> bool {
362        (self.flags & SlotRowFlags::Deleted as u8) != 0
363    }
364
365    /// Get total memory size
366    pub fn memory_size(&self) -> usize {
367        SLOT_ROW_HEADER_SIZE + self.storage.len()
368    }
369
370    /// Serialize to bytes
371    pub fn to_bytes(&self) -> Vec<u8> {
372        let total_size = SLOT_ROW_HEADER_SIZE + self.storage.len();
373        let mut buf = Vec::with_capacity(total_size);
374
375        buf.extend_from_slice(&self.row_id.to_le_bytes());
376        buf.extend_from_slice(&self.null_bitmap.to_le_bytes());
377        buf.push(self.slot_count);
378        buf.push(self.flags);
379        buf.extend_from_slice(&self.txn_start.to_le_bytes());
380        buf.extend_from_slice(&self.txn_end.to_le_bytes());
381        buf.extend_from_slice(&self.storage);
382
383        buf
384    }
385
386    /// Deserialize from bytes
387    pub fn from_bytes(data: &[u8]) -> Option<Self> {
388        if data.len() < SLOT_ROW_HEADER_SIZE {
389            return None;
390        }
391
392        let row_id = u64::from_le_bytes(data[0..8].try_into().ok()?);
393        let null_bitmap = u16::from_le_bytes(data[8..10].try_into().ok()?);
394        let slot_count = data[10];
395        let flags = data[11];
396        let txn_start = u64::from_le_bytes(data[12..20].try_into().ok()?);
397        let txn_end = u64::from_le_bytes(data[20..28].try_into().ok()?);
398        let storage = data[28..].to_vec();
399
400        Some(Self {
401            row_id,
402            null_bitmap,
403            slot_count,
404            flags,
405            txn_start,
406            txn_end,
407            storage,
408        })
409    }
410}
411
412impl std::fmt::Debug for SlotRow {
413    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
414        f.debug_struct("SlotRow")
415            .field("row_id", &self.row_id)
416            .field("slot_count", &self.slot_count)
417            .field("null_bitmap", &format!("{:016b}", self.null_bitmap))
418            .field("flags", &self.flags)
419            .field("txn_start", &self.txn_start)
420            .field("txn_end", &self.txn_end)
421            .field("storage_len", &self.storage.len())
422            .finish()
423    }
424}
425
426// =============================================================================
427// Arena-Based Slot Row Allocation
428// =============================================================================
429
430/// Arena allocator for SlotRow storage
431///
432/// Allocates rows from contiguous memory blocks to:
433/// - Reduce heap fragmentation
434/// - Improve cache locality
435/// - Enable efficient bulk deallocation
436pub struct SlotRowArena {
437    /// Memory blocks
438    blocks: Vec<Vec<u8>>,
439    /// Current block index
440    current_block: usize,
441    /// Current offset in current block
442    current_offset: usize,
443    /// Block size (default 64KB)
444    block_size: usize,
445    /// Total bytes allocated
446    total_allocated: usize,
447}
448
449impl SlotRowArena {
450    /// Default block size (64KB - L2 cache friendly)
451    pub const DEFAULT_BLOCK_SIZE: usize = 64 * 1024;
452
453    pub fn new() -> Self {
454        Self::with_block_size(Self::DEFAULT_BLOCK_SIZE)
455    }
456
457    pub fn with_block_size(block_size: usize) -> Self {
458        Self {
459            blocks: vec![vec![0u8; block_size]],
460            current_block: 0,
461            current_offset: 0,
462            block_size,
463            total_allocated: 0,
464        }
465    }
466
467    /// Allocate space for a row
468    pub fn allocate(&mut self, size: usize) -> &mut [u8] {
469        if self.current_offset + size > self.block_size {
470            // Need new block
471            self.blocks.push(vec![0u8; self.block_size.max(size)]);
472            self.current_block = self.blocks.len() - 1;
473            self.current_offset = 0;
474        }
475
476        let start = self.current_offset;
477        self.current_offset += size;
478        self.total_allocated += size;
479
480        &mut self.blocks[self.current_block][start..start + size]
481    }
482
483    /// Store a SlotRow and return handle
484    pub fn store(&mut self, row: &SlotRow) -> SlotRowHandle {
485        let bytes = row.to_bytes();
486        let slot = self.allocate(bytes.len());
487        slot.copy_from_slice(&bytes);
488
489        SlotRowHandle {
490            block_idx: self.current_block,
491            offset: self.current_offset - bytes.len(),
492            len: bytes.len(),
493        }
494    }
495
496    /// Get row from handle
497    pub fn get(&self, handle: &SlotRowHandle) -> Option<SlotRow> {
498        let block = self.blocks.get(handle.block_idx)?;
499        let data = block.get(handle.offset..handle.offset + handle.len)?;
500        SlotRow::from_bytes(data)
501    }
502
503    /// Get total allocated bytes
504    pub fn total_allocated(&self) -> usize {
505        self.total_allocated
506    }
507
508    /// Get number of blocks
509    pub fn block_count(&self) -> usize {
510        self.blocks.len()
511    }
512
513    /// Reset arena (keeps allocated memory)
514    pub fn reset(&mut self) {
515        self.current_block = 0;
516        self.current_offset = 0;
517        self.total_allocated = 0;
518    }
519}
520
521impl Default for SlotRowArena {
522    fn default() -> Self {
523        Self::new()
524    }
525}
526
527/// Handle to a SlotRow stored in arena
528#[derive(Debug, Clone, Copy)]
529pub struct SlotRowHandle {
530    block_idx: usize,
531    offset: usize,
532    len: usize,
533}
534
535impl SlotRowHandle {
536    pub fn len(&self) -> usize {
537        self.len
538    }
539
540    pub fn is_empty(&self) -> bool {
541        self.len == 0
542    }
543}
544
545#[cfg(test)]
546mod tests {
547    use super::*;
548
549    #[test]
550    fn test_slot_row_basic() {
551        let row = SlotRow::from_values(
552            1,
553            &[
554                Some(b"hello"),
555                Some(&42i64.to_le_bytes()),
556                None,
557                Some(b"world"),
558            ],
559        );
560
561        assert_eq!(row.row_id(), 1);
562        assert_eq!(row.column_count(), 4);
563        assert!(!row.is_null(0));
564        assert!(!row.is_null(1));
565        assert!(row.is_null(2));
566        assert!(!row.is_null(3));
567    }
568
569    #[test]
570    fn test_slot_row_get_bytes() {
571        let row = SlotRow::from_values(1, &[Some(b"hello"), Some(b"world")]);
572
573        assert_eq!(row.get_bytes(0), Some(b"hello".as_slice()));
574        assert_eq!(row.get_bytes(1), Some(b"world".as_slice()));
575        assert_eq!(row.get_bytes(2), None);
576    }
577
578    #[test]
579    #[allow(clippy::approx_constant)] // 3.14 is arbitrary test data, not PI
580    fn test_slot_row_get_typed() {
581        let row = SlotRow::from_values(
582            1,
583            &[
584                Some(&42i64.to_le_bytes()),
585                Some(&3.14f64.to_le_bytes()),
586                Some(&1u8.to_le_bytes()),
587            ],
588        );
589
590        assert_eq!(row.get_i64(0), Some(42));
591        assert_eq!(row.get_f64(1), Some(3.14));
592        assert_eq!(row.get_bool(2), Some(true));
593    }
594
595    #[test]
596    fn test_slot_row_get_str() {
597        let row = SlotRow::from_values(1, &[Some(b"hello world")]);
598
599        assert_eq!(row.get_str(0), Some("hello world"));
600    }
601
602    #[test]
603    fn test_slot_row_mvcc() {
604        let mut row = SlotRow::from_values(1, &[Some(b"test")]);
605        row.set_mvcc(100, 200);
606
607        assert_eq!(row.txn_start(), 100);
608        assert_eq!(row.txn_end(), 200);
609        assert!(row.is_visible_at(150));
610        assert!(!row.is_visible_at(50));
611        assert!(!row.is_visible_at(250));
612    }
613
614    #[test]
615    fn test_slot_row_serialize() {
616        let row = SlotRow::from_values(42, &[Some(b"hello"), Some(&123i64.to_le_bytes()), None]);
617
618        let bytes = row.to_bytes();
619        let restored = SlotRow::from_bytes(&bytes).unwrap();
620
621        assert_eq!(restored.row_id(), 42);
622        assert_eq!(restored.get_bytes(0), Some(b"hello".as_slice()));
623        assert_eq!(restored.get_i64(1), Some(123));
624        assert!(restored.is_null(2));
625    }
626
627    #[test]
628    fn test_slot_row_memory_size() {
629        let row = SlotRow::from_values(1, &[Some(b"hello"), Some(b"world")]);
630
631        // Header (28) + 2 slots (8) + data (10) = 46 bytes
632        // Much smaller than HashMap equivalent (~150+ bytes)
633        let size = row.memory_size();
634        assert!(size < 100, "SlotRow size {} should be < 100 bytes", size);
635    }
636
637    #[test]
638    fn test_slot_row_arena() {
639        let mut arena = SlotRowArena::new();
640
641        let row1 = SlotRow::from_values(1, &[Some(b"hello")]);
642        let row2 = SlotRow::from_values(2, &[Some(b"world")]);
643
644        let h1 = arena.store(&row1);
645        let h2 = arena.store(&row2);
646
647        let r1 = arena.get(&h1).unwrap();
648        let r2 = arena.get(&h2).unwrap();
649
650        assert_eq!(r1.row_id(), 1);
651        assert_eq!(r2.row_id(), 2);
652        assert_eq!(r1.get_str(0), Some("hello"));
653        assert_eq!(r2.get_str(0), Some("world"));
654    }
655
656    #[test]
657    fn test_slot_row_arena_many_rows() {
658        // Use small block size to force multiple blocks
659        let mut arena = SlotRowArena::with_block_size(1024);
660        let mut handles = Vec::new();
661
662        for i in 0..1000 {
663            let row = SlotRow::from_values(
664                i,
665                &[
666                    Some(&i.to_le_bytes()),
667                    Some(format!("row_{}", i).as_bytes()),
668                ],
669            );
670            handles.push(arena.store(&row));
671        }
672
673        // Verify all rows
674        for (i, handle) in handles.iter().enumerate() {
675            let row = arena.get(handle).unwrap();
676            assert_eq!(row.row_id(), i as u64);
677            assert_eq!(row.get_u64(0), Some(i as u64));
678        }
679
680        // Should use multiple blocks (1000 rows * ~50 bytes = ~50KB > 1024)
681        assert!(arena.block_count() > 1);
682    }
683}