Skip to main content

sochdb_storage/
row_format.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// SochDB - LLM-Optimized Embedded Database
3// Copyright (C) 2026 Sushanth Reddy Vanagala (https://github.com/sushanthpy)
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU Affero General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU Affero General Public License for more details.
14//
15// You should have received a copy of the GNU Affero General Public License
16// along with this program. If not, see <https://www.gnu.org/licenses/>.
17
18//! Slot-Based Columnar Row Storage (PAX/Hybrid Format) - Recommendation 1
19//!
20//! ## Problem
21//!
22//! Current implementation stores each row as HashMap<String, SochValue>, which incurs:
23//! - 48 bytes minimum HashMap overhead per row
24//! - 24 bytes per String key (pointer + length + capacity)
25//! - 16-32 bytes per SochValue (enum tag + union)
26//! - Heap fragmentation from individual allocations
27//!
28//! For a 4-column row: 48 + 4×(24 + 24) = 240 bytes vs SQLite's ~50 bytes
29//!
30//! ## Solution
31//!
32//! Slot-based row format: fixed header + variable payload
33//! - Row ID (8 bytes)
34//! - Null bitmap (2 bytes for 16 columns)
35//! - Slot count (1 byte)
36//! - Flags (1 byte for deleted, MVCC markers)
37//! - MVCC timestamps (16 bytes)
38//! - Slot array: [offset: u16, len: u16] per column
39//! - Data follows slots contiguously
40//!
41//! ## Performance Analysis
42//!
43//! | Component    | Current   | Proposed       | Savings |
44//! |--------------|-----------|----------------|---------|
45//! | Header       | 48 (HashMap)| 28 (SlotRow)  | 42%     |
46//! | 4 String keys| 96        | 0 (schema-indexed)| 100% |
47//! | 4 SochValues | 128       | 32 (raw bytes) | 75%     |
48//! | Total        | 272 bytes | 60 bytes       | 78%     |
49//!
50//! Cache analysis: With 64KB pages and 60-byte rows, we get 1,092 rows per page
51//! vs current ~240 rows per page with HashMap approach.
52
53/// Minimum slot row header size (without slots)
54pub const SLOT_ROW_HEADER_SIZE: usize = 28;
55
56/// Maximum columns supported by null bitmap (u16)
57pub const MAX_SLOT_COLUMNS: usize = 16;
58
59/// Slot row flags
60#[derive(Debug, Clone, Copy, PartialEq, Eq)]
61#[repr(u8)]
62pub enum SlotRowFlags {
63    /// Normal row
64    Normal = 0,
65    /// Deleted (tombstone)
66    Deleted = 1,
67    /// Uncommitted (MVCC)
68    Uncommitted = 2,
69    /// Compressed
70    Compressed = 4,
71}
72
73impl SlotRowFlags {
74    pub fn is_deleted(&self) -> bool {
75        (*self as u8) & 1 != 0
76    }
77
78    pub fn is_uncommitted(&self) -> bool {
79        (*self as u8) & 2 != 0
80    }
81
82    pub fn is_compressed(&self) -> bool {
83        (*self as u8) & 4 != 0
84    }
85}
86
87/// Slot entry: offset and length for a column value
88#[derive(Debug, Clone, Copy)]
89#[repr(C, packed)]
90pub struct Slot {
91    /// Offset from data start (u16 allows 64KB max row size)
92    pub offset: u16,
93    /// Length of value in bytes
94    pub len: u16,
95}
96
97impl Slot {
98    pub const SIZE: usize = 4;
99
100    #[inline]
101    pub fn new(offset: u16, len: u16) -> Self {
102        Self { offset, len }
103    }
104
105    #[inline]
106    pub fn is_null(&self) -> bool {
107        self.len == 0 && self.offset == 0xFFFF
108    }
109
110    #[inline]
111    pub fn null() -> Self {
112        Self { offset: 0xFFFF, len: 0 }
113    }
114}
115
116/// Slot-based row format with O(1) field access
117///
118/// ## Memory Layout
119///
120/// ```text
121/// ┌───────────────────────────────────────────────────────┐
122/// │ Header (28 bytes)                                      │
123/// │  row_id: u64           (8 bytes)                       │
124/// │  null_bitmap: u16      (2 bytes)                       │
125/// │  slot_count: u8        (1 byte)                        │
126/// │  flags: u8             (1 byte)                        │
127/// │  txn_start: u64        (8 bytes) MVCC                  │
128/// │  txn_end: u64          (8 bytes) MVCC                  │
129/// ├───────────────────────────────────────────────────────┤
130/// │ Slots: [offset: u16, len: u16] × slot_count           │
131/// ├───────────────────────────────────────────────────────┤
132/// │ Data: contiguous column values                         │
133/// └───────────────────────────────────────────────────────┘
134/// ```
135#[repr(C)]
136pub struct SlotRow {
137    /// Row identifier
138    row_id: u64,
139    /// Null bitmap (bit i = 1 means column i is NULL)
140    null_bitmap: u16,
141    /// Number of columns (slots)
142    slot_count: u8,
143    /// Row flags (deleted, uncommitted, etc.)
144    flags: u8,
145    /// MVCC: Transaction that created this version
146    txn_start: u64,
147    /// MVCC: Transaction that deleted this version (u64::MAX = not deleted)
148    txn_end: u64,
149    /// Raw storage for slots + data
150    /// Layout: [Slot; slot_count][data bytes...]
151    storage: Vec<u8>,
152}
153
154impl SlotRow {
155    /// Create a new slot row with specified column count
156    pub fn new(row_id: u64, slot_count: u8) -> Self {
157        assert!((slot_count as usize) <= MAX_SLOT_COLUMNS);
158        let slots_size = (slot_count as usize) * Slot::SIZE;
159        Self {
160            row_id,
161            null_bitmap: 0,
162            slot_count,
163            flags: SlotRowFlags::Normal as u8,
164            txn_start: 0,
165            txn_end: u64::MAX,
166            storage: vec![0u8; slots_size],
167        }
168    }
169
170    /// Create from raw values
171    pub fn from_values(row_id: u64, values: &[Option<&[u8]>]) -> Self {
172        let slot_count = values.len().min(MAX_SLOT_COLUMNS) as u8;
173        let slots_size = (slot_count as usize) * Slot::SIZE;
174        
175        // Calculate total data size
176        let data_size: usize = values.iter()
177            .map(|v| v.map(|b| b.len()).unwrap_or(0))
178            .sum();
179        
180        let mut storage = vec![0u8; slots_size + data_size];
181        let mut null_bitmap = 0u16;
182        let mut data_offset = 0u16;
183        
184        // Write slots and data
185        for (i, value) in values.iter().enumerate() {
186            let slot = match value {
187                Some(data) => {
188                    let slot = Slot::new(data_offset, data.len() as u16);
189                    // Write data
190                    let data_start = slots_size + data_offset as usize;
191                    storage[data_start..data_start + data.len()].copy_from_slice(data);
192                    data_offset += data.len() as u16;
193                    slot
194                }
195                None => {
196                    null_bitmap |= 1 << i;
197                    Slot::null()
198                }
199            };
200            
201            // Write slot
202            let slot_start = i * Slot::SIZE;
203            storage[slot_start..slot_start + 2].copy_from_slice(&slot.offset.to_le_bytes());
204            storage[slot_start + 2..slot_start + 4].copy_from_slice(&slot.len.to_le_bytes());
205        }
206        
207        Self {
208            row_id,
209            null_bitmap,
210            slot_count,
211            flags: SlotRowFlags::Normal as u8,
212            txn_start: 0,
213            txn_end: u64::MAX,
214            storage,
215        }
216    }
217
218    /// Get row ID
219    #[inline]
220    pub fn row_id(&self) -> u64 {
221        self.row_id
222    }
223
224    /// Get column count
225    #[inline]
226    pub fn column_count(&self) -> usize {
227        self.slot_count as usize
228    }
229
230    /// Check if column is NULL - O(1)
231    #[inline]
232    pub fn is_null(&self, column_idx: usize) -> bool {
233        if column_idx >= self.slot_count as usize {
234            return true;
235        }
236        (self.null_bitmap & (1 << column_idx)) != 0
237    }
238
239    /// Get slot for a column - O(1)
240    #[inline]
241    fn get_slot(&self, column_idx: usize) -> Option<Slot> {
242        if column_idx >= self.slot_count as usize {
243            return None;
244        }
245        let slot_start = column_idx * Slot::SIZE;
246        if slot_start + Slot::SIZE > self.storage.len() {
247            return None;
248        }
249        
250        let offset = u16::from_le_bytes([
251            self.storage[slot_start],
252            self.storage[slot_start + 1],
253        ]);
254        let len = u16::from_le_bytes([
255            self.storage[slot_start + 2],
256            self.storage[slot_start + 3],
257        ]);
258        
259        Some(Slot { offset, len })
260    }
261
262    /// Get column value as bytes - O(1)
263    ///
264    /// This is the key performance advantage: direct offset arithmetic
265    /// instead of HashMap lookup (~50ns → ~2ns)
266    #[inline]
267    pub fn get_bytes(&self, column_idx: usize) -> Option<&[u8]> {
268        if self.is_null(column_idx) {
269            return None;
270        }
271        
272        let slot = self.get_slot(column_idx)?;
273        if slot.is_null() {
274            return None;
275        }
276        
277        let slots_size = (self.slot_count as usize) * Slot::SIZE;
278        let data_start = slots_size + slot.offset as usize;
279        let data_end = data_start + slot.len as usize;
280        
281        if data_end > self.storage.len() {
282            return None;
283        }
284        
285        Some(&self.storage[data_start..data_end])
286    }
287
288    /// Get column as i64 - O(1)
289    #[inline]
290    pub fn get_i64(&self, column_idx: usize) -> Option<i64> {
291        let bytes = self.get_bytes(column_idx)?;
292        if bytes.len() != 8 {
293            return None;
294        }
295        Some(i64::from_le_bytes(bytes.try_into().ok()?))
296    }
297
298    /// Get column as u64 - O(1)
299    #[inline]
300    pub fn get_u64(&self, column_idx: usize) -> Option<u64> {
301        let bytes = self.get_bytes(column_idx)?;
302        if bytes.len() != 8 {
303            return None;
304        }
305        Some(u64::from_le_bytes(bytes.try_into().ok()?))
306    }
307
308    /// Get column as f64 - O(1)
309    #[inline]
310    pub fn get_f64(&self, column_idx: usize) -> Option<f64> {
311        let bytes = self.get_bytes(column_idx)?;
312        if bytes.len() != 8 {
313            return None;
314        }
315        Some(f64::from_le_bytes(bytes.try_into().ok()?))
316    }
317
318    /// Get column as bool - O(1)
319    #[inline]
320    pub fn get_bool(&self, column_idx: usize) -> Option<bool> {
321        let bytes = self.get_bytes(column_idx)?;
322        if bytes.is_empty() {
323            return None;
324        }
325        Some(bytes[0] != 0)
326    }
327
328    /// Get column as string - O(1)
329    #[inline]
330    pub fn get_str(&self, column_idx: usize) -> Option<&str> {
331        let bytes = self.get_bytes(column_idx)?;
332        std::str::from_utf8(bytes).ok()
333    }
334
335    /// Set MVCC timestamps
336    pub fn set_mvcc(&mut self, txn_start: u64, txn_end: u64) {
337        self.txn_start = txn_start;
338        self.txn_end = txn_end;
339    }
340
341    /// Get MVCC start timestamp
342    #[inline]
343    pub fn txn_start(&self) -> u64 {
344        self.txn_start
345    }
346
347    /// Get MVCC end timestamp
348    #[inline]
349    pub fn txn_end(&self) -> u64 {
350        self.txn_end
351    }
352
353    /// Check if visible at snapshot
354    #[inline]
355    pub fn is_visible_at(&self, snapshot_ts: u64) -> bool {
356        self.txn_start < snapshot_ts && snapshot_ts <= self.txn_end
357    }
358
359    /// Set deleted flag
360    pub fn set_deleted(&mut self) {
361        self.flags |= SlotRowFlags::Deleted as u8;
362    }
363
364    /// Check if deleted
365    #[inline]
366    pub fn is_deleted(&self) -> bool {
367        (self.flags & SlotRowFlags::Deleted as u8) != 0
368    }
369
370    /// Get total memory size
371    pub fn memory_size(&self) -> usize {
372        SLOT_ROW_HEADER_SIZE + self.storage.len()
373    }
374
375    /// Serialize to bytes
376    pub fn to_bytes(&self) -> Vec<u8> {
377        let total_size = SLOT_ROW_HEADER_SIZE + self.storage.len();
378        let mut buf = Vec::with_capacity(total_size);
379        
380        buf.extend_from_slice(&self.row_id.to_le_bytes());
381        buf.extend_from_slice(&self.null_bitmap.to_le_bytes());
382        buf.push(self.slot_count);
383        buf.push(self.flags);
384        buf.extend_from_slice(&self.txn_start.to_le_bytes());
385        buf.extend_from_slice(&self.txn_end.to_le_bytes());
386        buf.extend_from_slice(&self.storage);
387        
388        buf
389    }
390
391    /// Deserialize from bytes
392    pub fn from_bytes(data: &[u8]) -> Option<Self> {
393        if data.len() < SLOT_ROW_HEADER_SIZE {
394            return None;
395        }
396        
397        let row_id = u64::from_le_bytes(data[0..8].try_into().ok()?);
398        let null_bitmap = u16::from_le_bytes(data[8..10].try_into().ok()?);
399        let slot_count = data[10];
400        let flags = data[11];
401        let txn_start = u64::from_le_bytes(data[12..20].try_into().ok()?);
402        let txn_end = u64::from_le_bytes(data[20..28].try_into().ok()?);
403        let storage = data[28..].to_vec();
404        
405        Some(Self {
406            row_id,
407            null_bitmap,
408            slot_count,
409            flags,
410            txn_start,
411            txn_end,
412            storage,
413        })
414    }
415}
416
417impl std::fmt::Debug for SlotRow {
418    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
419        f.debug_struct("SlotRow")
420            .field("row_id", &self.row_id)
421            .field("slot_count", &self.slot_count)
422            .field("null_bitmap", &format!("{:016b}", self.null_bitmap))
423            .field("flags", &self.flags)
424            .field("txn_start", &self.txn_start)
425            .field("txn_end", &self.txn_end)
426            .field("storage_len", &self.storage.len())
427            .finish()
428    }
429}
430
431// =============================================================================
432// Arena-Based Slot Row Allocation
433// =============================================================================
434
435/// Arena allocator for SlotRow storage
436///
437/// Allocates rows from contiguous memory blocks to:
438/// - Reduce heap fragmentation
439/// - Improve cache locality
440/// - Enable efficient bulk deallocation
441pub struct SlotRowArena {
442    /// Memory blocks
443    blocks: Vec<Vec<u8>>,
444    /// Current block index
445    current_block: usize,
446    /// Current offset in current block
447    current_offset: usize,
448    /// Block size (default 64KB)
449    block_size: usize,
450    /// Total bytes allocated
451    total_allocated: usize,
452}
453
454impl SlotRowArena {
455    /// Default block size (64KB - L2 cache friendly)
456    pub const DEFAULT_BLOCK_SIZE: usize = 64 * 1024;
457
458    pub fn new() -> Self {
459        Self::with_block_size(Self::DEFAULT_BLOCK_SIZE)
460    }
461
462    pub fn with_block_size(block_size: usize) -> Self {
463        Self {
464            blocks: vec![vec![0u8; block_size]],
465            current_block: 0,
466            current_offset: 0,
467            block_size,
468            total_allocated: 0,
469        }
470    }
471
472    /// Allocate space for a row
473    pub fn allocate(&mut self, size: usize) -> &mut [u8] {
474        if self.current_offset + size > self.block_size {
475            // Need new block
476            self.blocks.push(vec![0u8; self.block_size.max(size)]);
477            self.current_block = self.blocks.len() - 1;
478            self.current_offset = 0;
479        }
480        
481        let start = self.current_offset;
482        self.current_offset += size;
483        self.total_allocated += size;
484        
485        &mut self.blocks[self.current_block][start..start + size]
486    }
487
488    /// Store a SlotRow and return handle
489    pub fn store(&mut self, row: &SlotRow) -> SlotRowHandle {
490        let bytes = row.to_bytes();
491        let slot = self.allocate(bytes.len());
492        slot.copy_from_slice(&bytes);
493        
494        SlotRowHandle {
495            block_idx: self.current_block,
496            offset: self.current_offset - bytes.len(),
497            len: bytes.len(),
498        }
499    }
500
501    /// Get row from handle
502    pub fn get(&self, handle: &SlotRowHandle) -> Option<SlotRow> {
503        let block = self.blocks.get(handle.block_idx)?;
504        let data = block.get(handle.offset..handle.offset + handle.len)?;
505        SlotRow::from_bytes(data)
506    }
507
508    /// Get total allocated bytes
509    pub fn total_allocated(&self) -> usize {
510        self.total_allocated
511    }
512
513    /// Get number of blocks
514    pub fn block_count(&self) -> usize {
515        self.blocks.len()
516    }
517
518    /// Reset arena (keeps allocated memory)
519    pub fn reset(&mut self) {
520        self.current_block = 0;
521        self.current_offset = 0;
522        self.total_allocated = 0;
523    }
524}
525
526impl Default for SlotRowArena {
527    fn default() -> Self {
528        Self::new()
529    }
530}
531
532/// Handle to a SlotRow stored in arena
533#[derive(Debug, Clone, Copy)]
534pub struct SlotRowHandle {
535    block_idx: usize,
536    offset: usize,
537    len: usize,
538}
539
540impl SlotRowHandle {
541    pub fn len(&self) -> usize {
542        self.len
543    }
544
545    pub fn is_empty(&self) -> bool {
546        self.len == 0
547    }
548}
549
550#[cfg(test)]
551mod tests {
552    use super::*;
553
554    #[test]
555    fn test_slot_row_basic() {
556        let row = SlotRow::from_values(1, &[
557            Some(b"hello"),
558            Some(&42i64.to_le_bytes()),
559            None,
560            Some(b"world"),
561        ]);
562        
563        assert_eq!(row.row_id(), 1);
564        assert_eq!(row.column_count(), 4);
565        assert!(!row.is_null(0));
566        assert!(!row.is_null(1));
567        assert!(row.is_null(2));
568        assert!(!row.is_null(3));
569    }
570
571    #[test]
572    fn test_slot_row_get_bytes() {
573        let row = SlotRow::from_values(1, &[
574            Some(b"hello"),
575            Some(b"world"),
576        ]);
577        
578        assert_eq!(row.get_bytes(0), Some(b"hello".as_slice()));
579        assert_eq!(row.get_bytes(1), Some(b"world".as_slice()));
580        assert_eq!(row.get_bytes(2), None);
581    }
582
583    #[test]
584    fn test_slot_row_get_typed() {
585        let row = SlotRow::from_values(1, &[
586            Some(&42i64.to_le_bytes()),
587            Some(&3.14f64.to_le_bytes()),
588            Some(&1u8.to_le_bytes()),
589        ]);
590        
591        assert_eq!(row.get_i64(0), Some(42));
592        assert_eq!(row.get_f64(1), Some(3.14));
593        assert_eq!(row.get_bool(2), Some(true));
594    }
595
596    #[test]
597    fn test_slot_row_get_str() {
598        let row = SlotRow::from_values(1, &[
599            Some(b"hello world"),
600        ]);
601        
602        assert_eq!(row.get_str(0), Some("hello world"));
603    }
604
605    #[test]
606    fn test_slot_row_mvcc() {
607        let mut row = SlotRow::from_values(1, &[Some(b"test")]);
608        row.set_mvcc(100, 200);
609        
610        assert_eq!(row.txn_start(), 100);
611        assert_eq!(row.txn_end(), 200);
612        assert!(row.is_visible_at(150));
613        assert!(!row.is_visible_at(50));
614        assert!(!row.is_visible_at(250));
615    }
616
617    #[test]
618    fn test_slot_row_serialize() {
619        let row = SlotRow::from_values(42, &[
620            Some(b"hello"),
621            Some(&123i64.to_le_bytes()),
622            None,
623        ]);
624        
625        let bytes = row.to_bytes();
626        let restored = SlotRow::from_bytes(&bytes).unwrap();
627        
628        assert_eq!(restored.row_id(), 42);
629        assert_eq!(restored.get_bytes(0), Some(b"hello".as_slice()));
630        assert_eq!(restored.get_i64(1), Some(123));
631        assert!(restored.is_null(2));
632    }
633
634    #[test]
635    fn test_slot_row_memory_size() {
636        let row = SlotRow::from_values(1, &[
637            Some(b"hello"),
638            Some(b"world"),
639        ]);
640        
641        // Header (28) + 2 slots (8) + data (10) = 46 bytes
642        // Much smaller than HashMap equivalent (~150+ bytes)
643        let size = row.memory_size();
644        assert!(size < 100, "SlotRow size {} should be < 100 bytes", size);
645    }
646
647    #[test]
648    fn test_slot_row_arena() {
649        let mut arena = SlotRowArena::new();
650        
651        let row1 = SlotRow::from_values(1, &[Some(b"hello")]);
652        let row2 = SlotRow::from_values(2, &[Some(b"world")]);
653        
654        let h1 = arena.store(&row1);
655        let h2 = arena.store(&row2);
656        
657        let r1 = arena.get(&h1).unwrap();
658        let r2 = arena.get(&h2).unwrap();
659        
660        assert_eq!(r1.row_id(), 1);
661        assert_eq!(r2.row_id(), 2);
662        assert_eq!(r1.get_str(0), Some("hello"));
663        assert_eq!(r2.get_str(0), Some("world"));
664    }
665
666    #[test]
667    fn test_slot_row_arena_many_rows() {
668        // Use small block size to force multiple blocks
669        let mut arena = SlotRowArena::with_block_size(1024);
670        let mut handles = Vec::new();
671        
672        for i in 0..1000 {
673            let row = SlotRow::from_values(i, &[
674                Some(&i.to_le_bytes()),
675                Some(format!("row_{}", i).as_bytes()),
676            ]);
677            handles.push(arena.store(&row));
678        }
679        
680        // Verify all rows
681        for (i, handle) in handles.iter().enumerate() {
682            let row = arena.get(handle).unwrap();
683            assert_eq!(row.row_id(), i as u64);
684            assert_eq!(row.get_u64(0), Some(i as u64));
685        }
686        
687        // Should use multiple blocks (1000 rows * ~50 bytes = ~50KB > 1024)
688        assert!(arena.block_count() > 1);
689    }
690}