sochdb_storage/
row_format.rs

1// Copyright 2025 Sushanth (https://github.com/sushanthpy)
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Slot-Based Columnar Row Storage (PAX/Hybrid Format) - Recommendation 1
16//!
17//! ## Problem
18//!
19//! Current implementation stores each row as HashMap<String, SochValue>, which incurs:
20//! - 48 bytes minimum HashMap overhead per row
21//! - 24 bytes per String key (pointer + length + capacity)
22//! - 16-32 bytes per SochValue (enum tag + union)
23//! - Heap fragmentation from individual allocations
24//!
25//! For a 4-column row: 48 + 4×(24 + 24) = 240 bytes vs SQLite's ~50 bytes
26//!
27//! ## Solution
28//!
29//! Slot-based row format: fixed header + variable payload
30//! - Row ID (8 bytes)
31//! - Null bitmap (2 bytes for 16 columns)
32//! - Slot count (1 byte)
33//! - Flags (1 byte for deleted, MVCC markers)
34//! - MVCC timestamps (16 bytes)
35//! - Slot array: [offset: u16, len: u16] per column
36//! - Data follows slots contiguously
37//!
38//! ## Performance Analysis
39//!
40//! | Component    | Current   | Proposed       | Savings |
41//! |--------------|-----------|----------------|---------|
42//! | Header       | 48 (HashMap)| 28 (SlotRow)  | 42%     |
43//! | 4 String keys| 96        | 0 (schema-indexed)| 100% |
44//! | 4 SochValues | 128       | 32 (raw bytes) | 75%     |
45//! | Total        | 272 bytes | 60 bytes       | 78%     |
46//!
47//! Cache analysis: With 64KB pages and 60-byte rows, we get 1,092 rows per page
48//! vs current ~240 rows per page with HashMap approach.
49
50/// Minimum slot row header size (without slots)
51pub const SLOT_ROW_HEADER_SIZE: usize = 28;
52
53/// Maximum columns supported by null bitmap (u16)
54pub const MAX_SLOT_COLUMNS: usize = 16;
55
56/// Slot row flags
57#[derive(Debug, Clone, Copy, PartialEq, Eq)]
58#[repr(u8)]
59pub enum SlotRowFlags {
60    /// Normal row
61    Normal = 0,
62    /// Deleted (tombstone)
63    Deleted = 1,
64    /// Uncommitted (MVCC)
65    Uncommitted = 2,
66    /// Compressed
67    Compressed = 4,
68}
69
70impl SlotRowFlags {
71    pub fn is_deleted(&self) -> bool {
72        (*self as u8) & 1 != 0
73    }
74
75    pub fn is_uncommitted(&self) -> bool {
76        (*self as u8) & 2 != 0
77    }
78
79    pub fn is_compressed(&self) -> bool {
80        (*self as u8) & 4 != 0
81    }
82}
83
84/// Slot entry: offset and length for a column value
85#[derive(Debug, Clone, Copy)]
86#[repr(C, packed)]
87pub struct Slot {
88    /// Offset from data start (u16 allows 64KB max row size)
89    pub offset: u16,
90    /// Length of value in bytes
91    pub len: u16,
92}
93
94impl Slot {
95    pub const SIZE: usize = 4;
96
97    #[inline]
98    pub fn new(offset: u16, len: u16) -> Self {
99        Self { offset, len }
100    }
101
102    #[inline]
103    pub fn is_null(&self) -> bool {
104        self.len == 0 && self.offset == 0xFFFF
105    }
106
107    #[inline]
108    pub fn null() -> Self {
109        Self { offset: 0xFFFF, len: 0 }
110    }
111}
112
113/// Slot-based row format with O(1) field access
114///
115/// ## Memory Layout
116///
117/// ```text
118/// ┌───────────────────────────────────────────────────────┐
119/// │ Header (28 bytes)                                      │
120/// │  row_id: u64           (8 bytes)                       │
121/// │  null_bitmap: u16      (2 bytes)                       │
122/// │  slot_count: u8        (1 byte)                        │
123/// │  flags: u8             (1 byte)                        │
124/// │  txn_start: u64        (8 bytes) MVCC                  │
125/// │  txn_end: u64          (8 bytes) MVCC                  │
126/// ├───────────────────────────────────────────────────────┤
127/// │ Slots: [offset: u16, len: u16] × slot_count           │
128/// ├───────────────────────────────────────────────────────┤
129/// │ Data: contiguous column values                         │
130/// └───────────────────────────────────────────────────────┘
131/// ```
132#[repr(C)]
133pub struct SlotRow {
134    /// Row identifier
135    row_id: u64,
136    /// Null bitmap (bit i = 1 means column i is NULL)
137    null_bitmap: u16,
138    /// Number of columns (slots)
139    slot_count: u8,
140    /// Row flags (deleted, uncommitted, etc.)
141    flags: u8,
142    /// MVCC: Transaction that created this version
143    txn_start: u64,
144    /// MVCC: Transaction that deleted this version (u64::MAX = not deleted)
145    txn_end: u64,
146    /// Raw storage for slots + data
147    /// Layout: [Slot; slot_count][data bytes...]
148    storage: Vec<u8>,
149}
150
151impl SlotRow {
152    /// Create a new slot row with specified column count
153    pub fn new(row_id: u64, slot_count: u8) -> Self {
154        assert!((slot_count as usize) <= MAX_SLOT_COLUMNS);
155        let slots_size = (slot_count as usize) * Slot::SIZE;
156        Self {
157            row_id,
158            null_bitmap: 0,
159            slot_count,
160            flags: SlotRowFlags::Normal as u8,
161            txn_start: 0,
162            txn_end: u64::MAX,
163            storage: vec![0u8; slots_size],
164        }
165    }
166
167    /// Create from raw values
168    pub fn from_values(row_id: u64, values: &[Option<&[u8]>]) -> Self {
169        let slot_count = values.len().min(MAX_SLOT_COLUMNS) as u8;
170        let slots_size = (slot_count as usize) * Slot::SIZE;
171        
172        // Calculate total data size
173        let data_size: usize = values.iter()
174            .map(|v| v.map(|b| b.len()).unwrap_or(0))
175            .sum();
176        
177        let mut storage = vec![0u8; slots_size + data_size];
178        let mut null_bitmap = 0u16;
179        let mut data_offset = 0u16;
180        
181        // Write slots and data
182        for (i, value) in values.iter().enumerate() {
183            let slot = match value {
184                Some(data) => {
185                    let slot = Slot::new(data_offset, data.len() as u16);
186                    // Write data
187                    let data_start = slots_size + data_offset as usize;
188                    storage[data_start..data_start + data.len()].copy_from_slice(data);
189                    data_offset += data.len() as u16;
190                    slot
191                }
192                None => {
193                    null_bitmap |= 1 << i;
194                    Slot::null()
195                }
196            };
197            
198            // Write slot
199            let slot_start = i * Slot::SIZE;
200            storage[slot_start..slot_start + 2].copy_from_slice(&slot.offset.to_le_bytes());
201            storage[slot_start + 2..slot_start + 4].copy_from_slice(&slot.len.to_le_bytes());
202        }
203        
204        Self {
205            row_id,
206            null_bitmap,
207            slot_count,
208            flags: SlotRowFlags::Normal as u8,
209            txn_start: 0,
210            txn_end: u64::MAX,
211            storage,
212        }
213    }
214
215    /// Get row ID
216    #[inline]
217    pub fn row_id(&self) -> u64 {
218        self.row_id
219    }
220
221    /// Get column count
222    #[inline]
223    pub fn column_count(&self) -> usize {
224        self.slot_count as usize
225    }
226
227    /// Check if column is NULL - O(1)
228    #[inline]
229    pub fn is_null(&self, column_idx: usize) -> bool {
230        if column_idx >= self.slot_count as usize {
231            return true;
232        }
233        (self.null_bitmap & (1 << column_idx)) != 0
234    }
235
236    /// Get slot for a column - O(1)
237    #[inline]
238    fn get_slot(&self, column_idx: usize) -> Option<Slot> {
239        if column_idx >= self.slot_count as usize {
240            return None;
241        }
242        let slot_start = column_idx * Slot::SIZE;
243        if slot_start + Slot::SIZE > self.storage.len() {
244            return None;
245        }
246        
247        let offset = u16::from_le_bytes([
248            self.storage[slot_start],
249            self.storage[slot_start + 1],
250        ]);
251        let len = u16::from_le_bytes([
252            self.storage[slot_start + 2],
253            self.storage[slot_start + 3],
254        ]);
255        
256        Some(Slot { offset, len })
257    }
258
259    /// Get column value as bytes - O(1)
260    ///
261    /// This is the key performance advantage: direct offset arithmetic
262    /// instead of HashMap lookup (~50ns → ~2ns)
263    #[inline]
264    pub fn get_bytes(&self, column_idx: usize) -> Option<&[u8]> {
265        if self.is_null(column_idx) {
266            return None;
267        }
268        
269        let slot = self.get_slot(column_idx)?;
270        if slot.is_null() {
271            return None;
272        }
273        
274        let slots_size = (self.slot_count as usize) * Slot::SIZE;
275        let data_start = slots_size + slot.offset as usize;
276        let data_end = data_start + slot.len as usize;
277        
278        if data_end > self.storage.len() {
279            return None;
280        }
281        
282        Some(&self.storage[data_start..data_end])
283    }
284
285    /// Get column as i64 - O(1)
286    #[inline]
287    pub fn get_i64(&self, column_idx: usize) -> Option<i64> {
288        let bytes = self.get_bytes(column_idx)?;
289        if bytes.len() != 8 {
290            return None;
291        }
292        Some(i64::from_le_bytes(bytes.try_into().ok()?))
293    }
294
295    /// Get column as u64 - O(1)
296    #[inline]
297    pub fn get_u64(&self, column_idx: usize) -> Option<u64> {
298        let bytes = self.get_bytes(column_idx)?;
299        if bytes.len() != 8 {
300            return None;
301        }
302        Some(u64::from_le_bytes(bytes.try_into().ok()?))
303    }
304
305    /// Get column as f64 - O(1)
306    #[inline]
307    pub fn get_f64(&self, column_idx: usize) -> Option<f64> {
308        let bytes = self.get_bytes(column_idx)?;
309        if bytes.len() != 8 {
310            return None;
311        }
312        Some(f64::from_le_bytes(bytes.try_into().ok()?))
313    }
314
315    /// Get column as bool - O(1)
316    #[inline]
317    pub fn get_bool(&self, column_idx: usize) -> Option<bool> {
318        let bytes = self.get_bytes(column_idx)?;
319        if bytes.is_empty() {
320            return None;
321        }
322        Some(bytes[0] != 0)
323    }
324
325    /// Get column as string - O(1)
326    #[inline]
327    pub fn get_str(&self, column_idx: usize) -> Option<&str> {
328        let bytes = self.get_bytes(column_idx)?;
329        std::str::from_utf8(bytes).ok()
330    }
331
332    /// Set MVCC timestamps
333    pub fn set_mvcc(&mut self, txn_start: u64, txn_end: u64) {
334        self.txn_start = txn_start;
335        self.txn_end = txn_end;
336    }
337
338    /// Get MVCC start timestamp
339    #[inline]
340    pub fn txn_start(&self) -> u64 {
341        self.txn_start
342    }
343
344    /// Get MVCC end timestamp
345    #[inline]
346    pub fn txn_end(&self) -> u64 {
347        self.txn_end
348    }
349
350    /// Check if visible at snapshot
351    #[inline]
352    pub fn is_visible_at(&self, snapshot_ts: u64) -> bool {
353        self.txn_start < snapshot_ts && snapshot_ts <= self.txn_end
354    }
355
356    /// Set deleted flag
357    pub fn set_deleted(&mut self) {
358        self.flags |= SlotRowFlags::Deleted as u8;
359    }
360
361    /// Check if deleted
362    #[inline]
363    pub fn is_deleted(&self) -> bool {
364        (self.flags & SlotRowFlags::Deleted as u8) != 0
365    }
366
367    /// Get total memory size
368    pub fn memory_size(&self) -> usize {
369        SLOT_ROW_HEADER_SIZE + self.storage.len()
370    }
371
372    /// Serialize to bytes
373    pub fn to_bytes(&self) -> Vec<u8> {
374        let total_size = SLOT_ROW_HEADER_SIZE + self.storage.len();
375        let mut buf = Vec::with_capacity(total_size);
376        
377        buf.extend_from_slice(&self.row_id.to_le_bytes());
378        buf.extend_from_slice(&self.null_bitmap.to_le_bytes());
379        buf.push(self.slot_count);
380        buf.push(self.flags);
381        buf.extend_from_slice(&self.txn_start.to_le_bytes());
382        buf.extend_from_slice(&self.txn_end.to_le_bytes());
383        buf.extend_from_slice(&self.storage);
384        
385        buf
386    }
387
388    /// Deserialize from bytes
389    pub fn from_bytes(data: &[u8]) -> Option<Self> {
390        if data.len() < SLOT_ROW_HEADER_SIZE {
391            return None;
392        }
393        
394        let row_id = u64::from_le_bytes(data[0..8].try_into().ok()?);
395        let null_bitmap = u16::from_le_bytes(data[8..10].try_into().ok()?);
396        let slot_count = data[10];
397        let flags = data[11];
398        let txn_start = u64::from_le_bytes(data[12..20].try_into().ok()?);
399        let txn_end = u64::from_le_bytes(data[20..28].try_into().ok()?);
400        let storage = data[28..].to_vec();
401        
402        Some(Self {
403            row_id,
404            null_bitmap,
405            slot_count,
406            flags,
407            txn_start,
408            txn_end,
409            storage,
410        })
411    }
412}
413
414impl std::fmt::Debug for SlotRow {
415    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
416        f.debug_struct("SlotRow")
417            .field("row_id", &self.row_id)
418            .field("slot_count", &self.slot_count)
419            .field("null_bitmap", &format!("{:016b}", self.null_bitmap))
420            .field("flags", &self.flags)
421            .field("txn_start", &self.txn_start)
422            .field("txn_end", &self.txn_end)
423            .field("storage_len", &self.storage.len())
424            .finish()
425    }
426}
427
428// =============================================================================
429// Arena-Based Slot Row Allocation
430// =============================================================================
431
432/// Arena allocator for SlotRow storage
433///
434/// Allocates rows from contiguous memory blocks to:
435/// - Reduce heap fragmentation
436/// - Improve cache locality
437/// - Enable efficient bulk deallocation
438pub struct SlotRowArena {
439    /// Memory blocks
440    blocks: Vec<Vec<u8>>,
441    /// Current block index
442    current_block: usize,
443    /// Current offset in current block
444    current_offset: usize,
445    /// Block size (default 64KB)
446    block_size: usize,
447    /// Total bytes allocated
448    total_allocated: usize,
449}
450
451impl SlotRowArena {
452    /// Default block size (64KB - L2 cache friendly)
453    pub const DEFAULT_BLOCK_SIZE: usize = 64 * 1024;
454
455    pub fn new() -> Self {
456        Self::with_block_size(Self::DEFAULT_BLOCK_SIZE)
457    }
458
459    pub fn with_block_size(block_size: usize) -> Self {
460        Self {
461            blocks: vec![vec![0u8; block_size]],
462            current_block: 0,
463            current_offset: 0,
464            block_size,
465            total_allocated: 0,
466        }
467    }
468
469    /// Allocate space for a row
470    pub fn allocate(&mut self, size: usize) -> &mut [u8] {
471        if self.current_offset + size > self.block_size {
472            // Need new block
473            self.blocks.push(vec![0u8; self.block_size.max(size)]);
474            self.current_block = self.blocks.len() - 1;
475            self.current_offset = 0;
476        }
477        
478        let start = self.current_offset;
479        self.current_offset += size;
480        self.total_allocated += size;
481        
482        &mut self.blocks[self.current_block][start..start + size]
483    }
484
485    /// Store a SlotRow and return handle
486    pub fn store(&mut self, row: &SlotRow) -> SlotRowHandle {
487        let bytes = row.to_bytes();
488        let slot = self.allocate(bytes.len());
489        slot.copy_from_slice(&bytes);
490        
491        SlotRowHandle {
492            block_idx: self.current_block,
493            offset: self.current_offset - bytes.len(),
494            len: bytes.len(),
495        }
496    }
497
498    /// Get row from handle
499    pub fn get(&self, handle: &SlotRowHandle) -> Option<SlotRow> {
500        let block = self.blocks.get(handle.block_idx)?;
501        let data = block.get(handle.offset..handle.offset + handle.len)?;
502        SlotRow::from_bytes(data)
503    }
504
505    /// Get total allocated bytes
506    pub fn total_allocated(&self) -> usize {
507        self.total_allocated
508    }
509
510    /// Get number of blocks
511    pub fn block_count(&self) -> usize {
512        self.blocks.len()
513    }
514
515    /// Reset arena (keeps allocated memory)
516    pub fn reset(&mut self) {
517        self.current_block = 0;
518        self.current_offset = 0;
519        self.total_allocated = 0;
520    }
521}
522
523impl Default for SlotRowArena {
524    fn default() -> Self {
525        Self::new()
526    }
527}
528
529/// Handle to a SlotRow stored in arena
530#[derive(Debug, Clone, Copy)]
531pub struct SlotRowHandle {
532    block_idx: usize,
533    offset: usize,
534    len: usize,
535}
536
537impl SlotRowHandle {
538    pub fn len(&self) -> usize {
539        self.len
540    }
541
542    pub fn is_empty(&self) -> bool {
543        self.len == 0
544    }
545}
546
547#[cfg(test)]
548mod tests {
549    use super::*;
550
551    #[test]
552    fn test_slot_row_basic() {
553        let row = SlotRow::from_values(1, &[
554            Some(b"hello"),
555            Some(&42i64.to_le_bytes()),
556            None,
557            Some(b"world"),
558        ]);
559        
560        assert_eq!(row.row_id(), 1);
561        assert_eq!(row.column_count(), 4);
562        assert!(!row.is_null(0));
563        assert!(!row.is_null(1));
564        assert!(row.is_null(2));
565        assert!(!row.is_null(3));
566    }
567
568    #[test]
569    fn test_slot_row_get_bytes() {
570        let row = SlotRow::from_values(1, &[
571            Some(b"hello"),
572            Some(b"world"),
573        ]);
574        
575        assert_eq!(row.get_bytes(0), Some(b"hello".as_slice()));
576        assert_eq!(row.get_bytes(1), Some(b"world".as_slice()));
577        assert_eq!(row.get_bytes(2), None);
578    }
579
580    #[test]
581    fn test_slot_row_get_typed() {
582        let row = SlotRow::from_values(1, &[
583            Some(&42i64.to_le_bytes()),
584            Some(&3.14f64.to_le_bytes()),
585            Some(&1u8.to_le_bytes()),
586        ]);
587        
588        assert_eq!(row.get_i64(0), Some(42));
589        assert_eq!(row.get_f64(1), Some(3.14));
590        assert_eq!(row.get_bool(2), Some(true));
591    }
592
593    #[test]
594    fn test_slot_row_get_str() {
595        let row = SlotRow::from_values(1, &[
596            Some(b"hello world"),
597        ]);
598        
599        assert_eq!(row.get_str(0), Some("hello world"));
600    }
601
602    #[test]
603    fn test_slot_row_mvcc() {
604        let mut row = SlotRow::from_values(1, &[Some(b"test")]);
605        row.set_mvcc(100, 200);
606        
607        assert_eq!(row.txn_start(), 100);
608        assert_eq!(row.txn_end(), 200);
609        assert!(row.is_visible_at(150));
610        assert!(!row.is_visible_at(50));
611        assert!(!row.is_visible_at(250));
612    }
613
614    #[test]
615    fn test_slot_row_serialize() {
616        let row = SlotRow::from_values(42, &[
617            Some(b"hello"),
618            Some(&123i64.to_le_bytes()),
619            None,
620        ]);
621        
622        let bytes = row.to_bytes();
623        let restored = SlotRow::from_bytes(&bytes).unwrap();
624        
625        assert_eq!(restored.row_id(), 42);
626        assert_eq!(restored.get_bytes(0), Some(b"hello".as_slice()));
627        assert_eq!(restored.get_i64(1), Some(123));
628        assert!(restored.is_null(2));
629    }
630
631    #[test]
632    fn test_slot_row_memory_size() {
633        let row = SlotRow::from_values(1, &[
634            Some(b"hello"),
635            Some(b"world"),
636        ]);
637        
638        // Header (28) + 2 slots (8) + data (10) = 46 bytes
639        // Much smaller than HashMap equivalent (~150+ bytes)
640        let size = row.memory_size();
641        assert!(size < 100, "SlotRow size {} should be < 100 bytes", size);
642    }
643
644    #[test]
645    fn test_slot_row_arena() {
646        let mut arena = SlotRowArena::new();
647        
648        let row1 = SlotRow::from_values(1, &[Some(b"hello")]);
649        let row2 = SlotRow::from_values(2, &[Some(b"world")]);
650        
651        let h1 = arena.store(&row1);
652        let h2 = arena.store(&row2);
653        
654        let r1 = arena.get(&h1).unwrap();
655        let r2 = arena.get(&h2).unwrap();
656        
657        assert_eq!(r1.row_id(), 1);
658        assert_eq!(r2.row_id(), 2);
659        assert_eq!(r1.get_str(0), Some("hello"));
660        assert_eq!(r2.get_str(0), Some("world"));
661    }
662
663    #[test]
664    fn test_slot_row_arena_many_rows() {
665        // Use small block size to force multiple blocks
666        let mut arena = SlotRowArena::with_block_size(1024);
667        let mut handles = Vec::new();
668        
669        for i in 0..1000 {
670            let row = SlotRow::from_values(i, &[
671                Some(&i.to_le_bytes()),
672                Some(format!("row_{}", i).as_bytes()),
673            ]);
674            handles.push(arena.store(&row));
675        }
676        
677        // Verify all rows
678        for (i, handle) in handles.iter().enumerate() {
679            let row = arena.get(handle).unwrap();
680            assert_eq!(row.row_id(), i as u64);
681            assert_eq!(row.get_u64(0), Some(i as u64));
682        }
683        
684        // Should use multiple blocks (1000 rows * ~50 bytes = ~50KB > 1024)
685        assert!(arena.block_count() > 1);
686    }
687}