// hermes_core/structures/sstable_index.rs

//! Memory-efficient SSTable index structures
//!
//! This module provides two approaches to memory-efficient block indexing:
//!
//! ## Option 1: FST-based Index (`native` feature)
//! Uses a Finite State Transducer to map keys to block ordinals. The FST can be
//! mmap'd directly without parsing into heap-allocated structures.
//!
//! ## Option 2: Mmap'd Raw Index
//! Keeps the prefix-compressed block index as raw bytes and decodes entries
//! on demand during binary search. No heap allocation for the index.
//!
//! Both approaches use a compact `BlockAddrStore` with bitpacked offsets/lengths.
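//!
//! # Example
//!
//! A minimal usage sketch (illustrative; the `use` paths assume these types
//! are re-exported as shown and may need adjusting):
//!
//! ```ignore
//! use hermes_core::directories::OwnedBytes;
//! use hermes_core::structures::sstable_index::{BlockAddr, MmapBlockIndex};
//!
//! let entries = vec![
//!     (b"aaa".to_vec(), BlockAddr { offset: 0, length: 100 }),
//!     (b"bbb".to_vec(), BlockAddr { offset: 100, length: 150 }),
//! ];
//! let bytes = MmapBlockIndex::build(&entries)?;
//! let index = MmapBlockIndex::load(OwnedBytes::new(bytes))?;
//! assert_eq!(index.locate(b"abc"), Some(0)); // "abc" falls inside block 0
//! ```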

use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use std::io::{self, Write};
use std::ops::Range;

use crate::directories::OwnedBytes;

/// Block address - offset and length in the data section
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct BlockAddr {
    pub offset: u64,
    pub length: u32,
}

impl BlockAddr {
    pub fn byte_range(&self) -> Range<u64> {
        self.offset..self.offset + self.length as u64
    }
}

/// Compact storage for block addresses using delta encoding + bitpacking
///
/// Memory layout:
/// - Header: num_blocks (u32) + offset_bits (u8) + length_bits (u8)
/// - Bitpacked data: offset deltas and lengths interleaved
///
/// Offsets are delta-encoded against the end of the previous block (blocks
/// are sequential), and lengths are stored directly (typically similar sizes).
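///
/// Worked example (illustrative): four back-to-back blocks with lengths
/// 1000 / 1500 / 800 / 2000 all have an offset delta of 0, so `offset_bits = 1`;
/// the largest length is 2000 < 2^11, so `length_bits = 11`. Each entry then
/// costs 12 bits instead of the 96 bits of a raw `(u64, u32)` pair.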
#[derive(Debug)]
pub struct BlockAddrStore {
    num_blocks: u32,
    offset_bits: u8,
    length_bits: u8,
    /// Eagerly decoded addresses for O(1) random access
    addrs: Vec<BlockAddr>,
}

impl BlockAddrStore {
    /// Build from a list of block addresses
    pub fn build(addrs: &[BlockAddr]) -> io::Result<Vec<u8>> {
        if addrs.is_empty() {
            let mut buf = Vec::with_capacity(6);
            buf.write_u32::<LittleEndian>(0)?;
            buf.write_u8(0)?;
            buf.write_u8(0)?;
            return Ok(buf);
        }

        // Compute delta offsets and find max values for bit widths
        let mut deltas = Vec::with_capacity(addrs.len());
        let mut prev_end: u64 = 0;
        let mut max_delta: u64 = 0;
        let mut max_length: u32 = 0;

        for addr in addrs {
            // Delta from the end of the previous block (handles gaps)
            let delta = addr.offset.saturating_sub(prev_end);
            deltas.push(delta);
            max_delta = max_delta.max(delta);
            max_length = max_length.max(addr.length);
            prev_end = addr.offset + addr.length as u64;
        }

        // Compute bit widths
        let offset_bits = if max_delta == 0 {
            1
        } else {
            (64 - max_delta.leading_zeros()) as u8
        };
        let length_bits = if max_length == 0 {
            1
        } else {
            (32 - max_length.leading_zeros()) as u8
        };

        // Calculate packed size
        let bits_per_entry = offset_bits as usize + length_bits as usize;
        let total_bits = bits_per_entry * addrs.len();
        let packed_bytes = total_bits.div_ceil(8);

        let mut buf = Vec::with_capacity(6 + packed_bytes);
        buf.write_u32::<LittleEndian>(addrs.len() as u32)?;
        buf.write_u8(offset_bits)?;
        buf.write_u8(length_bits)?;

        // Bitpack the data
        let mut bit_writer = BitWriter::new(&mut buf);
        for (i, addr) in addrs.iter().enumerate() {
            bit_writer.write(deltas[i], offset_bits)?;
            bit_writer.write(addr.length as u64, length_bits)?;
        }
        bit_writer.flush()?;

        Ok(buf)
    }

    /// Load from raw bytes — eagerly decodes all addresses for O(1) access
    pub fn load(data: OwnedBytes) -> io::Result<Self> {
        if data.len() < 6 {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "BlockAddrStore data too short",
            ));
        }

        let mut reader = data.as_slice();
        let num_blocks = reader.read_u32::<LittleEndian>()?;
        let offset_bits = reader.read_u8()?;
        let length_bits = reader.read_u8()?;

        // Eagerly decode all block addresses once at load time. Propagate
        // read errors instead of silently truncating, which would make
        // `len()` disagree with the number of decoded addresses.
        let packed_data = &data.as_slice()[6..];
        let mut bit_reader = BitReader::new(packed_data);
        let mut addrs = Vec::with_capacity(num_blocks as usize);
        let mut current_offset: u64 = 0;

        for _ in 0..num_blocks {
            let delta = bit_reader.read(offset_bits)?;
            let length = bit_reader.read(length_bits)?;
            current_offset += delta;
            addrs.push(BlockAddr {
                offset: current_offset,
                length: length as u32,
            });
            current_offset += length;
        }

        Ok(Self {
            num_blocks,
            offset_bits,
            length_bits,
            addrs,
        })
    }

    /// Number of blocks
    pub fn len(&self) -> usize {
        self.num_blocks as usize
    }

    /// Check if empty
    pub fn is_empty(&self) -> bool {
        self.num_blocks == 0
    }

    /// Get block address by index — O(1) from eagerly decoded array
    #[inline]
    pub fn get(&self, idx: usize) -> Option<BlockAddr> {
        self.addrs.get(idx).copied()
    }

    /// Get all block addresses
    pub fn all(&self) -> Vec<BlockAddr> {
        self.addrs.clone()
    }
}

/// FST-based block index (Option 1)
///
/// Maps keys to block ordinals using an FST. The FST bytes can be mmap'd
/// directly without any parsing or heap allocation.
#[cfg(feature = "native")]
pub struct FstBlockIndex {
    fst: fst::Map<OwnedBytes>,
    block_addrs: BlockAddrStore,
}

#[cfg(feature = "native")]
impl FstBlockIndex {
    /// Build FST index from keys and block addresses.
    /// Entries must be sorted by key and unique; the FST builder rejects
    /// out-of-order insertions.
    pub fn build(entries: &[(Vec<u8>, BlockAddr)]) -> io::Result<Vec<u8>> {
        use fst::MapBuilder;

        // Build FST mapping keys to block ordinals
        let mut fst_builder = MapBuilder::memory();
        for (i, (key, _)) in entries.iter().enumerate() {
            fst_builder
                .insert(key, i as u64)
                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
        }
        let fst_bytes = fst_builder
            .into_inner()
            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;

        // Build block address store
        let addrs: Vec<BlockAddr> = entries.iter().map(|(_, addr)| *addr).collect();
        let addr_bytes = BlockAddrStore::build(&addrs)?;

        // Combine: fst_len (u32) + fst_bytes + addr_bytes
        let mut result = Vec::with_capacity(4 + fst_bytes.len() + addr_bytes.len());
        result.write_u32::<LittleEndian>(fst_bytes.len() as u32)?;
        result.extend_from_slice(&fst_bytes);
        result.extend_from_slice(&addr_bytes);

        Ok(result)
    }

    /// Load from raw bytes
    pub fn load(data: OwnedBytes) -> io::Result<Self> {
        if data.len() < 4 {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "FstBlockIndex data too short",
            ));
        }

        let fst_len = u32::from_le_bytes([data[0], data[1], data[2], data[3]]) as usize;

        if data.len() < 4 + fst_len {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "FstBlockIndex FST data truncated",
            ));
        }

        let fst_data = data.slice(4..4 + fst_len);
        let addr_data = data.slice(4 + fst_len..data.len());

        let fst =
            fst::Map::new(fst_data).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
        let block_addrs = BlockAddrStore::load(addr_data)?;

        Ok(Self { fst, block_addrs })
    }

    /// Look up the block index for a key.
    /// Returns the block ordinal that could contain this key.
    /// O(key_len) via FST exact lookup, plus a single stream step on a miss.
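    ///
    /// # Example (illustrative)
    ///
    /// ```ignore
    /// // With blocks whose first keys are "aaa", "bbb", "ccc":
    /// assert_eq!(index.locate(b"bbb"), Some(1)); // exact match on a first key
    /// assert_eq!(index.locate(b"aab"), Some(0)); // falls inside block 0
    /// assert_eq!(index.locate(b"ddd"), Some(2)); // past all keys: last block
    /// ```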
    pub fn locate(&self, key: &[u8]) -> Option<usize> {
        // Fast exact match — O(key_len), no stream allocation
        if let Some(ordinal) = self.fst.get(key) {
            return Some(ordinal as usize);
        }

        // Find the first block whose first_key > target (single stream step)
        use fst::{IntoStreamer, Streamer};
        let mut stream = self.fst.range().gt(key).into_stream();
        match stream.next() {
            Some((_, ordinal)) if ordinal > 0 => Some(ordinal as usize - 1),
            Some(_) => None, // key < first block's first key
            None => {
                // No key > target → target is after all keys; use the last block
                let len = self.fst.len();
                if len > 0 { Some(len - 1) } else { None }
            }
        }
    }

    /// Get block address by ordinal
    pub fn get_addr(&self, ordinal: usize) -> Option<BlockAddr> {
        self.block_addrs.get(ordinal)
    }

    /// Number of blocks
    pub fn len(&self) -> usize {
        self.block_addrs.len()
    }

    /// Check if empty
    pub fn is_empty(&self) -> bool {
        self.block_addrs.is_empty()
    }

    /// Get all block addresses
    pub fn all_addrs(&self) -> Vec<BlockAddr> {
        self.block_addrs.all()
    }
}

/// Mmap'd raw block index (Option 2)
///
/// Keeps the prefix-compressed block index as raw bytes and decodes
/// entries on demand. Uses restart points every R entries for O(log N)
/// lookup via binary search instead of an O(N) linear scan.
pub struct MmapBlockIndex {
    data: OwnedBytes,
    num_blocks: u32,
    block_addrs: BlockAddrStore,
    /// Offset where the prefix-compressed keys start
    keys_offset: usize,
    /// Offset where the keys section ends (restart array begins)
    keys_end: usize,
    /// Byte offset in data where the restart offsets array starts
    restart_array_offset: usize,
    /// Number of restart points
    restart_count: usize,
    /// Restart interval (R) — a restart point every R entries
    restart_interval: usize,
}

/// Restart interval: store a full (uncompressed) key every R entries
const RESTART_INTERVAL: usize = 16;

impl MmapBlockIndex {
    /// Build mmap-friendly index from entries.
    ///
    /// Format: `num_blocks (u32) | BlockAddrStore | prefix-compressed keys
    /// (with restart points) | restart_offsets[..] | restart_count (u32) | restart_interval (u16)`
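    ///
    /// Worked example (illustrative): the keys `apple`, `applet`, `apply`
    /// encode as `(0, 5, "apple")`, `(5, 1, "t")`, `(4, 1, "y")`, where each
    /// entry is `(shared_prefix_len, suffix_len, suffix)` with varint lengths.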
    pub fn build(entries: &[(Vec<u8>, BlockAddr)]) -> io::Result<Vec<u8>> {
        if entries.is_empty() {
            let mut buf = Vec::with_capacity(16);
            buf.write_u32::<LittleEndian>(0)?; // num_blocks
            buf.extend_from_slice(&BlockAddrStore::build(&[])?);
            // Empty restart array + footer
            buf.write_u32::<LittleEndian>(0)?; // restart_count
            buf.write_u16::<LittleEndian>(RESTART_INTERVAL as u16)?;
            return Ok(buf);
        }

        // Build block address store
        let addrs: Vec<BlockAddr> = entries.iter().map(|(_, addr)| *addr).collect();
        let addr_bytes = BlockAddrStore::build(&addrs)?;

        // Build prefix-compressed keys with restart points
        let mut keys_buf = Vec::new();
        let mut prev_key: Vec<u8> = Vec::new();
        let mut restart_offsets: Vec<u32> = Vec::new();

        for (i, (key, _)) in entries.iter().enumerate() {
            let is_restart = i % RESTART_INTERVAL == 0;

            if is_restart {
                restart_offsets.push(keys_buf.len() as u32);
                // Store the full key (no prefix compression)
                write_vint(&mut keys_buf, 0)?;
                write_vint(&mut keys_buf, key.len() as u64)?;
                keys_buf.extend_from_slice(key);
            } else {
                let prefix_len = common_prefix_len(&prev_key, key);
                let suffix = &key[prefix_len..];
                write_vint(&mut keys_buf, prefix_len as u64)?;
                write_vint(&mut keys_buf, suffix.len() as u64)?;
                keys_buf.extend_from_slice(suffix);
            }

            prev_key.clear();
            prev_key.extend_from_slice(key);
        }

        // Combine: num_blocks + addr_bytes + keys + restart_offsets + footer
        let restart_count = restart_offsets.len();
        let mut result =
            Vec::with_capacity(4 + addr_bytes.len() + keys_buf.len() + restart_count * 4 + 6);
        result.write_u32::<LittleEndian>(entries.len() as u32)?;
        result.extend_from_slice(&addr_bytes);
        result.extend_from_slice(&keys_buf);

        // Write restart offsets array
        for &off in &restart_offsets {
            result.write_u32::<LittleEndian>(off)?;
        }

        // Write footer
        result.write_u32::<LittleEndian>(restart_count as u32)?;
        result.write_u16::<LittleEndian>(RESTART_INTERVAL as u16)?;

        Ok(result)
    }

    /// Load from raw bytes
    pub fn load(data: OwnedBytes) -> io::Result<Self> {
        if data.len() < 4 {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "MmapBlockIndex data too short",
            ));
        }

        let num_blocks = u32::from_le_bytes([data[0], data[1], data[2], data[3]]);

        // Load block addresses
        let addr_data_start = 4;
        let remaining = data.slice(addr_data_start..data.len());
        let block_addrs = BlockAddrStore::load(remaining.clone())?;

        // Calculate where keys start
        let bits_per_entry = block_addrs.offset_bits as usize + block_addrs.length_bits as usize;
        let total_bits = bits_per_entry * num_blocks as usize;
        let addr_packed_size = total_bits.div_ceil(8);
        let keys_offset = addr_data_start + 6 + addr_packed_size; // 6 = header of BlockAddrStore

        // Read footer (last 6 bytes: restart_count u32 + restart_interval u16)
        if data.len() < keys_offset + 6 {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "MmapBlockIndex missing restart footer",
            ));
        }
        let footer_start = data.len() - 6;
        let restart_count = u32::from_le_bytes([
            data[footer_start],
            data[footer_start + 1],
            data[footer_start + 2],
            data[footer_start + 3],
        ]) as usize;
        let restart_interval =
            u16::from_le_bytes([data[footer_start + 4], data[footer_start + 5]]) as usize;

        // Restart offsets array: restart_count × 4 bytes, just before the footer.
        // Validate first so a corrupt restart_count cannot underflow below.
        if restart_count * 4 > footer_start - keys_offset {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "MmapBlockIndex restart array overruns keys section",
            ));
        }
        let restart_array_offset = footer_start - restart_count * 4;

        // Keys section spans from keys_offset to restart_array_offset
        let keys_end = restart_array_offset;

        Ok(Self {
            data,
            num_blocks,
            block_addrs,
            keys_offset,
            keys_end,
            restart_array_offset,
            restart_count,
            restart_interval,
        })
    }

    /// Read restart offset at given index directly from mmap'd data
    #[inline]
    fn restart_offset(&self, idx: usize) -> u32 {
        let pos = self.restart_array_offset + idx * 4;
        u32::from_le_bytes([
            self.data[pos],
            self.data[pos + 1],
            self.data[pos + 2],
            self.data[pos + 3],
        ])
    }

    /// Decode the full key at a restart point (prefix_len is always 0)
    fn decode_restart_key<'a>(&self, keys_data: &'a [u8], restart_idx: usize) -> &'a [u8] {
        let offset = self.restart_offset(restart_idx) as usize;
        let mut reader = &keys_data[offset..];

        let prefix_len = read_vint(&mut reader).unwrap_or(0) as usize;
        debug_assert_eq!(prefix_len, 0, "restart point should have prefix_len=0");
        let suffix_len = read_vint(&mut reader).unwrap_or(0) as usize;

        // reader now points to the suffix bytes
        &reader[..suffix_len]
    }

    /// O(log(N/R) + R) lookup: binary search on restart points, then a
    /// linear scan with prefix decompression within the interval.
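    ///
    /// For example (illustrative): with N = 10,000 blocks and R = 16 there are
    /// 625 restart keys, so a lookup costs about ceil(log2(625)) = 10 restart-key
    /// comparisons plus at most 16 entry decodes within one interval.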
    pub fn locate(&self, target: &[u8]) -> Option<usize> {
        if self.num_blocks == 0 {
            return None;
        }

        let keys_data = &self.data.as_slice()[self.keys_offset..self.keys_end];

        // Binary search on restart points to find the interval
        let mut lo = 0usize;
        let mut hi = self.restart_count;

        while lo < hi {
            let mid = lo + (hi - lo) / 2;
            let key = self.decode_restart_key(keys_data, mid);
            match key.cmp(target) {
                std::cmp::Ordering::Equal => {
                    return Some(mid * self.restart_interval);
                }
                std::cmp::Ordering::Less => lo = mid + 1,
                std::cmp::Ordering::Greater => hi = mid,
            }
        }

        // lo is now the first restart point whose key > target (or
        // restart_count if no such restart exists). Scan the interval that
        // starts at restart lo - 1; when lo == 0 the target sorts before the
        // first restart key, so we scan from the beginning and the scan
        // returns None if the target precedes every key.
        let restart_idx = lo.saturating_sub(1);
        let start_ordinal = restart_idx * self.restart_interval;
        let end_ordinal = if restart_idx + 1 < self.restart_count {
            (restart_idx + 1) * self.restart_interval
        } else {
            self.num_blocks as usize
        };

        // Linear scan from the restart point through at most R entries
        let scan_offset = self.restart_offset(restart_idx) as usize;
        let mut reader = &keys_data[scan_offset..];
        let mut current_key = Vec::new();
        let mut last_le_block: Option<usize> = None;

        for i in start_ordinal..end_ordinal {
            let prefix_len = match read_vint(&mut reader) {
                Ok(v) => v as usize,
                Err(_) => break,
            };
            let suffix_len = match read_vint(&mut reader) {
                Ok(v) => v as usize,
                Err(_) => break,
            };

            current_key.truncate(prefix_len);
            if suffix_len > reader.len() {
                break;
            }
            current_key.extend_from_slice(&reader[..suffix_len]);
            reader = &reader[suffix_len..];

            match current_key.as_slice().cmp(target) {
                std::cmp::Ordering::Equal => return Some(i),
                std::cmp::Ordering::Less => last_le_block = Some(i),
                std::cmp::Ordering::Greater => return last_le_block,
            }
        }

        last_le_block
    }

    /// Get block address by ordinal
    pub fn get_addr(&self, ordinal: usize) -> Option<BlockAddr> {
        self.block_addrs.get(ordinal)
    }

    /// Number of blocks
    pub fn len(&self) -> usize {
        self.num_blocks as usize
    }

    /// Check if empty
    pub fn is_empty(&self) -> bool {
        self.num_blocks == 0
    }

    /// Get all block addresses
    pub fn all_addrs(&self) -> Vec<BlockAddr> {
        self.block_addrs.all()
    }

    /// Decode all keys (for debugging/merging)
    pub fn all_keys(&self) -> Vec<Vec<u8>> {
        let mut result = Vec::with_capacity(self.num_blocks as usize);
        let keys_data = &self.data.as_slice()[self.keys_offset..self.keys_end];
        let mut reader = keys_data;
        let mut current_key = Vec::new();

        for _ in 0..self.num_blocks {
            let prefix_len = match read_vint(&mut reader) {
                Ok(v) => v as usize,
                Err(_) => break,
            };
            let suffix_len = match read_vint(&mut reader) {
                Ok(v) => v as usize,
                Err(_) => break,
            };

            current_key.truncate(prefix_len);
            if suffix_len > reader.len() {
                break;
            }
            current_key.extend_from_slice(&reader[..suffix_len]);
            reader = &reader[suffix_len..];

            result.push(current_key.clone());
        }

        result
    }
}

/// Unified block index that can use either the FST or the mmap'd raw index
pub enum BlockIndex {
    #[cfg(feature = "native")]
    Fst(FstBlockIndex),
    Mmap(MmapBlockIndex),
}

impl BlockIndex {
    /// Locate the block that could contain the key
    pub fn locate(&self, key: &[u8]) -> Option<usize> {
        match self {
            #[cfg(feature = "native")]
            BlockIndex::Fst(idx) => idx.locate(key),
            BlockIndex::Mmap(idx) => idx.locate(key),
        }
    }

    /// Get block address by ordinal
    pub fn get_addr(&self, ordinal: usize) -> Option<BlockAddr> {
        match self {
            #[cfg(feature = "native")]
            BlockIndex::Fst(idx) => idx.get_addr(ordinal),
            BlockIndex::Mmap(idx) => idx.get_addr(ordinal),
        }
    }

    /// Number of blocks
    pub fn len(&self) -> usize {
        match self {
            #[cfg(feature = "native")]
            BlockIndex::Fst(idx) => idx.len(),
            BlockIndex::Mmap(idx) => idx.len(),
        }
    }

    /// Check if empty
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Get all block addresses
    pub fn all_addrs(&self) -> Vec<BlockAddr> {
        match self {
            #[cfg(feature = "native")]
            BlockIndex::Fst(idx) => idx.all_addrs(),
            BlockIndex::Mmap(idx) => idx.all_addrs(),
        }
    }
}

// ============================================================================
// Helper functions
// ============================================================================

fn common_prefix_len(a: &[u8], b: &[u8]) -> usize {
    a.iter().zip(b.iter()).take_while(|(x, y)| x == y).count()
}
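
/// LEB128-style varint used by `write_vint`/`read_vint`: seven payload bits
/// per byte, high bit set on every byte except the last. For example
/// (illustrative), 300 = 0b1_0010_1100 encodes as `[0xAC, 0x02]`.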
fn write_vint<W: Write>(writer: &mut W, mut value: u64) -> io::Result<()> {
    loop {
        let byte = (value & 0x7F) as u8;
        value >>= 7;
        if value == 0 {
            writer.write_all(&[byte])?;
            return Ok(());
        } else {
            writer.write_all(&[byte | 0x80])?;
        }
    }
}

fn read_vint(reader: &mut &[u8]) -> io::Result<u64> {
    let mut result = 0u64;
    let mut shift = 0;

    loop {
        if reader.is_empty() {
            return Err(io::Error::new(
                io::ErrorKind::UnexpectedEof,
                "Unexpected end of varint",
            ));
        }
        let byte = reader[0];
        *reader = &reader[1..];
        result |= ((byte & 0x7F) as u64) << shift;
        if byte & 0x80 == 0 {
            return Ok(result);
        }
        shift += 7;
        if shift >= 64 {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "Varint too long",
            ));
        }
    }
}

/// Simple bit writer for packing (least-significant bits first)
struct BitWriter<'a> {
    output: &'a mut Vec<u8>,
    /// u128 so a full 64-bit value plus up to 7 already-buffered bits
    /// cannot overflow the accumulator
    buffer: u128,
    bits_in_buffer: u8,
}

impl<'a> BitWriter<'a> {
    fn new(output: &'a mut Vec<u8>) -> Self {
        Self {
            output,
            buffer: 0,
            bits_in_buffer: 0,
        }
    }

    fn write(&mut self, value: u64, num_bits: u8) -> io::Result<()> {
        debug_assert!(num_bits <= 64);
        debug_assert!(num_bits == 64 || value < (1u64 << num_bits));

        self.buffer |= (value as u128) << self.bits_in_buffer;
        self.bits_in_buffer += num_bits;

        while self.bits_in_buffer >= 8 {
            self.output.push(self.buffer as u8);
            self.buffer >>= 8;
            self.bits_in_buffer -= 8;
        }

        Ok(())
    }

    fn flush(&mut self) -> io::Result<()> {
        if self.bits_in_buffer > 0 {
            self.output.push(self.buffer as u8);
            self.buffer = 0;
            self.bits_in_buffer = 0;
        }
        Ok(())
    }
}

/// Simple bit reader for unpacking
struct BitReader<'a> {
    data: &'a [u8],
    byte_pos: usize,
    bit_pos: u8,
}

impl<'a> BitReader<'a> {
    fn new(data: &'a [u8]) -> Self {
        Self {
            data,
            byte_pos: 0,
            bit_pos: 0,
        }
    }

    fn read(&mut self, num_bits: u8) -> io::Result<u64> {
        if num_bits == 0 {
            return Ok(0);
        }

        let mut result: u64 = 0;
        let mut bits_read: u8 = 0;

        while bits_read < num_bits {
            if self.byte_pos >= self.data.len() {
                return Err(io::Error::new(
                    io::ErrorKind::UnexpectedEof,
                    "Not enough bits",
                ));
            }

            let bits_available = 8 - self.bit_pos;
            let bits_to_read = (num_bits - bits_read).min(bits_available);
            // Handle the edge case where bits_to_read == 8 to avoid shift overflow
            let mask = if bits_to_read >= 8 {
                0xFF
            } else {
                (1u8 << bits_to_read) - 1
            };
            let bits = (self.data[self.byte_pos] >> self.bit_pos) & mask;

            result |= (bits as u64) << bits_read;
            bits_read += bits_to_read;
            self.bit_pos += bits_to_read;

            if self.bit_pos >= 8 {
                self.byte_pos += 1;
                self.bit_pos = 0;
            }
        }

        Ok(result)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_block_addr_store_roundtrip() {
        let addrs = vec![
            BlockAddr {
                offset: 0,
                length: 1000,
            },
            BlockAddr {
                offset: 1000,
                length: 1500,
            },
            BlockAddr {
                offset: 2500,
                length: 800,
            },
            BlockAddr {
                offset: 3300,
                length: 2000,
            },
        ];

        let bytes = BlockAddrStore::build(&addrs).unwrap();
        let store = BlockAddrStore::load(OwnedBytes::new(bytes)).unwrap();

        assert_eq!(store.len(), 4);
        for (i, expected) in addrs.iter().enumerate() {
            let actual = store.get(i).unwrap();
            assert_eq!(actual.offset, expected.offset, "offset mismatch at {}", i);
            assert_eq!(actual.length, expected.length, "length mismatch at {}", i);
        }
    }

    #[test]
    fn test_block_addr_store_empty() {
        let bytes = BlockAddrStore::build(&[]).unwrap();
        let store = BlockAddrStore::load(OwnedBytes::new(bytes)).unwrap();
        assert_eq!(store.len(), 0);
        assert!(store.get(0).is_none());
    }

    #[test]
    fn test_mmap_block_index_roundtrip() {
        let entries = vec![
            (
                b"aaa".to_vec(),
                BlockAddr {
                    offset: 0,
                    length: 100,
                },
            ),
            (
                b"bbb".to_vec(),
                BlockAddr {
                    offset: 100,
                    length: 150,
                },
            ),
            (
                b"ccc".to_vec(),
                BlockAddr {
                    offset: 250,
                    length: 200,
                },
            ),
        ];

        let bytes = MmapBlockIndex::build(&entries).unwrap();
        let index = MmapBlockIndex::load(OwnedBytes::new(bytes)).unwrap();

        assert_eq!(index.len(), 3);

        // Test locate
        assert_eq!(index.locate(b"aaa"), Some(0));
        assert_eq!(index.locate(b"bbb"), Some(1));
        assert_eq!(index.locate(b"ccc"), Some(2));
        assert_eq!(index.locate(b"aab"), Some(0)); // Between aaa and bbb
        assert_eq!(index.locate(b"ddd"), Some(2)); // After all keys
        assert_eq!(index.locate(b"000"), None); // Before all keys
    }
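
    // Supplementary test (a sketch added for illustration): exercises lookups
    // across restart-point boundaries. With more than RESTART_INTERVAL
    // entries, locate() binary-searches the restart keys and then scans
    // prefix-compressed entries within a single interval.
    #[test]
    fn test_mmap_block_index_restart_boundaries() {
        let entries: Vec<(Vec<u8>, BlockAddr)> = (0..40u64)
            .map(|i| {
                (
                    format!("key{:03}", i).into_bytes(),
                    BlockAddr {
                        offset: i * 100,
                        length: 100,
                    },
                )
            })
            .collect();

        let bytes = MmapBlockIndex::build(&entries).unwrap();
        let index = MmapBlockIndex::load(OwnedBytes::new(bytes)).unwrap();

        assert_eq!(index.len(), 40);
        // Exact matches, including keys at the restarts (i = 0, 16, 32)
        for (i, (key, _)) in entries.iter().enumerate() {
            assert_eq!(index.locate(key), Some(i), "exact match for entry {}", i);
        }
        // A key that sorts between key015 and key016 lands in block 15
        assert_eq!(index.locate(b"key015a"), Some(15));
        // Decoded keys round-trip through the prefix compression
        let keys: Vec<Vec<u8>> = entries.iter().map(|(k, _)| k.clone()).collect();
        assert_eq!(index.all_keys(), keys);
    }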

    #[cfg(feature = "native")]
    #[test]
    fn test_fst_block_index_roundtrip() {
        let entries = vec![
            (
                b"aaa".to_vec(),
                BlockAddr {
                    offset: 0,
                    length: 100,
                },
            ),
            (
                b"bbb".to_vec(),
                BlockAddr {
                    offset: 100,
                    length: 150,
                },
            ),
            (
                b"ccc".to_vec(),
                BlockAddr {
                    offset: 250,
                    length: 200,
                },
            ),
        ];

        let bytes = FstBlockIndex::build(&entries).unwrap();
        let index = FstBlockIndex::load(OwnedBytes::new(bytes)).unwrap();

        assert_eq!(index.len(), 3);

        // Test locate
        assert_eq!(index.locate(b"aaa"), Some(0));
        assert_eq!(index.locate(b"bbb"), Some(1));
        assert_eq!(index.locate(b"ccc"), Some(2));
        assert_eq!(index.locate(b"aab"), Some(0)); // Between aaa and bbb
        assert_eq!(index.locate(b"ddd"), Some(2)); // After all keys
    }

    #[test]
    fn test_bit_writer_reader() {
        let mut buf = Vec::new();
        let mut writer = BitWriter::new(&mut buf);

        writer.write(5, 3).unwrap(); // 101
        writer.write(3, 2).unwrap(); // 11
        writer.write(15, 4).unwrap(); // 1111
        writer.flush().unwrap();

        let mut reader = BitReader::new(&buf);
        assert_eq!(reader.read(3).unwrap(), 5);
        assert_eq!(reader.read(2).unwrap(), 3);
        assert_eq!(reader.read(4).unwrap(), 15);
    }
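
    // Supplementary test (a sketch added for illustration): the LEB128-style
    // varint helpers round-trip values of various magnitudes (seven payload
    // bits per byte, high bit = continuation).
    #[test]
    fn test_vint_roundtrip() {
        for &value in &[0u64, 1, 127, 128, 300, 16_383, 16_384, u64::MAX] {
            let mut buf = Vec::new();
            write_vint(&mut buf, value).unwrap();
            let mut reader = buf.as_slice();
            assert_eq!(read_vint(&mut reader).unwrap(), value);
            assert!(reader.is_empty(), "trailing bytes after varint for {}", value);
        }
    }

    // Supplementary test (a sketch added for illustration): the unified
    // BlockIndex enum dispatches to the wrapped index; shown with Mmap.
    #[test]
    fn test_block_index_enum_dispatch() {
        let entries = vec![
            (
                b"alpha".to_vec(),
                BlockAddr {
                    offset: 0,
                    length: 64,
                },
            ),
            (
                b"beta".to_vec(),
                BlockAddr {
                    offset: 64,
                    length: 32,
                },
            ),
        ];
        let bytes = MmapBlockIndex::build(&entries).unwrap();
        let index = BlockIndex::Mmap(MmapBlockIndex::load(OwnedBytes::new(bytes)).unwrap());

        assert_eq!(index.len(), 2);
        assert!(!index.is_empty());
        assert_eq!(index.locate(b"azzz"), Some(0)); // falls inside block 0
        assert_eq!(
            index.get_addr(1),
            Some(BlockAddr {
                offset: 64,
                length: 32,
            })
        );
        assert_eq!(index.all_addrs().len(), 2);
    }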
}