use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use std::io::{self, Write};
use std::ops::Range;

use crate::directories::OwnedBytes;

/// The location of one block in the underlying file: a byte offset plus a
/// length.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct BlockAddr {
    pub offset: u64,
    pub length: u32,
}

impl BlockAddr {
    /// Byte range `offset..offset + length` covered by this block.
    pub fn byte_range(&self) -> Range<u64> {
        self.offset..self.offset + self.length as u64
    }
}

/// Compact store for a sequence of `BlockAddr`s.
///
/// Layout: a 6-byte header (`u32` block count, `u8` offset bits, `u8` length
/// bits) followed by one bit-packed `(offset_delta, length)` pair per block.
/// Offsets are delta-encoded against the end of the previous block, so
/// contiguous blocks need almost no offset bits.
#[derive(Debug)]
pub struct BlockAddrStore {
    data: OwnedBytes,
    num_blocks: u32,
    offset_bits: u8,
    length_bits: u8,
    header_size: usize,
}

impl BlockAddrStore {
    /// Serializes `addrs` into the packed representation described above.
    ///
    /// Blocks are expected in increasing offset order; `saturating_sub`
    /// guards against malformed input, but overlapping blocks will not
    /// round-trip exactly.
    pub fn build(addrs: &[BlockAddr]) -> io::Result<Vec<u8>> {
        if addrs.is_empty() {
            let mut buf = Vec::with_capacity(6);
            buf.write_u32::<LittleEndian>(0)?;
            buf.write_u8(0)?;
            buf.write_u8(0)?;
            return Ok(buf);
        }

        let mut deltas = Vec::with_capacity(addrs.len());
        let mut prev_end: u64 = 0;
        let mut max_delta: u64 = 0;
        let mut max_length: u32 = 0;

        for addr in addrs {
            let delta = addr.offset.saturating_sub(prev_end);
            deltas.push(delta);
            max_delta = max_delta.max(delta);
            max_length = max_length.max(addr.length);
            prev_end = addr.offset + addr.length as u64;
        }

        // Use at least one bit per field so every entry has nonzero width.
        let offset_bits = if max_delta == 0 {
            1
        } else {
            (64 - max_delta.leading_zeros()) as u8
        };
        let length_bits = if max_length == 0 {
            1
        } else {
            (32 - max_length.leading_zeros()) as u8
        };

        let bits_per_entry = offset_bits as usize + length_bits as usize;
        let total_bits = bits_per_entry * addrs.len();
        let packed_bytes = total_bits.div_ceil(8);

        let mut buf = Vec::with_capacity(6 + packed_bytes);
        buf.write_u32::<LittleEndian>(addrs.len() as u32)?;
        buf.write_u8(offset_bits)?;
        buf.write_u8(length_bits)?;

        let mut bit_writer = BitWriter::new(&mut buf);
        for (i, addr) in addrs.iter().enumerate() {
            bit_writer.write(deltas[i], offset_bits)?;
            bit_writer.write(addr.length as u64, length_bits)?;
        }
        bit_writer.flush()?;

        Ok(buf)
    }

    /// Reads the 6-byte header and keeps the packed payload for lazy
    /// decoding.
    pub fn load(data: OwnedBytes) -> io::Result<Self> {
        if data.len() < 6 {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "BlockAddrStore data too short",
            ));
        }

        let mut reader = data.as_slice();
        let num_blocks = reader.read_u32::<LittleEndian>()?;
        let offset_bits = reader.read_u8()?;
        let length_bits = reader.read_u8()?;

        Ok(Self {
            data,
            num_blocks,
            offset_bits,
            length_bits,
            header_size: 6,
        })
    }

    pub fn len(&self) -> usize {
        self.num_blocks as usize
    }

    pub fn is_empty(&self) -> bool {
        self.num_blocks == 0
    }

    /// Decodes the address of block `idx`, or `None` if out of range.
    ///
    /// Entries are delta-encoded, so this decodes sequentially from the
    /// start: O(idx) per call.
    pub fn get(&self, idx: usize) -> Option<BlockAddr> {
        if idx >= self.num_blocks as usize {
            return None;
        }

        let packed_data = &self.data.as_slice()[self.header_size..];

        let mut bit_reader = BitReader::new(packed_data);
        let mut current_offset: u64 = 0;

        for i in 0..=idx {
            let delta = bit_reader.read(self.offset_bits).ok()?;
            let length = bit_reader.read(self.length_bits).ok()? as u32;

            current_offset += delta;

            if i == idx {
                return Some(BlockAddr {
                    offset: current_offset,
                    length,
                });
            }

            current_offset += length as u64;
        }

        None
    }

    /// Decodes every block address in order.
    pub fn all(&self) -> Vec<BlockAddr> {
        let mut result = Vec::with_capacity(self.num_blocks as usize);
        let packed_data = &self.data.as_slice()[self.header_size..];
        let mut bit_reader = BitReader::new(packed_data);
        let mut current_offset: u64 = 0;

        for _ in 0..self.num_blocks {
            if let (Ok(delta), Ok(length)) = (
                bit_reader.read(self.offset_bits),
                bit_reader.read(self.length_bits),
            ) {
                current_offset += delta;
                result.push(BlockAddr {
                    offset: current_offset,
                    length: length as u32,
                });
                current_offset += length;
            }
        }

        result
    }
}

/// Block index backed by an `fst::Map` from key to block ordinal, with the
/// block addresses stored in a trailing `BlockAddrStore`. Only available
/// with the `native` feature.
#[cfg(feature = "native")]
pub struct FstBlockIndex {
    fst: fst::Map<OwnedBytes>,
    block_addrs: BlockAddrStore,
}

#[cfg(feature = "native")]
impl FstBlockIndex {
    /// Serializes `entries` (which must be sorted by key) as: `u32` FST
    /// length, the FST bytes, then the `BlockAddrStore` bytes.
    pub fn build(entries: &[(Vec<u8>, BlockAddr)]) -> io::Result<Vec<u8>> {
        use fst::MapBuilder;

        let mut fst_builder = MapBuilder::memory();
        for (i, (key, _)) in entries.iter().enumerate() {
            fst_builder
                .insert(key, i as u64)
                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
        }
        let fst_bytes = fst_builder
            .into_inner()
            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;

        let addrs: Vec<BlockAddr> = entries.iter().map(|(_, addr)| *addr).collect();
        let addr_bytes = BlockAddrStore::build(&addrs)?;

        let mut result = Vec::with_capacity(4 + fst_bytes.len() + addr_bytes.len());
        result.write_u32::<LittleEndian>(fst_bytes.len() as u32)?;
        result.extend_from_slice(&fst_bytes);
        result.extend_from_slice(&addr_bytes);

        Ok(result)
    }

    pub fn load(data: OwnedBytes) -> io::Result<Self> {
        if data.len() < 4 {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "FstBlockIndex data too short",
            ));
        }

        let fst_len = u32::from_le_bytes([data[0], data[1], data[2], data[3]]) as usize;

        if data.len() < 4 + fst_len {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "FstBlockIndex FST data truncated",
            ));
        }

        let fst_data = data.slice(4..4 + fst_len);
        let addr_data = data.slice(4 + fst_len..data.len());

        let fst =
            fst::Map::new(fst_data).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
        let block_addrs = BlockAddrStore::load(addr_data)?;

        Ok(Self { fst, block_addrs })
    }

    /// Returns the ordinal of the block whose key is the greatest key less
    /// than or equal to `key`, or `None` if `key` sorts before every block.
    pub fn locate(&self, key: &[u8]) -> Option<usize> {
        use fst::{IntoStreamer, Streamer};

        // Fast path: an exact match is the first entry of the `ge` range.
        let mut stream = self.fst.range().ge(key).into_stream();

        if let Some((found_key, ordinal)) = stream.next()
            && found_key == key
        {
            return Some(ordinal as usize);
        }

        // No exact match: fst streams only move forward, so find the
        // predecessor by scanning the `lt` range to its end.
        let mut stream = self.fst.range().lt(key).into_stream();
        let mut last_ordinal = None;
        while let Some((_, ordinal)) = stream.next() {
            last_ordinal = Some(ordinal as usize);
        }

        last_ordinal
    }

    pub fn get_addr(&self, ordinal: usize) -> Option<BlockAddr> {
        self.block_addrs.get(ordinal)
    }

    pub fn len(&self) -> usize {
        self.block_addrs.len()
    }

    pub fn is_empty(&self) -> bool {
        self.block_addrs.is_empty()
    }

    pub fn all_addrs(&self) -> Vec<BlockAddr> {
        self.block_addrs.all()
    }
}

/// Block index designed to be read directly from mmapped bytes.
///
/// Layout: `u32` block count, the `BlockAddrStore` bytes, then one
/// prefix-compressed key record per block (`vint prefix_len`,
/// `vint suffix_len`, suffix bytes).
pub struct MmapBlockIndex {
    data: OwnedBytes,
    num_blocks: u32,
    block_addrs: BlockAddrStore,
    /// Byte offset of the key stream within `data`.
    keys_offset: usize,
}

impl MmapBlockIndex {
    pub fn build(entries: &[(Vec<u8>, BlockAddr)]) -> io::Result<Vec<u8>> {
        if entries.is_empty() {
            let mut buf = Vec::with_capacity(10);
            buf.write_u32::<LittleEndian>(0)?;
            buf.extend_from_slice(&BlockAddrStore::build(&[])?);
            return Ok(buf);
        }

        let addrs: Vec<BlockAddr> = entries.iter().map(|(_, addr)| *addr).collect();
        let addr_bytes = BlockAddrStore::build(&addrs)?;

        // Prefix-compress the keys: each record stores how many bytes it
        // shares with the previous key, followed by the new suffix.
        let mut keys_buf = Vec::new();
        let mut prev_key: Vec<u8> = Vec::new();

        for (key, _) in entries {
            let prefix_len = common_prefix_len(&prev_key, key);
            let suffix = &key[prefix_len..];

            write_vint(&mut keys_buf, prefix_len as u64)?;
            write_vint(&mut keys_buf, suffix.len() as u64)?;
            keys_buf.extend_from_slice(suffix);

            prev_key.clear();
            prev_key.extend_from_slice(key);
        }

        let mut result = Vec::with_capacity(4 + addr_bytes.len() + keys_buf.len());
        result.write_u32::<LittleEndian>(entries.len() as u32)?;
        result.extend_from_slice(&addr_bytes);
        result.extend_from_slice(&keys_buf);

        Ok(result)
    }

    pub fn load(data: OwnedBytes) -> io::Result<Self> {
        if data.len() < 4 {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "MmapBlockIndex data too short",
            ));
        }

        let num_blocks = u32::from_le_bytes([data[0], data[1], data[2], data[3]]);

        let addr_data_start = 4;
        let remaining = data.slice(addr_data_start..data.len());
        let block_addrs = BlockAddrStore::load(remaining.clone())?;

        // The key stream starts right after the store's 6-byte header and
        // its bit-packed payload.
        let bits_per_entry = block_addrs.offset_bits as usize + block_addrs.length_bits as usize;
        let total_bits = bits_per_entry * num_blocks as usize;
        let addr_packed_size = total_bits.div_ceil(8);
        let keys_offset = addr_data_start + 6 + addr_packed_size;

        Ok(Self {
            data,
            num_blocks,
            block_addrs,
            keys_offset,
        })
    }

    /// Returns the ordinal of the block whose key is the greatest key less
    /// than or equal to `target`, scanning the key stream linearly.
    pub fn locate(&self, target: &[u8]) -> Option<usize> {
        if self.num_blocks == 0 {
            return None;
        }

        let keys_data = &self.data.as_slice()[self.keys_offset..];
        let mut reader = keys_data;
        let mut current_key = Vec::new();
        let mut last_le_block: Option<usize> = None;

        for i in 0..self.num_blocks as usize {
            let prefix_len = match read_vint(&mut reader) {
                Ok(v) => v as usize,
                Err(_) => break,
            };
            let suffix_len = match read_vint(&mut reader) {
                Ok(v) => v as usize,
                Err(_) => break,
            };

            current_key.truncate(prefix_len);
            if suffix_len > reader.len() {
                break;
            }
            current_key.extend_from_slice(&reader[..suffix_len]);
            reader = &reader[suffix_len..];

            match current_key.as_slice().cmp(target) {
                std::cmp::Ordering::Equal => return Some(i),
                std::cmp::Ordering::Less => last_le_block = Some(i),
                std::cmp::Ordering::Greater => {
                    // Keys are sorted, so the previous block (if any) is
                    // the best match.
                    return last_le_block;
                }
            }
        }

        last_le_block
    }

    pub fn get_addr(&self, ordinal: usize) -> Option<BlockAddr> {
        self.block_addrs.get(ordinal)
    }

    pub fn len(&self) -> usize {
        self.num_blocks as usize
    }

    pub fn is_empty(&self) -> bool {
        self.num_blocks == 0
    }

    pub fn all_addrs(&self) -> Vec<BlockAddr> {
        self.block_addrs.all()
    }

    /// Decodes every key in order by replaying the prefix compression.
    pub fn all_keys(&self) -> Vec<Vec<u8>> {
        let mut result = Vec::with_capacity(self.num_blocks as usize);
        let keys_data = &self.data.as_slice()[self.keys_offset..];
        let mut reader = keys_data;
        let mut current_key = Vec::new();

        for _ in 0..self.num_blocks {
            let prefix_len = match read_vint(&mut reader) {
                Ok(v) => v as usize,
                Err(_) => break,
            };
            let suffix_len = match read_vint(&mut reader) {
                Ok(v) => v as usize,
                Err(_) => break,
            };

            current_key.truncate(prefix_len);
            if suffix_len > reader.len() {
                break;
            }
            current_key.extend_from_slice(&reader[..suffix_len]);
            reader = &reader[suffix_len..];

            result.push(current_key.clone());
        }

        result
    }
}

/// A block index in either representation.
pub enum BlockIndex {
    #[cfg(feature = "native")]
    Fst(FstBlockIndex),
    Mmap(MmapBlockIndex),
}

impl BlockIndex {
    pub fn locate(&self, key: &[u8]) -> Option<usize> {
        match self {
            #[cfg(feature = "native")]
            BlockIndex::Fst(idx) => idx.locate(key),
            BlockIndex::Mmap(idx) => idx.locate(key),
        }
    }

    pub fn get_addr(&self, ordinal: usize) -> Option<BlockAddr> {
        match self {
            #[cfg(feature = "native")]
            BlockIndex::Fst(idx) => idx.get_addr(ordinal),
            BlockIndex::Mmap(idx) => idx.get_addr(ordinal),
        }
    }

    pub fn len(&self) -> usize {
        match self {
            #[cfg(feature = "native")]
            BlockIndex::Fst(idx) => idx.len(),
            BlockIndex::Mmap(idx) => idx.len(),
        }
    }

    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    pub fn all_addrs(&self) -> Vec<BlockAddr> {
        match self {
            #[cfg(feature = "native")]
            BlockIndex::Fst(idx) => idx.all_addrs(),
            BlockIndex::Mmap(idx) => idx.all_addrs(),
        }
    }
}

/// Length of the longest shared prefix of `a` and `b`.
fn common_prefix_len(a: &[u8], b: &[u8]) -> usize {
    a.iter().zip(b.iter()).take_while(|(x, y)| x == y).count()
}

/// Writes `value` as an LEB128-style varint: 7 bits per byte, with the high
/// bit as the continuation flag.
fn write_vint<W: Write>(writer: &mut W, mut value: u64) -> io::Result<()> {
    loop {
        let byte = (value & 0x7F) as u8;
        value >>= 7;
        if value == 0 {
            writer.write_all(&[byte])?;
            return Ok(());
        } else {
            writer.write_all(&[byte | 0x80])?;
        }
    }
}

/// Reads a varint written by `write_vint`, advancing `reader` past it.
fn read_vint(reader: &mut &[u8]) -> io::Result<u64> {
    let mut result = 0u64;
    let mut shift = 0;

    loop {
        if reader.is_empty() {
            return Err(io::Error::new(
                io::ErrorKind::UnexpectedEof,
                "Unexpected end of varint",
            ));
        }
        let byte = reader[0];
        *reader = &reader[1..];
        result |= ((byte & 0x7F) as u64) << shift;
        if byte & 0x80 == 0 {
            return Ok(result);
        }
        shift += 7;
        if shift >= 64 {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "Varint too long",
            ));
        }
    }
}

/// Little-endian bit packer: values are appended starting at the lowest
/// unused bit of the current byte.
struct BitWriter<'a> {
    output: &'a mut Vec<u8>,
    buffer: u64,
    bits_in_buffer: u8,
}

impl<'a> BitWriter<'a> {
    fn new(output: &'a mut Vec<u8>) -> Self {
        Self {
            output,
            buffer: 0,
            bits_in_buffer: 0,
        }
    }

    /// Appends the low `num_bits` bits of `value`.
    fn write(&mut self, value: u64, num_bits: u8) -> io::Result<()> {
        debug_assert!(num_bits <= 64);

        // `bits_in_buffer` is always < 8 between calls, so at most the top
        // 7 bits of `value` can be shifted out of the 64-bit buffer here;
        // they are re-emitted below once the buffer has been drained.
        let low_bits = num_bits.min(64 - self.bits_in_buffer);
        self.buffer |= value << self.bits_in_buffer;
        self.bits_in_buffer += low_bits;

        while self.bits_in_buffer >= 8 {
            self.output.push(self.buffer as u8);
            self.buffer >>= 8;
            self.bits_in_buffer -= 8;
        }

        if num_bits > low_bits {
            // The buffer held exactly 64 bits and was fully drained above,
            // so the spilled high bits start a fresh buffer.
            self.buffer = value >> low_bits;
            self.bits_in_buffer = num_bits - low_bits;
        }

        Ok(())
    }

    /// Pads the final partial byte with zero bits and writes it out.
    fn flush(&mut self) -> io::Result<()> {
        if self.bits_in_buffer > 0 {
            self.output.push(self.buffer as u8);
            self.buffer = 0;
            self.bits_in_buffer = 0;
        }
        Ok(())
    }
}

/// Counterpart to `BitWriter`: reads values starting at the lowest unused
/// bit of the current byte.
struct BitReader<'a> {
    data: &'a [u8],
    byte_pos: usize,
    bit_pos: u8,
}

impl<'a> BitReader<'a> {
    fn new(data: &'a [u8]) -> Self {
        Self {
            data,
            byte_pos: 0,
            bit_pos: 0,
        }
    }

    /// Reads the next `num_bits` bits as an integer.
    fn read(&mut self, num_bits: u8) -> io::Result<u64> {
        if num_bits == 0 {
            return Ok(0);
        }

        let mut result: u64 = 0;
        let mut bits_read: u8 = 0;

        while bits_read < num_bits {
            if self.byte_pos >= self.data.len() {
                return Err(io::Error::new(
                    io::ErrorKind::UnexpectedEof,
                    "Not enough bits",
                ));
            }

            let bits_available = 8 - self.bit_pos;
            let bits_to_read = (num_bits - bits_read).min(bits_available);
            // Select just the bits we still need from the current byte.
            let mask = if bits_to_read >= 8 {
                0xFF
            } else {
                (1u8 << bits_to_read) - 1
            };
            let bits = (self.data[self.byte_pos] >> self.bit_pos) & mask;

            result |= (bits as u64) << bits_read;
            bits_read += bits_to_read;
            self.bit_pos += bits_to_read;

            if self.bit_pos >= 8 {
                self.byte_pos += 1;
                self.bit_pos = 0;
            }
        }

        Ok(result)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_block_addr_store_roundtrip() {
        let addrs = vec![
            BlockAddr {
                offset: 0,
                length: 1000,
            },
            BlockAddr {
                offset: 1000,
                length: 1500,
            },
            BlockAddr {
                offset: 2500,
                length: 800,
            },
            BlockAddr {
                offset: 3300,
                length: 2000,
            },
        ];

        let bytes = BlockAddrStore::build(&addrs).unwrap();
        let store = BlockAddrStore::load(OwnedBytes::new(bytes)).unwrap();

        assert_eq!(store.len(), 4);
        for (i, expected) in addrs.iter().enumerate() {
            let actual = store.get(i).unwrap();
            assert_eq!(actual.offset, expected.offset, "offset mismatch at {}", i);
            assert_eq!(actual.length, expected.length, "length mismatch at {}", i);
        }
    }
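
    // Added sketch (not from the original test set): gaps between blocks
    // force nonzero offset deltas, and a large final offset forces a wide
    // `offset_bits`, exercising multi-byte paths in the bit packer.
    #[test]
    fn test_block_addr_store_with_gaps() {
        let addrs = vec![
            BlockAddr {
                offset: 10,
                length: 5,
            },
            BlockAddr {
                offset: 100,
                length: 50,
            },
            BlockAddr {
                offset: 1u64 << 40,
                length: 1,
            },
        ];

        let bytes = BlockAddrStore::build(&addrs).unwrap();
        let store = BlockAddrStore::load(OwnedBytes::new(bytes)).unwrap();

        assert_eq!(store.all(), addrs);
        for (i, expected) in addrs.iter().enumerate() {
            assert_eq!(store.get(i).unwrap(), *expected);
        }
    }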

    #[test]
    fn test_block_addr_store_empty() {
        let bytes = BlockAddrStore::build(&[]).unwrap();
        let store = BlockAddrStore::load(OwnedBytes::new(bytes)).unwrap();
        assert_eq!(store.len(), 0);
        assert!(store.get(0).is_none());
    }
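
    // Added sketch: `byte_range` should cover exactly
    // `offset..offset + length`.
    #[test]
    fn test_block_addr_byte_range() {
        let addr = BlockAddr {
            offset: 100,
            length: 50,
        };
        assert_eq!(addr.byte_range(), 100u64..150u64);
    }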

    #[test]
    fn test_mmap_block_index_roundtrip() {
        let entries = vec![
            (
                b"aaa".to_vec(),
                BlockAddr {
                    offset: 0,
                    length: 100,
                },
            ),
            (
                b"bbb".to_vec(),
                BlockAddr {
                    offset: 100,
                    length: 150,
                },
            ),
            (
                b"ccc".to_vec(),
                BlockAddr {
                    offset: 250,
                    length: 200,
                },
            ),
        ];

        let bytes = MmapBlockIndex::build(&entries).unwrap();
        let index = MmapBlockIndex::load(OwnedBytes::new(bytes)).unwrap();

        assert_eq!(index.len(), 3);

        // Exact matches.
        assert_eq!(index.locate(b"aaa"), Some(0));
        assert_eq!(index.locate(b"bbb"), Some(1));
        assert_eq!(index.locate(b"ccc"), Some(2));
        // "aab" falls between "aaa" and "bbb", so it belongs to block 0.
        assert_eq!(index.locate(b"aab"), Some(0));
        // Past the last key: the last block.
        assert_eq!(index.locate(b"ddd"), Some(2));
        // Before the first key: no block.
        assert_eq!(index.locate(b"000"), None);
    }
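
    // Added sketches (not from the original test set): `all_keys` should
    // decode the prefix-compressed key stream back to the original keys,
    // and an empty index should load cleanly.
    #[test]
    fn test_mmap_block_index_all_keys() {
        let entries = vec![
            (
                b"apple".to_vec(),
                BlockAddr {
                    offset: 0,
                    length: 10,
                },
            ),
            (
                b"apply".to_vec(),
                BlockAddr {
                    offset: 10,
                    length: 20,
                },
            ),
            (
                b"banana".to_vec(),
                BlockAddr {
                    offset: 30,
                    length: 30,
                },
            ),
        ];

        let bytes = MmapBlockIndex::build(&entries).unwrap();
        let index = MmapBlockIndex::load(OwnedBytes::new(bytes)).unwrap();

        let keys: Vec<Vec<u8>> = entries.iter().map(|(key, _)| key.clone()).collect();
        assert_eq!(index.all_keys(), keys);
    }

    #[test]
    fn test_mmap_block_index_empty() {
        let bytes = MmapBlockIndex::build(&[]).unwrap();
        let index = MmapBlockIndex::load(OwnedBytes::new(bytes)).unwrap();
        assert!(index.is_empty());
        assert_eq!(index.locate(b"anything"), None);
    }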

    #[cfg(feature = "native")]
    #[test]
    fn test_fst_block_index_roundtrip() {
        let entries = vec![
            (
                b"aaa".to_vec(),
                BlockAddr {
                    offset: 0,
                    length: 100,
                },
            ),
            (
                b"bbb".to_vec(),
                BlockAddr {
                    offset: 100,
                    length: 150,
                },
            ),
            (
                b"ccc".to_vec(),
                BlockAddr {
                    offset: 250,
                    length: 200,
                },
            ),
        ];

        let bytes = FstBlockIndex::build(&entries).unwrap();
        let index = FstBlockIndex::load(OwnedBytes::new(bytes)).unwrap();

        assert_eq!(index.len(), 3);

        // Exact matches.
        assert_eq!(index.locate(b"aaa"), Some(0));
        assert_eq!(index.locate(b"bbb"), Some(1));
        assert_eq!(index.locate(b"ccc"), Some(2));
        // "aab" falls between "aaa" and "bbb", so it belongs to block 0.
        assert_eq!(index.locate(b"aab"), Some(0));
        // Past the last key: the last block.
        assert_eq!(index.locate(b"ddd"), Some(2));
    }
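
    // Added sketch: a key sorting before every entry should locate nothing,
    // mirroring the Mmap variant's behaviour.
    #[cfg(feature = "native")]
    #[test]
    fn test_fst_block_index_before_first_key() {
        let entries = vec![(
            b"mmm".to_vec(),
            BlockAddr {
                offset: 0,
                length: 1,
            },
        )];

        let bytes = FstBlockIndex::build(&entries).unwrap();
        let index = FstBlockIndex::load(OwnedBytes::new(bytes)).unwrap();

        assert_eq!(index.locate(b"aaa"), None);
        assert_eq!(index.locate(b"mmm"), Some(0));
        assert_eq!(index.locate(b"zzz"), Some(0));
    }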

    #[test]
    fn test_bit_writer_reader() {
        let mut buf = Vec::new();
        let mut writer = BitWriter::new(&mut buf);

        writer.write(5, 3).unwrap();
        writer.write(3, 2).unwrap();
        writer.write(15, 4).unwrap();
        writer.flush().unwrap();

        let mut reader = BitReader::new(&buf);
        assert_eq!(reader.read(3).unwrap(), 5);
        assert_eq!(reader.read(2).unwrap(), 3);
        assert_eq!(reader.read(4).unwrap(), 15);
    }
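
    // Added sketches (not from the original test set): vints round-trip
    // across representative widths, and 64-bit writes at unaligned bit
    // positions exercise the high-bit spill path in `BitWriter::write`.
    #[test]
    fn test_vint_roundtrip() {
        for value in [0u64, 1, 127, 128, 300, 1 << 20, u64::MAX] {
            let mut buf = Vec::new();
            write_vint(&mut buf, value).unwrap();
            let mut reader = buf.as_slice();
            assert_eq!(read_vint(&mut reader).unwrap(), value);
            assert!(reader.is_empty());
        }
    }

    #[test]
    fn test_bit_writer_reader_wide_values() {
        let mut buf = Vec::new();
        let mut writer = BitWriter::new(&mut buf);
        writer.write(1, 3).unwrap();
        writer.write(u64::MAX, 64).unwrap();
        writer.write(0b1010, 4).unwrap();
        writer.flush().unwrap();

        let mut reader = BitReader::new(&buf);
        assert_eq!(reader.read(3).unwrap(), 1);
        assert_eq!(reader.read(64).unwrap(), u64::MAX);
        assert_eq!(reader.read(4).unwrap(), 0b1010);
    }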
}