Skip to main content

binseq/cbq/core/
index.rs

1use bytemuck::{Pod, Zeroable};
2use zstd::stream::copy_encode;
3
4use crate::{Result, error::CbqError};
5
6use super::{BlockHeader, FileHeader, INDEX_MAGIC};
7
8/// The header for a compressed index.
9///
10/// This is stored identically in memory and on disk.
11#[derive(Debug, Clone, Copy, Zeroable, Pod)]
12#[repr(C)]
13pub struct IndexHeader {
14    /// Magic number identifying the index format
15    magic: [u8; 8],
16
17    /// Number of bytes in the uncompressed index
18    pub(crate) u_bytes: u64,
19
20    /// Number of bytes in the compressed index
21    pub(crate) z_bytes: u64,
22}
23impl IndexHeader {
24    /// Creates a new index header
25    #[must_use]
26    pub fn new(u_bytes: u64, z_bytes: u64) -> Self {
27        Self {
28            magic: *INDEX_MAGIC,
29            u_bytes,
30            z_bytes,
31        }
32    }
33
34    #[must_use]
35    pub fn as_bytes(&self) -> &[u8] {
36        bytemuck::bytes_of(self)
37    }
38
39    pub fn from_bytes(bytes: &[u8]) -> Result<Self> {
40        let header: Self = *bytemuck::from_bytes(bytes);
41        if header.magic != *INDEX_MAGIC {
42            return Err(CbqError::InvalidIndexHeaderMagic.into());
43        }
44        Ok(header)
45    }
46}
47
48/// The footer for a compressed index.
49///
50/// This is stored identically in memory and on disk.
51#[derive(Debug, Clone, Copy, Zeroable, Pod)]
52#[repr(C)]
53pub struct IndexFooter {
54    /// Number of bytes in the compressed index
55    pub(crate) bytes: u64,
56
57    /// Magic number identifying the index format
58    magic: [u8; 8],
59}
60
61impl IndexFooter {
62    /// Creates a new index footer
63    #[must_use]
64    pub fn new(bytes: u64) -> Self {
65        Self {
66            bytes,
67            magic: *INDEX_MAGIC,
68        }
69    }
70    #[must_use]
71    pub fn as_bytes(&self) -> &[u8] {
72        bytemuck::bytes_of(self)
73    }
74    pub fn from_bytes(bytes: &[u8]) -> Result<Self> {
75        let footer: Self = *bytemuck::from_bytes(bytes);
76        if footer.magic != *INDEX_MAGIC {
77            return Err(CbqError::InvalidIndexFooterMagic.into());
78        }
79        Ok(footer)
80    }
81}
82
83/// An index of block ranges for quick lookups
84#[derive(Clone)]
85pub struct Index {
86    ranges: Vec<BlockRange>,
87}
88impl Index {
89    /// Builds the index from a list of block headers
90    #[must_use]
91    pub fn from_block_headers(block_headers: &[BlockHeader]) -> Self {
92        let mut offset = size_of::<FileHeader>() as u64;
93        let mut cumulative_records = 0;
94        let mut ranges = Vec::default();
95        for block_header in block_headers {
96            let range = BlockRange::new(offset, cumulative_records + block_header.num_records);
97            offset += (size_of::<BlockHeader>() + block_header.block_len()) as u64;
98            cumulative_records += block_header.num_records;
99            ranges.push(range);
100        }
101        Self { ranges }
102    }
103
104    /// Returns the byte representation of the index
105    #[must_use]
106    pub fn as_bytes(&self) -> &[u8] {
107        bytemuck::cast_slice(&self.ranges)
108    }
109
110    /// Builds the index from a byte slice
111    pub fn from_bytes(bytes: &[u8]) -> Result<Self> {
112        let ranges = match bytemuck::try_cast_slice(bytes) {
113            Ok(ranges) => ranges.to_vec(),
114            Err(_) => return Err(CbqError::IndexCastingError.into()),
115        };
116        Ok(Self { ranges })
117    }
118
119    /// Returns the size of the index in bytes
120    #[must_use]
121    pub fn size(&self) -> u64 {
122        self.as_bytes().len() as u64
123    }
124
125    /// Encodes the index into a ZSTD-compressed byte array
126    pub fn encoded(&self) -> Result<Vec<u8>> {
127        let mut encoded = Vec::default();
128        copy_encode(self.as_bytes(), &mut encoded, 0)?;
129        Ok(encoded)
130    }
131
132    /// Returns the number of records in the index
133    #[must_use]
134    pub fn num_records(&self) -> usize {
135        self.ranges
136            .last()
137            .map_or(0, |range| range.cumulative_records as usize)
138    }
139
140    /// Returns the number of blocks in the index
141    #[must_use]
142    pub fn num_blocks(&self) -> usize {
143        self.ranges.len()
144    }
145
146    #[must_use]
147    pub fn iter_blocks(&self) -> BlockIter<'_> {
148        BlockIter {
149            index: self,
150            pos: 0,
151        }
152    }
153
154    #[must_use]
155    pub fn average_block_size(&self) -> f64 {
156        let mut block_iter = self.iter_blocks();
157        let Some(mut last_block) = block_iter.next() else {
158            return 0.0;
159        };
160        let mut total_size = 0.0;
161        let mut count = 0;
162        for block in block_iter {
163            let last_block_size = block.offset - last_block.offset;
164            total_size += last_block_size as f64;
165            count += 1;
166            last_block = block;
167        }
168        total_size / f64::from(count)
169    }
170
171    pub fn pprint(&self) {
172        for block in self.iter_blocks() {
173            println!("{block:?}");
174        }
175    }
176}
177
178pub struct BlockIter<'a> {
179    index: &'a Index,
180    pos: usize,
181}
182impl Iterator for BlockIter<'_> {
183    type Item = BlockRange;
184
185    fn next(&mut self) -> Option<Self::Item> {
186        if self.pos >= self.index.num_blocks() {
187            None
188        } else {
189            let block = self.index.ranges[self.pos];
190            self.pos += 1;
191            Some(block)
192        }
193    }
194}
195
196/// A struct representing a block range in a CBQ file and stored in the [`Index`](crate::cbq::Index)
197///
198/// This is stored identically in memory and on disk.
199#[derive(Clone, Copy, Debug, PartialEq, Eq, Zeroable, Pod, Default)]
200#[repr(C)]
201pub struct BlockRange {
202    /// Byte offset of this block
203    pub(crate) offset: u64,
204
205    /// Number of records up to and including this block
206    pub(crate) cumulative_records: u64,
207}
208impl BlockRange {
209    #[must_use]
210    pub fn new(offset: u64, cumulative_records: u64) -> Self {
211        Self {
212            offset,
213            cumulative_records,
214        }
215    }
216}