Skip to main content

nodedb_fts/
block.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! 128-document posting block with compressed storage.
4//!
5//! Each `PostingBlock` holds up to 128 postings. Data is stored as:
6//! - Delta-encoded, bitpacked surrogate IDs (raw `u32` on disk; typed
7//!   `Surrogate` in memory)
8//! - Bitpacked term frequencies (u32)
9//! - SmallFloat fieldnorm bytes (1 byte per doc)
10//! - Per-doc position lists (variable length)
11//!
12//! This is the storage unit for BMW (Block-Max WAND): each block carries
13//! `block_max_tf` and `block_min_fieldnorm` for skip-level pruning.
14//!
15//! `Surrogate` is the global, monotonically-allocated row identity shared
16//! across every engine. The on-disk encoding stays raw `u32` for delta
17//! compression and SIMD unpack; in-memory blocks expose typed `Surrogate`
18//! values so cross-engine bitmap intersections stay type-safe.
19
20use nodedb_types::Surrogate;
21
22use crate::codec::{bitpack, delta, smallfloat};
23
24/// Maximum number of documents per posting block.
25pub const BLOCK_SIZE: usize = 128;
26
27/// A compact posting entry keyed on a `Surrogate` row identity.
28#[derive(Debug, Clone)]
29pub struct CompactPosting {
30    pub doc_id: Surrogate,
31    pub term_freq: u32,
32    pub fieldnorm: u8,
33    pub positions: Vec<u32>,
34}
35
36/// A block of up to 128 postings in compressed form.
37///
38/// Provides both in-memory access (via `decode`) and serialized byte
39/// representation (via `to_bytes` / `from_bytes`).
40#[derive(Debug, Clone)]
41pub struct PostingBlock {
42    /// Sorted surrogate IDs in this block.
43    pub doc_ids: Vec<Surrogate>,
44    /// Term frequencies, parallel to `doc_ids`.
45    pub term_freqs: Vec<u32>,
46    /// SmallFloat-encoded fieldnorms, parallel to `doc_ids`.
47    pub fieldnorms: Vec<u8>,
48    /// Per-document position lists, parallel to `doc_ids`.
49    pub positions: Vec<Vec<u32>>,
50}
51
52/// Block-level metadata for BMW skip index.
53#[derive(Debug, Clone, Copy)]
54pub struct BlockMeta {
55    /// Last (largest) surrogate ID in this block.
56    pub last_doc_id: Surrogate,
57    /// Number of documents in this block.
58    pub doc_count: u16,
59    /// Maximum term frequency in this block (for upper bound scoring).
60    pub block_max_tf: u32,
61    /// Minimum fieldnorm in this block (shortest doc — highest BM25 potential).
62    pub block_min_fieldnorm: u8,
63}
64
65impl PostingBlock {
66    /// Create a block from a slice of compact postings.
67    ///
68    /// Postings MUST be sorted by doc_id. Panics in debug if unsorted.
69    /// Truncates to `BLOCK_SIZE` if more are provided.
70    pub fn from_postings(postings: &[CompactPosting]) -> Self {
71        let n = postings.len().min(BLOCK_SIZE);
72        debug_assert!(
73            postings[..n].windows(2).all(|w| w[0].doc_id <= w[1].doc_id),
74            "postings must be sorted by surrogate"
75        );
76
77        Self {
78            doc_ids: postings[..n].iter().map(|p| p.doc_id).collect(),
79            term_freqs: postings[..n].iter().map(|p| p.term_freq).collect(),
80            fieldnorms: postings[..n].iter().map(|p| p.fieldnorm).collect(),
81            positions: postings[..n].iter().map(|p| p.positions.clone()).collect(),
82        }
83    }
84
85    /// Number of documents in this block.
86    pub fn len(&self) -> usize {
87        self.doc_ids.len()
88    }
89
90    /// Whether the block is empty.
91    pub fn is_empty(&self) -> bool {
92        self.doc_ids.is_empty()
93    }
94
95    /// Compute block metadata for BMW skip index.
96    pub fn meta(&self) -> BlockMeta {
97        BlockMeta {
98            last_doc_id: self.doc_ids.last().copied().unwrap_or(Surrogate::ZERO),
99            doc_count: self.doc_ids.len() as u16,
100            block_max_tf: self.term_freqs.iter().copied().max().unwrap_or(0),
101            block_min_fieldnorm: self.fieldnorms.iter().copied().min().unwrap_or(0),
102        }
103    }
104
105    /// Serialize the block to bytes.
106    ///
107    /// Layout:
108    /// ```text
109    /// [doc_count: u16 LE]
110    /// [packed_doc_id_deltas: len u32 LE + bytes]
111    /// [packed_term_freqs: len u32 LE + bytes]
112    /// [fieldnorms: doc_count bytes]
113    /// [position_data: for each doc, count u16 LE + packed positions]
114    /// ```
115    pub fn to_bytes(&self) -> Vec<u8> {
116        let mut buf = Vec::new();
117
118        // Doc count.
119        buf.extend_from_slice(&(self.doc_ids.len() as u16).to_le_bytes());
120
121        // Delta-encoded, bitpacked surrogate IDs (raw u32 on disk).
122        let raw_ids: Vec<u32> = self.doc_ids.iter().map(|s| s.0).collect();
123        let deltas = delta::encode(&raw_ids);
124        let packed_ids = bitpack::pack(&deltas);
125        buf.extend_from_slice(&(packed_ids.len() as u32).to_le_bytes());
126        buf.extend_from_slice(&packed_ids);
127
128        // Bitpacked term frequencies.
129        let packed_freqs = bitpack::pack(&self.term_freqs);
130        buf.extend_from_slice(&(packed_freqs.len() as u32).to_le_bytes());
131        buf.extend_from_slice(&packed_freqs);
132
133        // Fieldnorms (raw bytes, 1 per doc).
134        buf.extend_from_slice(&self.fieldnorms);
135
136        // Position data: for each doc, [count: u16 LE][packed positions].
137        for positions in &self.positions {
138            buf.extend_from_slice(&(positions.len() as u16).to_le_bytes());
139            if !positions.is_empty() {
140                let packed_pos = bitpack::pack(positions);
141                buf.extend_from_slice(&(packed_pos.len() as u16).to_le_bytes());
142                buf.extend_from_slice(&packed_pos);
143            } else {
144                buf.extend_from_slice(&0u16.to_le_bytes());
145            }
146        }
147
148        buf
149    }
150
151    /// Deserialize a block from bytes. Returns `None` if malformed.
152    pub fn from_bytes(buf: &[u8]) -> Option<Self> {
153        let mut pos = 0;
154
155        // Doc count.
156        if buf.len() < 2 {
157            return None;
158        }
159        let doc_count = u16::from_le_bytes([buf[pos], buf[pos + 1]]) as usize;
160        pos += 2;
161
162        if doc_count == 0 {
163            return Some(Self {
164                doc_ids: Vec::new(),
165                term_freqs: Vec::new(),
166                fieldnorms: Vec::new(),
167                positions: Vec::new(),
168            });
169        }
170
171        // Packed doc ID deltas.
172        if pos + 4 > buf.len() {
173            return None;
174        }
175        let ids_len =
176            u32::from_le_bytes([buf[pos], buf[pos + 1], buf[pos + 2], buf[pos + 3]]) as usize;
177        pos += 4;
178        if pos + ids_len > buf.len() {
179            return None;
180        }
181        let packed_deltas = &buf[pos..pos + ids_len];
182        let deltas = bitpack::unpack(packed_deltas);
183        let raw_ids = delta::decode(&deltas);
184        let doc_ids: Vec<Surrogate> = raw_ids.into_iter().map(Surrogate).collect();
185        pos += ids_len;
186
187        // Packed term frequencies.
188        if pos + 4 > buf.len() {
189            return None;
190        }
191        let freqs_len =
192            u32::from_le_bytes([buf[pos], buf[pos + 1], buf[pos + 2], buf[pos + 3]]) as usize;
193        pos += 4;
194        if pos + freqs_len > buf.len() {
195            return None;
196        }
197        let term_freqs = bitpack::unpack(&buf[pos..pos + freqs_len]);
198        pos += freqs_len;
199
200        // Fieldnorms.
201        if pos + doc_count > buf.len() {
202            return None;
203        }
204        let fieldnorms = buf[pos..pos + doc_count].to_vec();
205        pos += doc_count;
206
207        // Position data.
208        // no-governor: cold block decode; doc_count from block header (≤ 128), fixed-tiny in practice
209        let mut positions = Vec::with_capacity(doc_count);
210        for _ in 0..doc_count {
211            if pos + 2 > buf.len() {
212                return None;
213            }
214            let num_positions = u16::from_le_bytes([buf[pos], buf[pos + 1]]) as usize;
215            pos += 2;
216
217            if num_positions == 0 {
218                if pos + 2 > buf.len() {
219                    return None;
220                }
221                pos += 2; // Skip packed_len (0).
222                positions.push(Vec::new());
223            } else {
224                if pos + 2 > buf.len() {
225                    return None;
226                }
227                let packed_pos_len = u16::from_le_bytes([buf[pos], buf[pos + 1]]) as usize;
228                pos += 2;
229                if pos + packed_pos_len > buf.len() {
230                    return None;
231                }
232                let doc_positions = bitpack::unpack(&buf[pos..pos + packed_pos_len]);
233                pos += packed_pos_len;
234                positions.push(doc_positions);
235            }
236        }
237
238        if doc_ids.len() != doc_count || term_freqs.len() != doc_count {
239            return None;
240        }
241
242        Some(Self {
243            doc_ids,
244            term_freqs,
245            fieldnorms,
246            positions,
247        })
248    }
249}
250
251/// Split a list of compact postings into 128-doc blocks.
252pub fn into_blocks(mut postings: Vec<CompactPosting>) -> Vec<PostingBlock> {
253    postings.sort_by_key(|p| p.doc_id);
254    postings
255        .chunks(BLOCK_SIZE)
256        .map(PostingBlock::from_postings)
257        .collect()
258}
259
260/// Encode a document length as a SmallFloat fieldnorm byte.
261pub fn encode_fieldnorm(doc_length: u32) -> u8 {
262    smallfloat::encode(doc_length)
263}
264
265/// Decode a SmallFloat fieldnorm byte back to approximate document length.
266pub fn decode_fieldnorm(byte: u8) -> u32 {
267    smallfloat::decode(byte)
268}
269
270#[cfg(test)]
271mod tests {
272    use super::*;
273
274    fn make_postings(n: usize) -> Vec<CompactPosting> {
275        (0..n)
276            .map(|i| CompactPosting {
277                doc_id: Surrogate((i * 3) as u32),
278                term_freq: (i % 5 + 1) as u32,
279                fieldnorm: smallfloat::encode((i * 10 + 20) as u32),
280                positions: vec![i as u32, (i + 5) as u32],
281            })
282            .collect()
283    }
284
285    #[test]
286    fn block_roundtrip() {
287        let postings = make_postings(50);
288        let block = PostingBlock::from_postings(&postings);
289        assert_eq!(block.len(), 50);
290
291        let bytes = block.to_bytes();
292        let restored = PostingBlock::from_bytes(&bytes).unwrap();
293
294        assert_eq!(restored.doc_ids, block.doc_ids);
295        assert_eq!(restored.term_freqs, block.term_freqs);
296        assert_eq!(restored.fieldnorms, block.fieldnorms);
297        assert_eq!(restored.positions, block.positions);
298    }
299
300    #[test]
301    fn block_128_roundtrip() {
302        let postings = make_postings(128);
303        let block = PostingBlock::from_postings(&postings);
304        assert_eq!(block.len(), 128);
305
306        let bytes = block.to_bytes();
307        let restored = PostingBlock::from_bytes(&bytes).unwrap();
308        assert_eq!(restored.doc_ids, block.doc_ids);
309    }
310
311    #[test]
312    fn block_meta() {
313        let postings = make_postings(10);
314        let block = PostingBlock::from_postings(&postings);
315        let meta = block.meta();
316
317        assert_eq!(meta.doc_count, 10);
318        assert_eq!(meta.last_doc_id, Surrogate(27)); // (9 * 3)
319        assert_eq!(meta.block_max_tf, 5); // max of (i%5+1) for i=0..9
320    }
321
322    #[test]
323    fn into_blocks_splits() {
324        let postings = make_postings(300);
325        let blocks = into_blocks(postings);
326        assert_eq!(blocks.len(), 3); // 128 + 128 + 44
327        assert_eq!(blocks[0].len(), 128);
328        assert_eq!(blocks[1].len(), 128);
329        assert_eq!(blocks[2].len(), 44);
330    }
331
332    #[test]
333    fn empty_block() {
334        let block = PostingBlock::from_postings(&[]);
335        assert!(block.is_empty());
336        let bytes = block.to_bytes();
337        let restored = PostingBlock::from_bytes(&bytes).unwrap();
338        assert!(restored.is_empty());
339    }
340
341    #[test]
342    fn fieldnorm_encode_decode() {
343        for len in [0, 1, 5, 8, 10, 20, 50, 100, 500, 1000, 10_000] {
344            let encoded = encode_fieldnorm(len);
345            let decoded = decode_fieldnorm(encoded);
346            if len <= 8 {
347                assert_eq!(decoded, len, "exact for small: len={len}");
348            } else {
349                assert!(decoded <= len, "decoded {decoded} > original {len}");
350                let error = (len - decoded) as f64 / len as f64;
351                assert!(
352                    error < 0.25,
353                    "len={len}, decoded={decoded}, error={error:.4}"
354                );
355            }
356        }
357    }
358
359    #[test]
360    fn compression_ratio() {
361        // 128 postings with small deltas should compress well.
362        let postings: Vec<CompactPosting> = (0..128)
363            .map(|i| CompactPosting {
364                doc_id: Surrogate(i),
365                term_freq: 1,
366                fieldnorm: smallfloat::encode(100),
367                positions: vec![0],
368            })
369            .collect();
370        let block = PostingBlock::from_postings(&postings);
371        let bytes = block.to_bytes();
372
373        // Uncompressed: 128 * (4 + 4 + 1 + ~8) = ~2176 bytes.
374        // Compressed should be significantly smaller.
375        // Position data is the bulk: 128 docs × (2 pos_count + 2 packed_len + 3 header + 1 data).
376        // The IDs and freqs should be very compact (1-bit deltas, 1-bit freqs).
377        // Total should be well under uncompressed 128*(4+4+1+8) = 2176 bytes.
378        assert!(
379            bytes.len() < 1500,
380            "compressed block should be < 1500 bytes, got {}",
381            bytes.len()
382        );
383    }
384}