1use nodedb_types::Surrogate;
21
22use crate::codec::{bitpack, delta, smallfloat};
23
24pub const BLOCK_SIZE: usize = 128;
26
27#[derive(Debug, Clone)]
29pub struct CompactPosting {
30 pub doc_id: Surrogate,
31 pub term_freq: u32,
32 pub fieldnorm: u8,
33 pub positions: Vec<u32>,
34}
35
36#[derive(Debug, Clone)]
41pub struct PostingBlock {
42 pub doc_ids: Vec<Surrogate>,
44 pub term_freqs: Vec<u32>,
46 pub fieldnorms: Vec<u8>,
48 pub positions: Vec<Vec<u32>>,
50}
51
52#[derive(Debug, Clone, Copy)]
54pub struct BlockMeta {
55 pub last_doc_id: Surrogate,
57 pub doc_count: u16,
59 pub block_max_tf: u32,
61 pub block_min_fieldnorm: u8,
63}
64
65impl PostingBlock {
66 pub fn from_postings(postings: &[CompactPosting]) -> Self {
71 let n = postings.len().min(BLOCK_SIZE);
72 debug_assert!(
73 postings[..n].windows(2).all(|w| w[0].doc_id <= w[1].doc_id),
74 "postings must be sorted by surrogate"
75 );
76
77 Self {
78 doc_ids: postings[..n].iter().map(|p| p.doc_id).collect(),
79 term_freqs: postings[..n].iter().map(|p| p.term_freq).collect(),
80 fieldnorms: postings[..n].iter().map(|p| p.fieldnorm).collect(),
81 positions: postings[..n].iter().map(|p| p.positions.clone()).collect(),
82 }
83 }
84
85 pub fn len(&self) -> usize {
87 self.doc_ids.len()
88 }
89
90 pub fn is_empty(&self) -> bool {
92 self.doc_ids.is_empty()
93 }
94
95 pub fn meta(&self) -> BlockMeta {
97 BlockMeta {
98 last_doc_id: self.doc_ids.last().copied().unwrap_or(Surrogate::ZERO),
99 doc_count: self.doc_ids.len() as u16,
100 block_max_tf: self.term_freqs.iter().copied().max().unwrap_or(0),
101 block_min_fieldnorm: self.fieldnorms.iter().copied().min().unwrap_or(0),
102 }
103 }
104
105 pub fn to_bytes(&self) -> Vec<u8> {
116 let mut buf = Vec::new();
117
118 buf.extend_from_slice(&(self.doc_ids.len() as u16).to_le_bytes());
120
121 let raw_ids: Vec<u32> = self.doc_ids.iter().map(|s| s.0).collect();
123 let deltas = delta::encode(&raw_ids);
124 let packed_ids = bitpack::pack(&deltas);
125 buf.extend_from_slice(&(packed_ids.len() as u32).to_le_bytes());
126 buf.extend_from_slice(&packed_ids);
127
128 let packed_freqs = bitpack::pack(&self.term_freqs);
130 buf.extend_from_slice(&(packed_freqs.len() as u32).to_le_bytes());
131 buf.extend_from_slice(&packed_freqs);
132
133 buf.extend_from_slice(&self.fieldnorms);
135
136 for positions in &self.positions {
138 buf.extend_from_slice(&(positions.len() as u16).to_le_bytes());
139 if !positions.is_empty() {
140 let packed_pos = bitpack::pack(positions);
141 buf.extend_from_slice(&(packed_pos.len() as u16).to_le_bytes());
142 buf.extend_from_slice(&packed_pos);
143 } else {
144 buf.extend_from_slice(&0u16.to_le_bytes());
145 }
146 }
147
148 buf
149 }
150
151 pub fn from_bytes(buf: &[u8]) -> Option<Self> {
153 let mut pos = 0;
154
155 if buf.len() < 2 {
157 return None;
158 }
159 let doc_count = u16::from_le_bytes([buf[pos], buf[pos + 1]]) as usize;
160 pos += 2;
161
162 if doc_count == 0 {
163 return Some(Self {
164 doc_ids: Vec::new(),
165 term_freqs: Vec::new(),
166 fieldnorms: Vec::new(),
167 positions: Vec::new(),
168 });
169 }
170
171 if pos + 4 > buf.len() {
173 return None;
174 }
175 let ids_len =
176 u32::from_le_bytes([buf[pos], buf[pos + 1], buf[pos + 2], buf[pos + 3]]) as usize;
177 pos += 4;
178 if pos + ids_len > buf.len() {
179 return None;
180 }
181 let packed_deltas = &buf[pos..pos + ids_len];
182 let deltas = bitpack::unpack(packed_deltas);
183 let raw_ids = delta::decode(&deltas);
184 let doc_ids: Vec<Surrogate> = raw_ids.into_iter().map(Surrogate).collect();
185 pos += ids_len;
186
187 if pos + 4 > buf.len() {
189 return None;
190 }
191 let freqs_len =
192 u32::from_le_bytes([buf[pos], buf[pos + 1], buf[pos + 2], buf[pos + 3]]) as usize;
193 pos += 4;
194 if pos + freqs_len > buf.len() {
195 return None;
196 }
197 let term_freqs = bitpack::unpack(&buf[pos..pos + freqs_len]);
198 pos += freqs_len;
199
200 if pos + doc_count > buf.len() {
202 return None;
203 }
204 let fieldnorms = buf[pos..pos + doc_count].to_vec();
205 pos += doc_count;
206
207 let mut positions = Vec::with_capacity(doc_count);
210 for _ in 0..doc_count {
211 if pos + 2 > buf.len() {
212 return None;
213 }
214 let num_positions = u16::from_le_bytes([buf[pos], buf[pos + 1]]) as usize;
215 pos += 2;
216
217 if num_positions == 0 {
218 if pos + 2 > buf.len() {
219 return None;
220 }
221 pos += 2; positions.push(Vec::new());
223 } else {
224 if pos + 2 > buf.len() {
225 return None;
226 }
227 let packed_pos_len = u16::from_le_bytes([buf[pos], buf[pos + 1]]) as usize;
228 pos += 2;
229 if pos + packed_pos_len > buf.len() {
230 return None;
231 }
232 let doc_positions = bitpack::unpack(&buf[pos..pos + packed_pos_len]);
233 pos += packed_pos_len;
234 positions.push(doc_positions);
235 }
236 }
237
238 if doc_ids.len() != doc_count || term_freqs.len() != doc_count {
239 return None;
240 }
241
242 Some(Self {
243 doc_ids,
244 term_freqs,
245 fieldnorms,
246 positions,
247 })
248 }
249}
250
251pub fn into_blocks(mut postings: Vec<CompactPosting>) -> Vec<PostingBlock> {
253 postings.sort_by_key(|p| p.doc_id);
254 postings
255 .chunks(BLOCK_SIZE)
256 .map(PostingBlock::from_postings)
257 .collect()
258}
259
260pub fn encode_fieldnorm(doc_length: u32) -> u8 {
262 smallfloat::encode(doc_length)
263}
264
265pub fn decode_fieldnorm(byte: u8) -> u32 {
267 smallfloat::decode(byte)
268}
269
270#[cfg(test)]
271mod tests {
272 use super::*;
273
274 fn make_postings(n: usize) -> Vec<CompactPosting> {
275 (0..n)
276 .map(|i| CompactPosting {
277 doc_id: Surrogate((i * 3) as u32),
278 term_freq: (i % 5 + 1) as u32,
279 fieldnorm: smallfloat::encode((i * 10 + 20) as u32),
280 positions: vec![i as u32, (i + 5) as u32],
281 })
282 .collect()
283 }
284
285 #[test]
286 fn block_roundtrip() {
287 let postings = make_postings(50);
288 let block = PostingBlock::from_postings(&postings);
289 assert_eq!(block.len(), 50);
290
291 let bytes = block.to_bytes();
292 let restored = PostingBlock::from_bytes(&bytes).unwrap();
293
294 assert_eq!(restored.doc_ids, block.doc_ids);
295 assert_eq!(restored.term_freqs, block.term_freqs);
296 assert_eq!(restored.fieldnorms, block.fieldnorms);
297 assert_eq!(restored.positions, block.positions);
298 }
299
300 #[test]
301 fn block_128_roundtrip() {
302 let postings = make_postings(128);
303 let block = PostingBlock::from_postings(&postings);
304 assert_eq!(block.len(), 128);
305
306 let bytes = block.to_bytes();
307 let restored = PostingBlock::from_bytes(&bytes).unwrap();
308 assert_eq!(restored.doc_ids, block.doc_ids);
309 }
310
311 #[test]
312 fn block_meta() {
313 let postings = make_postings(10);
314 let block = PostingBlock::from_postings(&postings);
315 let meta = block.meta();
316
317 assert_eq!(meta.doc_count, 10);
318 assert_eq!(meta.last_doc_id, Surrogate(27)); assert_eq!(meta.block_max_tf, 5); }
321
322 #[test]
323 fn into_blocks_splits() {
324 let postings = make_postings(300);
325 let blocks = into_blocks(postings);
326 assert_eq!(blocks.len(), 3); assert_eq!(blocks[0].len(), 128);
328 assert_eq!(blocks[1].len(), 128);
329 assert_eq!(blocks[2].len(), 44);
330 }
331
332 #[test]
333 fn empty_block() {
334 let block = PostingBlock::from_postings(&[]);
335 assert!(block.is_empty());
336 let bytes = block.to_bytes();
337 let restored = PostingBlock::from_bytes(&bytes).unwrap();
338 assert!(restored.is_empty());
339 }
340
341 #[test]
342 fn fieldnorm_encode_decode() {
343 for len in [0, 1, 5, 8, 10, 20, 50, 100, 500, 1000, 10_000] {
344 let encoded = encode_fieldnorm(len);
345 let decoded = decode_fieldnorm(encoded);
346 if len <= 8 {
347 assert_eq!(decoded, len, "exact for small: len={len}");
348 } else {
349 assert!(decoded <= len, "decoded {decoded} > original {len}");
350 let error = (len - decoded) as f64 / len as f64;
351 assert!(
352 error < 0.25,
353 "len={len}, decoded={decoded}, error={error:.4}"
354 );
355 }
356 }
357 }
358
359 #[test]
360 fn compression_ratio() {
361 let postings: Vec<CompactPosting> = (0..128)
363 .map(|i| CompactPosting {
364 doc_id: Surrogate(i),
365 term_freq: 1,
366 fieldnorm: smallfloat::encode(100),
367 positions: vec![0],
368 })
369 .collect();
370 let block = PostingBlock::from_postings(&postings);
371 let bytes = block.to_bytes();
372
373 assert!(
379 bytes.len() < 1500,
380 "compressed block should be < 1500 bytes, got {}",
381 bytes.len()
382 );
383 }
384}