use byteorder::{LittleEndian, WriteBytesExt};
use std::io::{self, Read, Write};

use super::posting_common::{read_vint, write_vint};
use crate::DocId;
use crate::directories::OwnedBytes;
use crate::structures::simd;

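/// A single posting: a document id plus the term's frequency in that document.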
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Posting {
    pub doc_id: DocId,
    pub term_freq: u32,
}

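/// An in-memory posting list, kept sorted by ascending doc id.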
#[derive(Debug, Clone, Default)]
pub struct PostingList {
    postings: Vec<Posting>,
}

impl PostingList {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            postings: Vec::with_capacity(capacity),
        }
    }

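    /// Appends a posting. Doc ids must be strictly increasing; this is
    /// checked with a `debug_assert!` in debug builds.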
    pub fn push(&mut self, doc_id: DocId, term_freq: u32) {
        debug_assert!(
            self.postings.is_empty() || self.postings.last().unwrap().doc_id < doc_id,
            "Postings must be added in sorted order"
        );
        self.postings.push(Posting { doc_id, term_freq });
    }

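    /// Like `push`, but if `doc_id` matches the last posting the term
    /// frequencies are accumulated instead of appending a duplicate entry.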
    pub fn add(&mut self, doc_id: DocId, term_freq: u32) {
        if let Some(last) = self.postings.last_mut()
            && last.doc_id == doc_id
        {
            last.term_freq += term_freq;
            return;
        }
        self.postings.push(Posting { doc_id, term_freq });
    }

    pub fn doc_count(&self) -> u32 {
        self.postings.len() as u32
    }

    pub fn len(&self) -> usize {
        self.postings.len()
    }

    pub fn is_empty(&self) -> bool {
        self.postings.is_empty()
    }

    pub fn iter(&self) -> impl Iterator<Item = &Posting> {
        self.postings.iter()
    }

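    /// Wire format: a vint posting count, then one (doc-id delta, term freq)
    /// vint pair per posting, with deltas taken against the previous doc id.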
    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
        write_vint(writer, self.postings.len() as u64)?;

        let mut prev_doc_id = 0u32;
        for posting in &self.postings {
            let delta = posting.doc_id - prev_doc_id;
            write_vint(writer, delta as u64)?;
            write_vint(writer, posting.term_freq as u64)?;
            prev_doc_id = posting.doc_id;
        }

        Ok(())
    }

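    /// Inverse of `serialize`: reads the count, then each (delta, tf) pair.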
    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
        let count = read_vint(reader)? as usize;
        let mut postings = Vec::with_capacity(count);

        let mut prev_doc_id = 0u32;
        for _ in 0..count {
            let delta = read_vint(reader)? as u32;
            let term_freq = read_vint(reader)? as u32;
            let doc_id = prev_doc_id + delta;
            postings.push(Posting { doc_id, term_freq });
            prev_doc_id = doc_id;
        }

        Ok(Self { postings })
    }
}

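/// Cursor over a borrowed `PostingList` with the usual `doc` / `advance` /
/// `seek` interface used by query evaluation.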
pub struct PostingListIterator<'a> {
    postings: &'a [Posting],
    position: usize,
}

impl<'a> PostingListIterator<'a> {
    pub fn new(posting_list: &'a PostingList) -> Self {
        Self {
            postings: &posting_list.postings,
            position: 0,
        }
    }

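    /// Current doc id, or `TERMINATED` once the cursor is exhausted.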
    pub fn doc(&self) -> DocId {
        if self.position < self.postings.len() {
            self.postings[self.position].doc_id
        } else {
            TERMINATED
        }
    }

    pub fn term_freq(&self) -> u32 {
        if self.position < self.postings.len() {
            self.postings[self.position].term_freq
        } else {
            0
        }
    }

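    /// Moves to the next posting and returns its doc id (or `TERMINATED`).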
    pub fn advance(&mut self) -> DocId {
        // Clamp so repeated calls past the end keep `position` in bounds for
        // the slice taken in `seek`.
        self.position = (self.position + 1).min(self.postings.len());
        self.doc()
    }

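    /// Advances to the first posting with a doc id >= `target` via binary
    /// search and returns it (or `TERMINATED` if none exists).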
    pub fn seek(&mut self, target: DocId) -> DocId {
        let remaining = &self.postings[self.position..];
        let offset = remaining.partition_point(|p| p.doc_id < target);
        self.position += offset;
        self.doc()
    }

    pub fn size_hint(&self) -> usize {
        self.postings.len().saturating_sub(self.position)
    }
}

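/// Sentinel doc id returned by iterators once they are exhausted.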
pub const TERMINATED: DocId = DocId::MAX;

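/// Maximum number of postings stored in one compressed block.
///
/// Each block is encoded as an 8-byte header (posting count as u16, first
/// doc id as u32, then one u8 bit width each for doc-id deltas and term
/// frequencies) followed by the bit-packed deltas for postings 1..count and
/// the bit-packed term frequencies for all postings in the block.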
pub const BLOCK_SIZE: usize = 128;

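/// Number of L0 (per-block) entries summarized by one L1 skip entry.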
const L1_INTERVAL: usize = 8;

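/// Bytes per L0 entry: first_doc (u32), last_doc (u32), stream offset (u32),
/// and max weight (f32).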
const L0_SIZE: usize = 16;

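/// Bytes per L1 entry: the last doc id (u32) of a group of `L1_INTERVAL` blocks.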
const L1_SIZE: usize = 4;

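/// Footer bytes: stream length (u64) plus L0 count, L1 count, doc count, and
/// max term frequency (u32 each).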
const FOOTER_SIZE: usize = 24;

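/// Decodes the L0 entry at `idx` as (first_doc, last_doc, offset, max_weight).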
#[inline]
fn read_l0(bytes: &[u8], idx: usize) -> (u32, u32, u32, f32) {
    let b = &bytes[idx * L0_SIZE..][..L0_SIZE];
    let first_doc = u32::from_le_bytes([b[0], b[1], b[2], b[3]]);
    let last_doc = u32::from_le_bytes([b[4], b[5], b[6], b[7]]);
    let offset = u32::from_le_bytes([b[8], b[9], b[10], b[11]]);
    let max_weight = f32::from_le_bytes([b[12], b[13], b[14], b[15]]);
    (first_doc, last_doc, offset, max_weight)
}

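/// Appends one L0 entry in the little-endian layout expected by `read_l0`.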
#[inline]
fn write_l0(buf: &mut Vec<u8>, first_doc: u32, last_doc: u32, offset: u32, max_weight: f32) {
    buf.extend_from_slice(&first_doc.to_le_bytes());
    buf.extend_from_slice(&last_doc.to_le_bytes());
    buf.extend_from_slice(&offset.to_le_bytes());
    buf.extend_from_slice(&max_weight.to_le_bytes());
}

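/// Returns the full byte length (header plus packed payload) of the block
/// starting at `pos`, computed from the counts and bit widths in its header.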
#[inline]
fn block_data_size(stream: &[u8], pos: usize) -> usize {
    let count = u16::from_le_bytes(stream[pos..pos + 2].try_into().unwrap()) as usize;
    let doc_rounded = simd::RoundedBitWidth::from_u8(stream[pos + 6]);
    let tf_rounded = simd::RoundedBitWidth::from_u8(stream[pos + 7]);
    let delta_bytes = if count > 1 {
        (count - 1) * doc_rounded.bytes_per_value()
    } else {
        0
    };
    8 + delta_bytes + count * tf_rounded.bytes_per_value()
}

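/// A block-compressed posting list with a two-level skip index: one L0 entry
/// per block (doc range, stream offset, block max weight) and one L1 entry
/// per `L1_INTERVAL` blocks (the group's last doc id) for coarser seeks.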
#[derive(Debug, Clone)]
pub struct BlockPostingList {
    /// Concatenated encoded blocks.
    stream: OwnedBytes,
    /// Packed L0 entries, one per block.
    l0_bytes: OwnedBytes,
    /// Number of blocks (= number of L0 entries).
    l0_count: usize,
    /// Last doc id of each group of `L1_INTERVAL` blocks.
    l1_docs: Vec<u32>,
    /// Total number of postings.
    doc_count: u32,
    /// Maximum term frequency across all postings.
    max_tf: u32,
}

impl BlockPostingList {
    #[inline]
    fn read_l0_entry(&self, idx: usize) -> (u32, u32, u32, f32) {
        read_l0(&self.l0_bytes, idx)
    }

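    /// Builds the compressed form: postings are split into blocks of up to
    /// `BLOCK_SIZE`, doc ids are delta-encoded and bit-packed, term
    /// frequencies are bit-packed directly, and the L0/L1 skip entries are
    /// built alongside the stream.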
    pub fn from_posting_list(list: &PostingList) -> io::Result<Self> {
        let mut stream: Vec<u8> = Vec::new();
        let mut l0_buf: Vec<u8> = Vec::new();
        let mut l1_docs: Vec<u32> = Vec::new();
        let mut l0_count = 0usize;
        let mut max_tf = 0u32;

        let postings = &list.postings;
        let mut i = 0;

        let mut deltas = Vec::with_capacity(BLOCK_SIZE);
        let mut tf_buf = Vec::with_capacity(BLOCK_SIZE);

        while i < postings.len() {
            if stream.len() > u32::MAX as usize {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "posting list stream exceeds u32::MAX bytes",
                ));
            }
            let block_start = stream.len() as u32;
            let block_end = (i + BLOCK_SIZE).min(postings.len());
            let block = &postings[i..block_end];
            let count = block.len();

            let block_max_tf = block.iter().map(|p| p.term_freq).max().unwrap_or(0);
            max_tf = max_tf.max(block_max_tf);

            let base_doc_id = block.first().unwrap().doc_id;
            let last_doc_id = block.last().unwrap().doc_id;

            // Delta-encode doc ids within the block against the previous doc.
            deltas.clear();
            let mut prev = base_doc_id;
            for posting in block.iter().skip(1) {
                deltas.push(posting.doc_id - prev);
                prev = posting.doc_id;
            }
            let max_delta = deltas.iter().copied().max().unwrap_or(0);
            let doc_id_bits = simd::round_bit_width(simd::bits_needed(max_delta));

            tf_buf.clear();
            tf_buf.extend(block.iter().map(|p| p.term_freq));
            let tf_bits = simd::round_bit_width(simd::bits_needed(block_max_tf));

            // 8-byte block header: count (u16), base doc id (u32), and the
            // two packed bit widths (u8 each).
            stream.write_u16::<LittleEndian>(count as u16)?;
            stream.write_u32::<LittleEndian>(base_doc_id)?;
            stream.push(doc_id_bits);
            stream.push(tf_bits);

            if count > 1 {
                let rounded = simd::RoundedBitWidth::from_u8(doc_id_bits);
                let byte_count = (count - 1) * rounded.bytes_per_value();
                let start = stream.len();
                stream.resize(start + byte_count, 0);
                simd::pack_rounded(&deltas, rounded, &mut stream[start..]);
            }

            {
                let rounded = simd::RoundedBitWidth::from_u8(tf_bits);
                let byte_count = count * rounded.bytes_per_value();
                let start = stream.len();
                stream.resize(start + byte_count, 0);
                simd::pack_rounded(&tf_buf, rounded, &mut stream[start..]);
            }

            write_l0(
                &mut l0_buf,
                base_doc_id,
                last_doc_id,
                block_start,
                block_max_tf as f32,
            );
            l0_count += 1;

            // One L1 entry per completed group of L1_INTERVAL blocks.
            if l0_count.is_multiple_of(L1_INTERVAL) {
                l1_docs.push(last_doc_id);
            }

            i = block_end;
        }

        // Cover a trailing partial group with one last L1 entry.
        if !l0_count.is_multiple_of(L1_INTERVAL) && l0_count > 0 {
            let (_, last_doc, _, _) = read_l0(&l0_buf, l0_count - 1);
            l1_docs.push(last_doc);
        }

        Ok(Self {
            stream: OwnedBytes::new(stream),
            l0_bytes: OwnedBytes::new(l0_buf),
            l0_count,
            l1_docs,
            doc_count: postings.len() as u32,
            max_tf,
        })
    }

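    /// On-disk layout, in order: [block stream][L0 entries][L1 doc ids][footer],
    /// where the footer is stream length (u64), L0 count, L1 count, doc count,
    /// and max term frequency (u32 each). Writing the footer last lets readers
    /// locate every region starting from the end of the buffer.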
    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
        writer.write_all(&self.stream)?;
        writer.write_all(&self.l0_bytes)?;
        for &doc in &self.l1_docs {
            writer.write_u32::<LittleEndian>(doc)?;
        }

        writer.write_u64::<LittleEndian>(self.stream.len() as u64)?;
        writer.write_u32::<LittleEndian>(self.l0_count as u32)?;
        writer.write_u32::<LittleEndian>(self.l1_docs.len() as u32)?;
        writer.write_u32::<LittleEndian>(self.doc_count)?;
        writer.write_u32::<LittleEndian>(self.max_tf)?;

        Ok(())
    }

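    /// Parses the footer, then copies the stream, L0, and L1 regions into
    /// freshly allocated buffers.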
    pub fn deserialize(raw: &[u8]) -> io::Result<Self> {
        if raw.len() < FOOTER_SIZE {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "posting data too short",
            ));
        }

        let f = raw.len() - FOOTER_SIZE;
        let stream_len = u64::from_le_bytes(raw[f..f + 8].try_into().unwrap()) as usize;
        let l0_count = u32::from_le_bytes(raw[f + 8..f + 12].try_into().unwrap()) as usize;
        let l1_count = u32::from_le_bytes(raw[f + 12..f + 16].try_into().unwrap()) as usize;
        let doc_count = u32::from_le_bytes(raw[f + 16..f + 20].try_into().unwrap());
        let max_tf = u32::from_le_bytes(raw[f + 20..f + 24].try_into().unwrap());

        let l0_start = stream_len;
        let l0_end = l0_start + l0_count * L0_SIZE;
        let l1_start = l0_end;

        let l1_docs = Self::extract_l1_docs(&raw[l1_start..], l1_count);

        Ok(Self {
            stream: OwnedBytes::new(raw[..stream_len].to_vec()),
            l0_bytes: OwnedBytes::new(raw[l0_start..l0_end].to_vec()),
            l0_count,
            l1_docs,
            doc_count,
            max_tf,
        })
    }

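    /// Like `deserialize`, but slices the provided `OwnedBytes` so the stream
    /// and L0 regions share the underlying buffer instead of being copied.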
    pub fn deserialize_zero_copy(raw: OwnedBytes) -> io::Result<Self> {
        if raw.len() < FOOTER_SIZE {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "posting data too short",
            ));
        }

        let f = raw.len() - FOOTER_SIZE;
        let stream_len = u64::from_le_bytes(raw[f..f + 8].try_into().unwrap()) as usize;
        let l0_count = u32::from_le_bytes(raw[f + 8..f + 12].try_into().unwrap()) as usize;
        let l1_count = u32::from_le_bytes(raw[f + 12..f + 16].try_into().unwrap()) as usize;
        let doc_count = u32::from_le_bytes(raw[f + 16..f + 20].try_into().unwrap());
        let max_tf = u32::from_le_bytes(raw[f + 20..f + 24].try_into().unwrap());

        let l0_start = stream_len;
        let l0_end = l0_start + l0_count * L0_SIZE;
        let l1_start = l0_end;

        let l1_docs = Self::extract_l1_docs(&raw[l1_start..], l1_count);

        Ok(Self {
            stream: raw.slice(0..stream_len),
            l0_bytes: raw.slice(l0_start..l0_end),
            l0_count,
            l1_docs,
            doc_count,
            max_tf,
        })
    }

    fn extract_l1_docs(bytes: &[u8], count: usize) -> Vec<u32> {
        let mut docs = Vec::with_capacity(count);
        for i in 0..count {
            let p = i * L1_SIZE;
            docs.push(u32::from_le_bytes(bytes[p..p + 4].try_into().unwrap()));
        }
        docs
    }

    pub fn doc_count(&self) -> u32 {
        self.doc_count
    }

    pub fn max_tf(&self) -> u32 {
        self.max_tf
    }

    pub fn num_blocks(&self) -> usize {
        self.l0_count
    }

    pub fn block_max_tf(&self, block_idx: usize) -> Option<u32> {
        if block_idx >= self.l0_count {
            return None;
        }
        let (_, _, _, max_weight) = self.read_l0_entry(block_idx);
        Some(max_weight as u32)
    }

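    /// Concatenates sources whose doc-id ranges are disjoint once each
    /// source's `doc_offset` is applied: block payloads are copied verbatim,
    /// and only the per-block base doc id and the skip entries are rebased.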
    pub fn concatenate_blocks(sources: &[(BlockPostingList, u32)]) -> io::Result<Self> {
        let mut stream: Vec<u8> = Vec::new();
        let mut l0_buf: Vec<u8> = Vec::new();
        let mut l1_docs: Vec<u32> = Vec::new();
        let mut l0_count = 0usize;
        let mut total_docs = 0u32;
        let mut max_tf = 0u32;

        for (source, doc_offset) in sources {
            max_tf = max_tf.max(source.max_tf);
            for block_idx in 0..source.num_blocks() {
                let (first_doc, last_doc, offset, max_weight) = source.read_l0_entry(block_idx);
                let blk_size = block_data_size(&source.stream, offset as usize);
                let block_bytes = &source.stream[offset as usize..offset as usize + blk_size];

                let count = u16::from_le_bytes(block_bytes[0..2].try_into().unwrap());
                if stream.len() > u32::MAX as usize {
                    return Err(io::Error::new(
                        io::ErrorKind::InvalidData,
                        "posting list stream exceeds u32::MAX bytes during concatenation",
                    ));
                }
                let new_offset = stream.len() as u32;

                stream.write_u16::<LittleEndian>(count)?;
                stream.write_u32::<LittleEndian>(first_doc + doc_offset)?;
                stream.extend_from_slice(&block_bytes[6..]);

                let new_last = last_doc + doc_offset;
                write_l0(
                    &mut l0_buf,
                    first_doc + doc_offset,
                    new_last,
                    new_offset,
                    max_weight,
                );
                l0_count += 1;
                total_docs += count as u32;

                if l0_count.is_multiple_of(L1_INTERVAL) {
                    l1_docs.push(new_last);
                }
            }
        }

        if !l0_count.is_multiple_of(L1_INTERVAL) && l0_count > 0 {
            let (_, last_doc, _, _) = read_l0(&l0_buf, l0_count - 1);
            l1_docs.push(last_doc);
        }

        Ok(Self {
            stream: OwnedBytes::new(stream),
            l0_bytes: OwnedBytes::new(l0_buf),
            l0_count,
            l1_docs,
            doc_count: total_docs,
            max_tf,
        })
    }

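    /// Streaming variant of `concatenate_blocks`: sources are raw serialized
    /// posting lists and the merged output is written directly to `writer` in
    /// the standard serialized layout, so the merged list is never
    /// materialized in memory. Returns the merged doc count and the total
    /// number of bytes written (stream + L0 + L1 + footer).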
    pub fn concatenate_streaming<W: Write>(
        sources: &[(&[u8], u32)],
        writer: &mut W,
    ) -> io::Result<(u32, usize)> {
        struct SourceMeta {
            stream_len: usize,
            l0_count: usize,
        }

        // First pass: read each source's footer to size the merge.
        let mut metas: Vec<SourceMeta> = Vec::with_capacity(sources.len());
        let mut total_docs = 0u32;
        let mut merged_max_tf = 0u32;

        for (raw, _) in sources {
            if raw.len() < FOOTER_SIZE {
                continue;
            }
            let f = raw.len() - FOOTER_SIZE;
            let stream_len = u64::from_le_bytes(raw[f..f + 8].try_into().unwrap()) as usize;
            let l0_count = u32::from_le_bytes(raw[f + 8..f + 12].try_into().unwrap()) as usize;
            // The source's own L1 count (footer bytes 12..16) is not needed here.
            let doc_count = u32::from_le_bytes(raw[f + 16..f + 20].try_into().unwrap());
            let max_tf = u32::from_le_bytes(raw[f + 20..f + 24].try_into().unwrap());
            total_docs += doc_count;
            merged_max_tf = merged_max_tf.max(max_tf);
            metas.push(SourceMeta {
                stream_len,
                l0_count,
            });
        }

        // Second pass: copy blocks through, rebasing doc ids and rebuilding
        // the skip levels as we go.
        let mut out_l0: Vec<u8> = Vec::new();
        let mut out_l1_docs: Vec<u32> = Vec::new();
        let mut out_l0_count = 0usize;
        let mut stream_written = 0u64;
        let mut patch_buf = [0u8; 8];

        for (src_idx, meta) in metas.iter().enumerate() {
            let (raw, doc_offset) = &sources[src_idx];
            let l0_base = meta.stream_len;
            let src_stream = &raw[..meta.stream_len];

            for i in 0..meta.l0_count {
                let (first_doc, last_doc, offset, max_weight) = read_l0(&raw[l0_base..], i);

                let blk_size = block_data_size(src_stream, offset as usize);
                let block = &src_stream[offset as usize..offset as usize + blk_size];

                let new_last = last_doc + doc_offset;
                if stream_written > u32::MAX as u64 {
                    return Err(io::Error::new(
                        io::ErrorKind::InvalidData,
                        "posting list stream exceeds u32::MAX bytes during streaming merge",
                    ));
                }
                write_l0(
                    &mut out_l0,
                    first_doc + doc_offset,
                    new_last,
                    stream_written as u32,
                    max_weight,
                );
                out_l0_count += 1;

                if out_l0_count.is_multiple_of(L1_INTERVAL) {
                    out_l1_docs.push(new_last);
                }

                // Copy the 8-byte header, patching its base doc id in place;
                // the packed payload is offset-invariant.
                patch_buf.copy_from_slice(&block[0..8]);
                let blk_first = u32::from_le_bytes(patch_buf[2..6].try_into().unwrap());
                patch_buf[2..6].copy_from_slice(&(blk_first + doc_offset).to_le_bytes());
                writer.write_all(&patch_buf)?;
                writer.write_all(&block[8..])?;

                stream_written += blk_size as u64;
            }
        }

        if !out_l0_count.is_multiple_of(L1_INTERVAL) && out_l0_count > 0 {
            let (_, last_doc, _, _) = read_l0(&out_l0, out_l0_count - 1);
            out_l1_docs.push(last_doc);
        }

        // Append the rebuilt skip levels, then the footer.
        writer.write_all(&out_l0)?;
        for &doc in &out_l1_docs {
            writer.write_u32::<LittleEndian>(doc)?;
        }

        writer.write_u64::<LittleEndian>(stream_written)?;
        writer.write_u32::<LittleEndian>(out_l0_count as u32)?;
        writer.write_u32::<LittleEndian>(out_l1_docs.len() as u32)?;
        writer.write_u32::<LittleEndian>(total_docs)?;
        writer.write_u32::<LittleEndian>(merged_max_tf)?;

        let l1_bytes_len = out_l1_docs.len() * L1_SIZE;
        let total_bytes = stream_written as usize + out_l0.len() + l1_bytes_len + FOOTER_SIZE;
        Ok((total_docs, total_bytes))
    }

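    /// Decodes block `block_idx` into the given buffers (cleared first):
    /// deltas are unpacked and prefix-summed back into absolute doc ids, and
    /// term frequencies are unpacked as-is. Returns `false` if `block_idx`
    /// is out of range.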
    pub fn decode_block_into(
        &self,
        block_idx: usize,
        doc_ids: &mut Vec<u32>,
        tfs: &mut Vec<u32>,
    ) -> bool {
        if block_idx >= self.l0_count {
            return false;
        }

        let (_, _, offset, _) = self.read_l0_entry(block_idx);
        let pos = offset as usize;
        let blk_size = block_data_size(&self.stream, pos);
        let block_data = &self.stream[pos..pos + blk_size];

        let count = u16::from_le_bytes(block_data[0..2].try_into().unwrap()) as usize;
        let first_doc = u32::from_le_bytes(block_data[2..6].try_into().unwrap());
        let doc_id_bits = block_data[6];
        let tf_bits = block_data[7];

        // Doc ids: the first is stored absolute, the rest as packed deltas
        // that are prefix-summed back into absolute values.
        doc_ids.clear();
        doc_ids.resize(count, 0);
        doc_ids[0] = first_doc;

        let doc_rounded = simd::RoundedBitWidth::from_u8(doc_id_bits);
        let deltas_bytes = if count > 1 {
            (count - 1) * doc_rounded.bytes_per_value()
        } else {
            0
        };

        if count > 1 {
            simd::unpack_rounded(
                &block_data[8..8 + deltas_bytes],
                doc_rounded,
                &mut doc_ids[1..],
                count - 1,
            );
            for i in 1..count {
                doc_ids[i] += doc_ids[i - 1];
            }
        }

        // Term frequencies are packed for all `count` postings.
        tfs.clear();
        tfs.resize(count, 0);
        let tf_rounded = simd::RoundedBitWidth::from_u8(tf_bits);
        let tfs_start = 8 + deltas_bytes;
        simd::unpack_rounded(
            &block_data[tfs_start..tfs_start + count * tf_rounded.bytes_per_value()],
            tf_rounded,
            tfs,
            count,
        );

        true
    }

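    /// First doc id stored in block `block_idx`, if such a block exists.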
    #[inline]
    pub fn block_first_doc(&self, block_idx: usize) -> Option<DocId> {
        if block_idx >= self.l0_count {
            return None;
        }
        let (first_doc, _, _, _) = self.read_l0_entry(block_idx);
        Some(first_doc)
    }

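    /// Last doc id stored in block `block_idx`, if such a block exists.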
    #[inline]
    pub fn block_last_doc(&self, block_idx: usize) -> Option<DocId> {
        if block_idx >= self.l0_count {
            return None;
        }
        let (_, last_doc, _, _) = self.read_l0_entry(block_idx);
        Some(last_doc)
    }

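    /// Finds the first block at or after `from_block` whose last doc id is
    /// >= `target`, i.e. the only block that can contain `target`. The L1
    /// entries first narrow the search to a group of `L1_INTERVAL` blocks,
    /// then that group's L0 last-doc values are scanned.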
    pub fn seek_block(&self, target: DocId, from_block: usize) -> Option<usize> {
        if from_block >= self.l0_count {
            return None;
        }

        let from_l1 = from_block / L1_INTERVAL;

        // L1: find the first group whose last doc id can contain the target.
        if self.l1_docs.is_empty() {
            return None;
        }
        let l1_idx = from_l1 + simd::find_first_ge_u32(&self.l1_docs[from_l1..], target);
        if l1_idx >= self.l1_docs.len() {
            return None;
        }

        // L0: scan the group's last-doc values for the first block >= target.
        let start = (l1_idx * L1_INTERVAL).max(from_block);
        let end = ((l1_idx + 1) * L1_INTERVAL).min(self.l0_count);
        let count = end - start;

        let mut last_docs = [u32::MAX; L1_INTERVAL];
        for (j, idx) in (start..end).enumerate() {
            let (_, ld, _, _) = read_l0(&self.l0_bytes, idx);
            last_docs[j] = ld;
        }
        let within = simd::find_first_ge_u32(&last_docs[..count], target);
        let block_idx = start + within;

        if block_idx < self.l0_count {
            Some(block_idx)
        } else {
            None
        }
    }

    pub fn iterator(&self) -> BlockPostingIterator<'_> {
        BlockPostingIterator::new(self)
    }

    pub fn into_iterator(self) -> BlockPostingIterator<'static> {
        BlockPostingIterator::owned(self)
    }
}

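/// Iterator over a `BlockPostingList` that decodes one block at a time into
/// reusable scratch buffers. The list is held in a `Cow`, so the iterator can
/// either borrow it or own it (see `into_iterator`).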
pub struct BlockPostingIterator<'a> {
    block_list: std::borrow::Cow<'a, BlockPostingList>,
    current_block: usize,
    block_doc_ids: Vec<u32>,
    block_tfs: Vec<u32>,
    position_in_block: usize,
    exhausted: bool,
}

impl<'a> BlockPostingIterator<'a> {
    fn new(block_list: &'a BlockPostingList) -> Self {
        let exhausted = block_list.l0_count == 0;
        let mut iter = Self {
            block_list: std::borrow::Cow::Borrowed(block_list),
            current_block: 0,
            block_doc_ids: Vec::with_capacity(BLOCK_SIZE),
            block_tfs: Vec::with_capacity(BLOCK_SIZE),
            position_in_block: 0,
            exhausted,
        };
        if !iter.exhausted {
            iter.load_block(0);
        }
        iter
    }

    fn owned(block_list: BlockPostingList) -> BlockPostingIterator<'static> {
        let exhausted = block_list.l0_count == 0;
        let mut iter = BlockPostingIterator {
            block_list: std::borrow::Cow::Owned(block_list),
            current_block: 0,
            block_doc_ids: Vec::with_capacity(BLOCK_SIZE),
            block_tfs: Vec::with_capacity(BLOCK_SIZE),
            position_in_block: 0,
            exhausted,
        };
        if !iter.exhausted {
            iter.load_block(0);
        }
        iter
    }

    fn load_block(&mut self, block_idx: usize) {
        if block_idx >= self.block_list.l0_count {
            self.exhausted = true;
            return;
        }

        self.current_block = block_idx;
        self.position_in_block = 0;

        self.block_list
            .decode_block_into(block_idx, &mut self.block_doc_ids, &mut self.block_tfs);
    }

    pub fn doc(&self) -> DocId {
        if self.exhausted {
            TERMINATED
        } else if self.position_in_block < self.block_doc_ids.len() {
            self.block_doc_ids[self.position_in_block]
        } else {
            TERMINATED
        }
    }

    pub fn term_freq(&self) -> u32 {
        if self.exhausted || self.position_in_block >= self.block_tfs.len() {
            0
        } else {
            self.block_tfs[self.position_in_block]
        }
    }

    pub fn advance(&mut self) -> DocId {
        if self.exhausted {
            return TERMINATED;
        }

        self.position_in_block += 1;
        if self.position_in_block >= self.block_doc_ids.len() {
            self.load_block(self.current_block + 1);
        }
        self.doc()
    }

    pub fn seek(&mut self, target: DocId) -> DocId {
        if self.exhausted {
            return TERMINATED;
        }

        // Use the skip index to land on the block that may contain the
        // target, then search within the decoded block.
        let block_idx = match self.block_list.seek_block(target, self.current_block) {
            Some(idx) => idx,
            None => {
                self.exhausted = true;
                return TERMINATED;
            }
        };

        if block_idx != self.current_block {
            self.load_block(block_idx);
        }

        let remaining = &self.block_doc_ids[self.position_in_block..];
        let pos = simd::find_first_ge_u32(remaining, target);
        self.position_in_block += pos;

        if self.position_in_block >= self.block_doc_ids.len() {
            self.load_block(self.current_block + 1);
        }
        self.doc()
    }

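    /// Abandons the rest of the current block and jumps to the first doc of
    /// the next one; useful for block-level skipping strategies that consult
    /// `current_block_max_tf` first.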
    pub fn skip_to_next_block(&mut self) -> DocId {
        if self.exhausted {
            return TERMINATED;
        }
        self.load_block(self.current_block + 1);
        self.doc()
    }

    #[inline]
    pub fn current_block_idx(&self) -> usize {
        self.current_block
    }

    #[inline]
    pub fn num_blocks(&self) -> usize {
        self.block_list.l0_count
    }

    #[inline]
    pub fn current_block_max_tf(&self) -> u32 {
        if self.exhausted || self.current_block >= self.block_list.l0_count {
            0
        } else {
            let (_, _, _, max_weight) = self.block_list.read_l0_entry(self.current_block);
            max_weight as u32
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_posting_list_basic() {
        let mut list = PostingList::new();
        list.push(1, 2);
        list.push(5, 1);
        list.push(10, 3);

        assert_eq!(list.len(), 3);

        let mut iter = PostingListIterator::new(&list);
        assert_eq!(iter.doc(), 1);
        assert_eq!(iter.term_freq(), 2);

        assert_eq!(iter.advance(), 5);
        assert_eq!(iter.term_freq(), 1);

        assert_eq!(iter.advance(), 10);
        assert_eq!(iter.term_freq(), 3);

        assert_eq!(iter.advance(), TERMINATED);
    }

    #[test]
    fn test_posting_list_serialization() {
        let mut list = PostingList::new();
        for i in 0..100 {
            list.push(i * 3, (i % 5) + 1);
        }

        let mut buffer = Vec::new();
        list.serialize(&mut buffer).unwrap();

        let deserialized = PostingList::deserialize(&mut &buffer[..]).unwrap();
        assert_eq!(deserialized.len(), list.len());

        for (a, b) in list.iter().zip(deserialized.iter()) {
            assert_eq!(a, b);
        }
    }

    #[test]
    fn test_posting_list_seek() {
        let mut list = PostingList::new();
        for i in 0..100 {
            list.push(i * 2, 1);
        }

        let mut iter = PostingListIterator::new(&list);

        assert_eq!(iter.seek(50), 50);
        assert_eq!(iter.seek(51), 52);
        assert_eq!(iter.seek(200), TERMINATED);
    }

    #[test]
    fn test_block_posting_list() {
        let mut list = PostingList::new();
        for i in 0..500 {
            list.push(i * 2, (i % 10) + 1);
        }

        let block_list = BlockPostingList::from_posting_list(&list).unwrap();
        assert_eq!(block_list.doc_count(), 500);

        let mut iter = block_list.iterator();
        assert_eq!(iter.doc(), 0);
        assert_eq!(iter.term_freq(), 1);

        assert_eq!(iter.seek(500), 500);
        assert_eq!(iter.seek(998), 998);
        assert_eq!(iter.seek(1000), TERMINATED);
    }

    #[test]
    fn test_block_posting_list_serialization() {
        let mut list = PostingList::new();
        for i in 0..300 {
            list.push(i * 3, i + 1);
        }

        let block_list = BlockPostingList::from_posting_list(&list).unwrap();

        let mut buffer = Vec::new();
        block_list.serialize(&mut buffer).unwrap();

        let deserialized = BlockPostingList::deserialize(&buffer[..]).unwrap();
        assert_eq!(deserialized.doc_count(), block_list.doc_count());

        let mut iter1 = block_list.iterator();
        let mut iter2 = deserialized.iterator();

        while iter1.doc() != TERMINATED {
            assert_eq!(iter1.doc(), iter2.doc());
            assert_eq!(iter1.term_freq(), iter2.term_freq());
            iter1.advance();
            iter2.advance();
        }
        assert_eq!(iter2.doc(), TERMINATED);
    }

    fn collect_postings(bpl: &BlockPostingList) -> Vec<(u32, u32)> {
        let mut result = Vec::new();
        let mut it = bpl.iterator();
        while it.doc() != TERMINATED {
            result.push((it.doc(), it.term_freq()));
            it.advance();
        }
        result
    }

    fn build_bpl(postings: &[(u32, u32)]) -> BlockPostingList {
        let mut pl = PostingList::new();
        for &(doc_id, tf) in postings {
            pl.push(doc_id, tf);
        }
        BlockPostingList::from_posting_list(&pl).unwrap()
    }

    fn serialize_bpl(bpl: &BlockPostingList) -> Vec<u8> {
        let mut buf = Vec::new();
        bpl.serialize(&mut buf).unwrap();
        buf
    }

    #[test]
    fn test_concatenate_blocks_two_segments() {
        let a: Vec<(u32, u32)> = (0..100).map(|i| (i * 2, i + 1)).collect();
        let bpl_a = build_bpl(&a);

        let b: Vec<(u32, u32)> = (0..100).map(|i| (i * 3, i + 2)).collect();
        let bpl_b = build_bpl(&b);

        let merged =
            BlockPostingList::concatenate_blocks(&[(bpl_a.clone(), 0), (bpl_b.clone(), 200)])
                .unwrap();

        assert_eq!(merged.doc_count(), 200);

        let postings = collect_postings(&merged);
        assert_eq!(postings.len(), 200);

        for (i, p) in postings.iter().enumerate().take(100) {
            assert_eq!(*p, (i as u32 * 2, i as u32 + 1));
        }
        for i in 0..100 {
            assert_eq!(postings[100 + i], (i as u32 * 3 + 200, i as u32 + 2));
        }
    }

    #[test]
    fn test_concatenate_streaming_matches_blocks() {
        let seg_a: Vec<(u32, u32)> = (0..250).map(|i| (i * 2, (i % 7) + 1)).collect();
        let seg_b: Vec<(u32, u32)> = (0..180).map(|i| (i * 5, (i % 3) + 1)).collect();
        let seg_c: Vec<(u32, u32)> = (0..90).map(|i| (i * 10, (i % 11) + 1)).collect();

        let bpl_a = build_bpl(&seg_a);
        let bpl_b = build_bpl(&seg_b);
        let bpl_c = build_bpl(&seg_c);

        let offset_b = 1000u32;
        let offset_c = 2000u32;

        let ref_merged = BlockPostingList::concatenate_blocks(&[
            (bpl_a.clone(), 0),
            (bpl_b.clone(), offset_b),
            (bpl_c.clone(), offset_c),
        ])
        .unwrap();
        let mut ref_buf = Vec::new();
        ref_merged.serialize(&mut ref_buf).unwrap();

        let bytes_a = serialize_bpl(&bpl_a);
        let bytes_b = serialize_bpl(&bpl_b);
        let bytes_c = serialize_bpl(&bpl_c);

        let sources: Vec<(&[u8], u32)> =
            vec![(&bytes_a, 0), (&bytes_b, offset_b), (&bytes_c, offset_c)];
        let mut stream_buf = Vec::new();
        let (doc_count, bytes_written) =
            BlockPostingList::concatenate_streaming(&sources, &mut stream_buf).unwrap();

        assert_eq!(doc_count, 520);
        assert_eq!(bytes_written, stream_buf.len());

        let ref_postings = collect_postings(&BlockPostingList::deserialize(&ref_buf).unwrap());
        let stream_postings =
            collect_postings(&BlockPostingList::deserialize(&stream_buf).unwrap());

        assert_eq!(ref_postings.len(), stream_postings.len());
        for (i, (r, s)) in ref_postings.iter().zip(stream_postings.iter()).enumerate() {
            assert_eq!(r, s, "mismatch at posting {}", i);
        }
    }

    #[test]
    fn test_multi_round_merge() {
        let segments: Vec<Vec<(u32, u32)>> = (0..4)
            .map(|seg| (0..200).map(|i| (i * 3, (i + seg * 7) % 10 + 1)).collect())
            .collect();

        let bpls: Vec<BlockPostingList> = segments.iter().map(|s| build_bpl(s)).collect();
        let serialized: Vec<Vec<u8>> = bpls.iter().map(serialize_bpl).collect();

        let mut merged_01 = Vec::new();
        let sources_01: Vec<(&[u8], u32)> = vec![(&serialized[0], 0), (&serialized[1], 600)];
        let (dc_01, _) =
            BlockPostingList::concatenate_streaming(&sources_01, &mut merged_01).unwrap();
        assert_eq!(dc_01, 400);

        let mut merged_23 = Vec::new();
        let sources_23: Vec<(&[u8], u32)> = vec![(&serialized[2], 0), (&serialized[3], 600)];
        let (dc_23, _) =
            BlockPostingList::concatenate_streaming(&sources_23, &mut merged_23).unwrap();
        assert_eq!(dc_23, 400);

        let mut final_merged = Vec::new();
        let sources_final: Vec<(&[u8], u32)> = vec![(&merged_01, 0), (&merged_23, 1200)];
        let (dc_final, _) =
            BlockPostingList::concatenate_streaming(&sources_final, &mut final_merged).unwrap();
        assert_eq!(dc_final, 800);

        let final_bpl = BlockPostingList::deserialize(&final_merged).unwrap();
        let postings = collect_postings(&final_bpl);
        assert_eq!(postings.len(), 800);

        assert_eq!(postings[0].0, 0);
        assert_eq!(postings[199].0, 597);
        assert_eq!(postings[200].0, 600);
        assert_eq!(postings[399].0, 1197);
        assert_eq!(postings[400].0, 1200);
        assert_eq!(postings[799].0, 2397);

        for seg in 0u32..4 {
            for i in 0u32..200 {
                let idx = (seg * 200 + i) as usize;
                assert_eq!(
                    postings[idx].1,
                    (i + seg * 7) % 10 + 1,
                    "seg{} tf[{}]",
                    seg,
                    i
                );
            }
        }

        let mut it = final_bpl.iterator();
        assert_eq!(it.seek(600), 600);
        assert_eq!(it.seek(1200), 1200);
        assert_eq!(it.seek(2397), 2397);
        assert_eq!(it.seek(2398), TERMINATED);
    }

    #[test]
    fn test_large_scale_merge() {
        let num_segments = 5;
        let docs_per_segment = 2000;
        let docs_gap = 3;

        let segments: Vec<Vec<(u32, u32)>> = (0..num_segments)
            .map(|seg| {
                (0..docs_per_segment)
                    .map(|i| (i as u32 * docs_gap, (i as u32 + seg as u32) % 20 + 1))
                    .collect()
            })
            .collect();

        let bpls: Vec<BlockPostingList> = segments.iter().map(|s| build_bpl(s)).collect();

        for bpl in &bpls {
            assert!(
                bpl.num_blocks() >= 15,
                "expected >=15 blocks, got {}",
                bpl.num_blocks()
            );
        }

        let serialized: Vec<Vec<u8>> = bpls.iter().map(serialize_bpl).collect();

        let max_doc_per_seg = (docs_per_segment as u32 - 1) * docs_gap;
        let offsets: Vec<u32> = (0..num_segments)
            .map(|i| i as u32 * (max_doc_per_seg + 1))
            .collect();

        let sources: Vec<(&[u8], u32)> = serialized
            .iter()
            .zip(offsets.iter())
            .map(|(b, o)| (b.as_slice(), *o))
            .collect();

        let mut merged = Vec::new();
        let (doc_count, _) =
            BlockPostingList::concatenate_streaming(&sources, &mut merged).unwrap();
        assert_eq!(doc_count, (num_segments * docs_per_segment) as u32);

        let merged_bpl = BlockPostingList::deserialize(&merged).unwrap();
        let postings = collect_postings(&merged_bpl);
        assert_eq!(postings.len(), num_segments * docs_per_segment);

        for i in 1..postings.len() {
            assert!(
                postings[i].0 > postings[i - 1].0 || (i % docs_per_segment == 0),
                "doc_id not increasing at {}: {} vs {}",
                i,
                postings[i - 1].0,
                postings[i].0,
            );
        }

        let mut it = merged_bpl.iterator();
        for (seg, &expected_first) in offsets.iter().enumerate() {
            assert_eq!(
                it.seek(expected_first),
                expected_first,
                "seek to segment {} start",
                seg
            );
        }
    }

    #[test]
    fn test_merge_edge_cases() {
        let bpl_a = build_bpl(&[(0, 5)]);
        let bpl_b = build_bpl(&[(0, 3)]);

        let merged =
            BlockPostingList::concatenate_blocks(&[(bpl_a.clone(), 0), (bpl_b.clone(), 1)])
                .unwrap();
        assert_eq!(merged.doc_count(), 2);
        let p = collect_postings(&merged);
        assert_eq!(p, vec![(0, 5), (1, 3)]);

        let exact_block: Vec<(u32, u32)> = (0..BLOCK_SIZE as u32).map(|i| (i, i % 5 + 1)).collect();
        let bpl_exact = build_bpl(&exact_block);
        assert_eq!(bpl_exact.num_blocks(), 1);

        let bytes = serialize_bpl(&bpl_exact);
        let mut out = Vec::new();
        let sources: Vec<(&[u8], u32)> = vec![(&bytes, 0), (&bytes, BLOCK_SIZE as u32)];
        let (dc, _) = BlockPostingList::concatenate_streaming(&sources, &mut out).unwrap();
        assert_eq!(dc, BLOCK_SIZE as u32 * 2);

        let merged = BlockPostingList::deserialize(&out).unwrap();
        let postings = collect_postings(&merged);
        assert_eq!(postings.len(), BLOCK_SIZE * 2);
        assert_eq!(postings[BLOCK_SIZE].0, BLOCK_SIZE as u32);

        let over_block: Vec<(u32, u32)> = (0..BLOCK_SIZE as u32 + 1).map(|i| (i * 2, 1)).collect();
        let bpl_over = build_bpl(&over_block);
        assert_eq!(bpl_over.num_blocks(), 2);
    }

    #[test]
    fn test_streaming_roundtrip_single_source() {
        let docs: Vec<(u32, u32)> = (0..500).map(|i| (i * 7, i % 15 + 1)).collect();
        let bpl = build_bpl(&docs);
        let direct = serialize_bpl(&bpl);

        let sources: Vec<(&[u8], u32)> = vec![(&direct, 0)];
        let mut streamed = Vec::new();
        BlockPostingList::concatenate_streaming(&sources, &mut streamed).unwrap();

        let p1 = collect_postings(&BlockPostingList::deserialize(&direct).unwrap());
        let p2 = collect_postings(&BlockPostingList::deserialize(&streamed).unwrap());
        assert_eq!(p1, p2);
    }

    #[test]
    fn test_max_tf_preserved_through_merge() {
        let mut a = Vec::new();
        for i in 0..200 {
            a.push((i * 2, if i == 100 { 50 } else { 1 }));
        }
        let bpl_a = build_bpl(&a);
        assert_eq!(bpl_a.max_tf(), 50);

        let mut b = Vec::new();
        for i in 0..200 {
            b.push((i * 2, if i == 50 { 30 } else { 2 }));
        }
        let bpl_b = build_bpl(&b);
        assert_eq!(bpl_b.max_tf(), 30);

        let bytes_a = serialize_bpl(&bpl_a);
        let bytes_b = serialize_bpl(&bpl_b);
        let sources: Vec<(&[u8], u32)> = vec![(&bytes_a, 0), (&bytes_b, 1000)];
        let mut out = Vec::new();
        BlockPostingList::concatenate_streaming(&sources, &mut out).unwrap();

        let merged = BlockPostingList::deserialize(&out).unwrap();
        assert_eq!(merged.max_tf(), 50);
        assert_eq!(merged.doc_count(), 400);
    }

    #[test]
    fn test_l0_l1_counts() {
        let bpl = build_bpl(&(0..50u32).map(|i| (i, 1)).collect::<Vec<_>>());
        assert_eq!(bpl.num_blocks(), 1);
        assert_eq!(bpl.l1_docs.len(), 1);

        let n = BLOCK_SIZE * L1_INTERVAL;
        let bpl = build_bpl(&(0..n as u32).map(|i| (i * 2, 1)).collect::<Vec<_>>());
        assert_eq!(bpl.num_blocks(), L1_INTERVAL);
        assert_eq!(bpl.l1_docs.len(), 1);

        let n = BLOCK_SIZE * L1_INTERVAL + 1;
        let bpl = build_bpl(&(0..n as u32).map(|i| (i * 2, 1)).collect::<Vec<_>>());
        assert_eq!(bpl.num_blocks(), L1_INTERVAL + 1);
        assert_eq!(bpl.l1_docs.len(), 2);

        let n = BLOCK_SIZE * L1_INTERVAL * 3;
        let bpl = build_bpl(&(0..n as u32).map(|i| (i, 1)).collect::<Vec<_>>());
        assert_eq!(bpl.num_blocks(), L1_INTERVAL * 3);
        assert_eq!(bpl.l1_docs.len(), 3);
    }

    #[test]
    fn test_l1_last_doc_values() {
        let n = BLOCK_SIZE * 20;
        let docs: Vec<(u32, u32)> = (0..n as u32).map(|i| (i * 3, 1)).collect();
        let bpl = build_bpl(&docs);
        assert_eq!(bpl.num_blocks(), 20);
        // 20 blocks -> L1 groups covering blocks 0-7, 8-15, and 16-19.
        assert_eq!(bpl.l1_docs.len(), 3);

        let expected_l1_0 = bpl.block_last_doc(7).unwrap();
        assert_eq!(bpl.l1_docs[0], expected_l1_0);

        let expected_l1_1 = bpl.block_last_doc(15).unwrap();
        assert_eq!(bpl.l1_docs[1], expected_l1_1);

        let expected_l1_2 = bpl.block_last_doc(19).unwrap();
        assert_eq!(bpl.l1_docs[2], expected_l1_2);
    }

    #[test]
    fn test_seek_block_basic() {
        let n = BLOCK_SIZE * 20;
        let docs: Vec<(u32, u32)> = (0..n as u32).map(|i| (i * 10, 1)).collect();
        let bpl = build_bpl(&docs);

        assert_eq!(bpl.seek_block(0, 0), Some(0));

        for blk in 0..20 {
            let first = bpl.block_first_doc(blk).unwrap();
            assert_eq!(
                bpl.seek_block(first, 0),
                Some(blk),
                "seek to block {} first_doc",
                blk
            );
        }

        for blk in 0..20 {
            let last = bpl.block_last_doc(blk).unwrap();
            assert_eq!(
                bpl.seek_block(last, 0),
                Some(blk),
                "seek to block {} last_doc",
                blk
            );
        }

        let max_doc = bpl.block_last_doc(19).unwrap();
        assert_eq!(bpl.seek_block(max_doc + 1, 0), None);

        let mid_doc = bpl.block_first_doc(10).unwrap();
        assert_eq!(bpl.seek_block(mid_doc, 10), Some(10));
        // Seeking never moves backwards: the target lives in block 10, but
        // starting from block 11 clamps forward to block 11.
        assert_eq!(bpl.seek_block(mid_doc, 11), Some(11));
    }

    #[test]
    fn test_seek_block_across_l1_boundaries() {
        let n = BLOCK_SIZE * 24;
        let docs: Vec<(u32, u32)> = (0..n as u32).map(|i| (i * 5, 1)).collect();
        let bpl = build_bpl(&docs);
        assert_eq!(bpl.l1_docs.len(), 3);

        for group in 0..3 {
            let blk = group * L1_INTERVAL;
            let target = bpl.block_first_doc(blk).unwrap();
            assert_eq!(
                bpl.seek_block(target, 0),
                Some(blk),
                "seek to group {} block {}",
                group,
                blk
            );
        }

        let target = bpl.block_first_doc(20).unwrap() + 1;
        assert_eq!(bpl.seek_block(target, 0), Some(20));
    }

    #[test]
    fn test_block_data_size_helper() {
        let docs: Vec<(u32, u32)> = (0..500u32).map(|i| (i * 7, (i % 20) + 1)).collect();
        let bpl = build_bpl(&docs);

        for blk in 0..bpl.num_blocks() {
            let (_, _, offset, _) = bpl.read_l0_entry(blk);
            let computed_size = block_data_size(&bpl.stream, offset as usize);

            if blk + 1 < bpl.num_blocks() {
                let (_, _, next_offset, _) = bpl.read_l0_entry(blk + 1);
                assert_eq!(
                    computed_size,
                    (next_offset - offset) as usize,
                    "block_data_size mismatch at block {}",
                    blk
                );
            } else {
                assert_eq!(
                    offset as usize + computed_size,
                    bpl.stream.len(),
                    "last block size mismatch"
                );
            }
        }
    }

    #[test]
    fn test_l0_entry_roundtrip() {
        let docs: Vec<(u32, u32)> = (0..1000u32).map(|i| (i * 3, (i % 10) + 1)).collect();
        let bpl = build_bpl(&docs);

        let bytes = serialize_bpl(&bpl);
        let bpl2 = BlockPostingList::deserialize(&bytes).unwrap();

        assert_eq!(bpl.num_blocks(), bpl2.num_blocks());
        for blk in 0..bpl.num_blocks() {
            assert_eq!(
                bpl.read_l0_entry(blk),
                bpl2.read_l0_entry(blk),
                "L0 entry mismatch at block {}",
                blk
            );
        }

        assert_eq!(bpl.l1_docs, bpl2.l1_docs);
    }

    #[test]
    fn test_zero_copy_deserialize_matches() {
        let docs: Vec<(u32, u32)> = (0..2000u32).map(|i| (i * 2, (i % 5) + 1)).collect();
        let bpl = build_bpl(&docs);
        let bytes = serialize_bpl(&bpl);

        let copied = BlockPostingList::deserialize(&bytes).unwrap();
        let zero_copy =
            BlockPostingList::deserialize_zero_copy(OwnedBytes::new(bytes.clone())).unwrap();

        assert_eq!(copied.l0_count, zero_copy.l0_count);
        assert_eq!(copied.l1_docs, zero_copy.l1_docs);
        assert_eq!(copied.doc_count, zero_copy.doc_count);
        assert_eq!(copied.max_tf, zero_copy.max_tf);

        let p1 = collect_postings(&copied);
        let p2 = collect_postings(&zero_copy);
        assert_eq!(p1, p2);
    }

    #[test]
    fn test_l1_preserved_through_streaming_merge() {
        let seg_a = build_bpl(&(0..1000u32).map(|i| (i * 2, 1)).collect::<Vec<_>>());
        let seg_b = build_bpl(&(0..800u32).map(|i| (i * 3, 2)).collect::<Vec<_>>());
        let seg_c = build_bpl(&(0..500u32).map(|i| (i * 5, 3)).collect::<Vec<_>>());

        let bytes_a = serialize_bpl(&seg_a);
        let bytes_b = serialize_bpl(&seg_b);
        let bytes_c = serialize_bpl(&seg_c);

        let sources: Vec<(&[u8], u32)> = vec![(&bytes_a, 0), (&bytes_b, 10000), (&bytes_c, 20000)];
        let mut out = Vec::new();
        BlockPostingList::concatenate_streaming(&sources, &mut out).unwrap();

        let merged = BlockPostingList::deserialize(&out).unwrap();
        let expected_l1_count = merged.num_blocks().div_ceil(L1_INTERVAL);
        assert_eq!(merged.l1_docs.len(), expected_l1_count);

        for (i, &l1_doc) in merged.l1_docs.iter().enumerate() {
            let last_block_in_group = ((i + 1) * L1_INTERVAL - 1).min(merged.num_blocks() - 1);
            let expected = merged.block_last_doc(last_block_in_group).unwrap();
            assert_eq!(l1_doc, expected, "L1[{}] mismatch", i);
        }

        for blk in 0..merged.num_blocks() {
            let first = merged.block_first_doc(blk).unwrap();
            assert_eq!(merged.seek_block(first, 0), Some(blk));
        }
    }

    #[test]
    fn test_seek_block_single_block() {
        let bpl = build_bpl(&[(0, 1), (10, 2), (20, 3)]);
        assert_eq!(bpl.num_blocks(), 1);
        assert_eq!(bpl.l1_docs.len(), 1);

        assert_eq!(bpl.seek_block(0, 0), Some(0));
        assert_eq!(bpl.seek_block(10, 0), Some(0));
        assert_eq!(bpl.seek_block(20, 0), Some(0));
        assert_eq!(bpl.seek_block(21, 0), None);
    }

    #[test]
    fn test_footer_size() {
        let docs: Vec<(u32, u32)> = (0..500u32).map(|i| (i * 2, 1)).collect();
        let bpl = build_bpl(&docs);
        let bytes = serialize_bpl(&bpl);

        let expected =
            bpl.stream.len() + bpl.l0_count * L0_SIZE + bpl.l1_docs.len() * L1_SIZE + FOOTER_SIZE;
        assert_eq!(bytes.len(), expected);
    }

    #[test]
    fn test_seek_block_from_block_skips_earlier() {
        let n = BLOCK_SIZE * 16;
        let docs: Vec<(u32, u32)> = (0..n as u32).map(|i| (i * 3, 1)).collect();
        let bpl = build_bpl(&docs);

        let target_in_5 = bpl.block_first_doc(5).unwrap() + 1;
        let result = bpl.seek_block(target_in_5, 8);
        assert!(result.is_some());
        assert!(result.unwrap() >= 8);
    }
}