//! Posting list structures: an uncompressed in-memory form used while
//! indexing, and a block-compressed form with L0/L1 skip data for search.

use byteorder::{LittleEndian, WriteBytesExt};
use std::io::{self, Read, Write};

use super::posting_common::{read_vint, write_vint};
use crate::DocId;
use crate::directories::OwnedBytes;
use crate::structures::simd;

/// A single posting: a document id and the term's frequency within it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Posting {
    pub doc_id: DocId,
    pub term_freq: u32,
}

/// An uncompressed, sorted posting list accumulated at indexing time.
#[derive(Debug, Clone, Default)]
pub struct PostingList {
    postings: Vec<Posting>,
}

impl PostingList {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            postings: Vec::with_capacity(capacity),
        }
    }

    /// Appends a posting. Doc ids must arrive in strictly increasing order.
    pub fn push(&mut self, doc_id: DocId, term_freq: u32) {
        debug_assert!(
            self.postings.is_empty() || self.postings.last().unwrap().doc_id < doc_id,
            "Postings must be added in sorted order"
        );
        self.postings.push(Posting { doc_id, term_freq });
    }

    /// Adds a posting, accumulating the term frequency when `doc_id` repeats
    /// the most recent posting.
    pub fn add(&mut self, doc_id: DocId, term_freq: u32) {
        if let Some(last) = self.postings.last_mut()
            && last.doc_id == doc_id
        {
            last.term_freq += term_freq;
            return;
        }
        self.postings.push(Posting { doc_id, term_freq });
    }

    pub fn doc_count(&self) -> u32 {
        self.postings.len() as u32
    }

    pub fn len(&self) -> usize {
        self.postings.len()
    }

    pub fn is_empty(&self) -> bool {
        self.postings.is_empty()
    }

    pub fn iter(&self) -> impl Iterator<Item = &Posting> {
        self.postings.iter()
    }

    /// Serializes as a posting count followed by (doc id delta, term freq)
    /// vint pairs.
    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
        write_vint(writer, self.postings.len() as u64)?;

        let mut prev_doc_id = 0u32;
        for posting in &self.postings {
            let delta = posting.doc_id - prev_doc_id;
            write_vint(writer, delta as u64)?;
            write_vint(writer, posting.term_freq as u64)?;
            prev_doc_id = posting.doc_id;
        }

        Ok(())
    }

    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
        let count = read_vint(reader)? as usize;
        let mut postings = Vec::with_capacity(count);

        let mut prev_doc_id = 0u32;
        for _ in 0..count {
            let delta = read_vint(reader)? as u32;
            let term_freq = read_vint(reader)? as u32;
            let doc_id = prev_doc_id + delta;
            postings.push(Posting { doc_id, term_freq });
            prev_doc_id = doc_id;
        }

        Ok(Self { postings })
    }
}

/// Cursor over an in-memory `PostingList`.
pub struct PostingListIterator<'a> {
    postings: &'a [Posting],
    position: usize,
}

impl<'a> PostingListIterator<'a> {
    pub fn new(posting_list: &'a PostingList) -> Self {
        Self {
            postings: &posting_list.postings,
            position: 0,
        }
    }

    /// Returns the current doc id, or `TERMINATED` once exhausted.
    pub fn doc(&self) -> DocId {
        if self.position < self.postings.len() {
            self.postings[self.position].doc_id
        } else {
            TERMINATED
        }
    }

    /// Returns the current term frequency, or 0 once exhausted.
    pub fn term_freq(&self) -> u32 {
        if self.position < self.postings.len() {
            self.postings[self.position].term_freq
        } else {
            0
        }
    }

    /// Moves to the next posting and returns its doc id.
    pub fn advance(&mut self) -> DocId {
        // Clamp so that repeated calls past the end cannot push `position`
        // beyond `len`, which would make `seek` slice out of bounds.
        self.position = (self.position + 1).min(self.postings.len());
        self.doc()
    }

    /// Positions the cursor on the first posting with `doc_id >= target`.
    pub fn seek(&mut self, target: DocId) -> DocId {
        let remaining = &self.postings[self.position..];
        let offset = remaining.partition_point(|p| p.doc_id < target);
        self.position += offset;
        self.doc()
    }

    /// Number of postings left, including the current one.
    pub fn size_hint(&self) -> usize {
        self.postings.len().saturating_sub(self.position)
    }
}

/// Sentinel doc id returned by iterators once exhausted.
pub const TERMINATED: DocId = DocId::MAX;

/// Number of postings per compressed block.
pub const BLOCK_SIZE: usize = 128;

/// One L1 skip entry is emitted for every `L1_INTERVAL` blocks.
const L1_INTERVAL: usize = 8;

/// Byte size of an L0 entry: first_doc, last_doc, stream offset, max weight.
const L0_SIZE: usize = 16;

/// Byte size of an L1 entry: the last doc id of a block group.
const L1_SIZE: usize = 4;

/// Footer: stream_len (u64) + l0_count, l1_count, doc_count, max_tf (u32 each).
const FOOTER_SIZE: usize = 24;

/// Reads the L0 entry at `idx`: (first_doc, last_doc, stream offset, max weight).
#[inline]
fn read_l0(bytes: &[u8], idx: usize) -> (u32, u32, u32, f32) {
    let p = idx * L0_SIZE;
    let first_doc = u32::from_le_bytes(bytes[p..p + 4].try_into().unwrap());
    let last_doc = u32::from_le_bytes(bytes[p + 4..p + 8].try_into().unwrap());
    let offset = u32::from_le_bytes(bytes[p + 8..p + 12].try_into().unwrap());
    let max_weight = f32::from_le_bytes(bytes[p + 12..p + 16].try_into().unwrap());
    (first_doc, last_doc, offset, max_weight)
}

/// Appends one L0 entry to `buf`.
#[inline]
fn write_l0(buf: &mut Vec<u8>, first_doc: u32, last_doc: u32, offset: u32, max_weight: f32) {
    buf.extend_from_slice(&first_doc.to_le_bytes());
    buf.extend_from_slice(&last_doc.to_le_bytes());
    buf.extend_from_slice(&offset.to_le_bytes());
    buf.extend_from_slice(&max_weight.to_le_bytes());
}

/// Computes the full byte size of the block whose 8-byte header starts at
/// `pos`, using the posting count and rounded bit widths stored there.
#[inline]
fn block_data_size(stream: &[u8], pos: usize) -> usize {
    let count = u16::from_le_bytes(stream[pos..pos + 2].try_into().unwrap()) as usize;
    let doc_rounded = simd::RoundedBitWidth::from_u8(stream[pos + 6]);
    let tf_rounded = simd::RoundedBitWidth::from_u8(stream[pos + 7]);
    let delta_bytes = if count > 1 {
        (count - 1) * doc_rounded.bytes_per_value()
    } else {
        0
    };
    8 + delta_bytes + count * tf_rounded.bytes_per_value()
}

/// Block-compressed posting list with two levels of skip data: one L0 entry
/// per block and one L1 entry per `L1_INTERVAL` blocks.
#[derive(Debug, Clone)]
pub struct BlockPostingList {
    /// Concatenated compressed blocks.
    stream: OwnedBytes,
    /// L0 skip entries, one `L0_SIZE`-byte record per block.
    l0_bytes: OwnedBytes,
    /// Number of blocks (L0 entries).
    l0_count: usize,
    /// Last doc id of each `L1_INTERVAL`-block group.
    l1_docs: Vec<u32>,
    /// Total number of postings.
    doc_count: u32,
    /// Maximum term frequency across all blocks.
    max_tf: u32,
}

impl BlockPostingList {
    /// Reads the L0 entry for `idx` from this list's skip data.
    #[inline]
    fn read_l0_entry(&self, idx: usize) -> (u32, u32, u32, f32) {
        read_l0(&self.l0_bytes, idx)
    }

    /// Builds the block-compressed form from an uncompressed list.
    ///
    /// Each block stores an 8-byte header (count: u16, base doc id: u32,
    /// doc delta bit width: u8, tf bit width: u8), followed by the packed
    /// doc id deltas and the packed term frequencies.
    pub fn from_posting_list(list: &PostingList) -> io::Result<Self> {
        let mut stream: Vec<u8> = Vec::new();
        let mut l0_buf: Vec<u8> = Vec::new();
        let mut l1_docs: Vec<u32> = Vec::new();
        let mut l0_count = 0usize;
        let mut max_tf = 0u32;

        let postings = &list.postings;
        let mut i = 0;

        // Scratch buffers reused across blocks.
        let mut deltas = Vec::with_capacity(BLOCK_SIZE);
        let mut tf_buf = Vec::with_capacity(BLOCK_SIZE);

        while i < postings.len() {
            if stream.len() > u32::MAX as usize {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "posting list stream exceeds u32::MAX bytes",
                ));
            }
            let block_start = stream.len() as u32;
            let block_end = (i + BLOCK_SIZE).min(postings.len());
            let block = &postings[i..block_end];
            let count = block.len();

            let block_max_tf = block.iter().map(|p| p.term_freq).max().unwrap_or(0);
            max_tf = max_tf.max(block_max_tf);

            let base_doc_id = block.first().unwrap().doc_id;
            let last_doc_id = block.last().unwrap().doc_id;

            // Delta-encode doc ids against the previous posting in the block.
            deltas.clear();
            let mut prev = base_doc_id;
            for posting in block.iter().skip(1) {
                deltas.push(posting.doc_id - prev);
                prev = posting.doc_id;
            }
            let max_delta = deltas.iter().copied().max().unwrap_or(0);
            let doc_id_bits = simd::round_bit_width(simd::bits_needed(max_delta));

            tf_buf.clear();
            tf_buf.extend(block.iter().map(|p| p.term_freq));
            let tf_bits = simd::round_bit_width(simd::bits_needed(block_max_tf));

            // Block header.
            stream.write_u16::<LittleEndian>(count as u16)?;
            stream.write_u32::<LittleEndian>(base_doc_id)?;
            stream.push(doc_id_bits);
            stream.push(tf_bits);

            // Packed doc id deltas (the base doc id lives in the header).
            if count > 1 {
                let rounded = simd::RoundedBitWidth::from_u8(doc_id_bits);
                let byte_count = (count - 1) * rounded.bytes_per_value();
                let start = stream.len();
                stream.resize(start + byte_count, 0);
                simd::pack_rounded(&deltas, rounded, &mut stream[start..]);
            }

            // Packed term frequencies.
            {
                let rounded = simd::RoundedBitWidth::from_u8(tf_bits);
                let byte_count = count * rounded.bytes_per_value();
                let start = stream.len();
                stream.resize(start + byte_count, 0);
                simd::pack_rounded(&tf_buf, rounded, &mut stream[start..]);
            }

            // L0 skip entry for this block.
            write_l0(
                &mut l0_buf,
                base_doc_id,
                last_doc_id,
                block_start,
                block_max_tf as f32,
            );
            l0_count += 1;

            // L1 skip entry at each full group boundary.
            if l0_count.is_multiple_of(L1_INTERVAL) {
                l1_docs.push(last_doc_id);
            }

            i = block_end;
        }

        // Close out a trailing partial group with a final L1 entry.
        if !l0_count.is_multiple_of(L1_INTERVAL) && l0_count > 0 {
            let (_, last_doc, _, _) = read_l0(&l0_buf, l0_count - 1);
            l1_docs.push(last_doc);
        }

        Ok(Self {
            stream: OwnedBytes::new(stream),
            l0_bytes: OwnedBytes::new(l0_buf),
            l0_count,
            l1_docs,
            doc_count: postings.len() as u32,
            max_tf,
        })
    }

    /// Serializes as: block stream, L0 entries, L1 doc ids, then a
    /// `FOOTER_SIZE`-byte footer (stream_len: u64, followed by l0_count,
    /// l1_count, doc_count, max_tf as u32 each).
    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
        writer.write_all(&self.stream)?;
        writer.write_all(&self.l0_bytes)?;
        for &doc in &self.l1_docs {
            writer.write_u32::<LittleEndian>(doc)?;
        }

        writer.write_u64::<LittleEndian>(self.stream.len() as u64)?;
        writer.write_u32::<LittleEndian>(self.l0_count as u32)?;
        writer.write_u32::<LittleEndian>(self.l1_docs.len() as u32)?;
        writer.write_u32::<LittleEndian>(self.doc_count)?;
        writer.write_u32::<LittleEndian>(self.max_tf)?;

        Ok(())
    }

    /// Deserializes from a byte slice, copying the stream and skip data.
    pub fn deserialize(raw: &[u8]) -> io::Result<Self> {
        if raw.len() < FOOTER_SIZE {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "posting data too short",
            ));
        }

        let f = raw.len() - FOOTER_SIZE;
        let stream_len = u64::from_le_bytes(raw[f..f + 8].try_into().unwrap()) as usize;
        let l0_count = u32::from_le_bytes(raw[f + 8..f + 12].try_into().unwrap()) as usize;
        let l1_count = u32::from_le_bytes(raw[f + 12..f + 16].try_into().unwrap()) as usize;
        let doc_count = u32::from_le_bytes(raw[f + 16..f + 20].try_into().unwrap());
        let max_tf = u32::from_le_bytes(raw[f + 20..f + 24].try_into().unwrap());

        let l0_start = stream_len;
        let l0_end = l0_start + l0_count * L0_SIZE;
        let l1_start = l0_end;

        let l1_docs = Self::extract_l1_docs(&raw[l1_start..], l1_count);

        Ok(Self {
            stream: OwnedBytes::new(raw[..stream_len].to_vec()),
            l0_bytes: OwnedBytes::new(raw[l0_start..l0_end].to_vec()),
            l0_count,
            l1_docs,
            doc_count,
            max_tf,
        })
    }

    /// Zero-copy variant of [`Self::deserialize`]: the stream and L0 data
    /// are slices of `raw` rather than fresh allocations.
    pub fn deserialize_zero_copy(raw: OwnedBytes) -> io::Result<Self> {
        if raw.len() < FOOTER_SIZE {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "posting data too short",
            ));
        }

        let f = raw.len() - FOOTER_SIZE;
        let stream_len = u64::from_le_bytes(raw[f..f + 8].try_into().unwrap()) as usize;
        let l0_count = u32::from_le_bytes(raw[f + 8..f + 12].try_into().unwrap()) as usize;
        let l1_count = u32::from_le_bytes(raw[f + 12..f + 16].try_into().unwrap()) as usize;
        let doc_count = u32::from_le_bytes(raw[f + 16..f + 20].try_into().unwrap());
        let max_tf = u32::from_le_bytes(raw[f + 20..f + 24].try_into().unwrap());

        let l0_start = stream_len;
        let l0_end = l0_start + l0_count * L0_SIZE;
        let l1_start = l0_end;

        let l1_docs = Self::extract_l1_docs(&raw[l1_start..], l1_count);

        Ok(Self {
            stream: raw.slice(0..stream_len),
            l0_bytes: raw.slice(l0_start..l0_end),
            l0_count,
            l1_docs,
            doc_count,
            max_tf,
        })
    }

    fn extract_l1_docs(bytes: &[u8], count: usize) -> Vec<u32> {
        let mut docs = Vec::with_capacity(count);
        for i in 0..count {
            let p = i * L1_SIZE;
            docs.push(u32::from_le_bytes(bytes[p..p + 4].try_into().unwrap()));
        }
        docs
    }

    pub fn doc_count(&self) -> u32 {
        self.doc_count
    }

    pub fn max_tf(&self) -> u32 {
        self.max_tf
    }

    pub fn num_blocks(&self) -> usize {
        self.l0_count
    }

    /// Maximum term frequency within a single block, if `block_idx` is valid.
    pub fn block_max_tf(&self, block_idx: usize) -> Option<u32> {
        if block_idx >= self.l0_count {
            return None;
        }
        let (_, _, _, max_weight) = self.read_l0_entry(block_idx);
        Some(max_weight as u32)
    }

    /// Merges already-compressed lists by concatenating their blocks, adding
    /// each source's `doc_offset` to every doc id it contributes.
    pub fn concatenate_blocks(sources: &[(BlockPostingList, u32)]) -> io::Result<Self> {
        let mut stream: Vec<u8> = Vec::new();
        let mut l0_buf: Vec<u8> = Vec::new();
        let mut l1_docs: Vec<u32> = Vec::new();
        let mut l0_count = 0usize;
        let mut total_docs = 0u32;
        let mut max_tf = 0u32;

        for (source, doc_offset) in sources {
            max_tf = max_tf.max(source.max_tf);
            for block_idx in 0..source.num_blocks() {
                let (first_doc, last_doc, offset, max_weight) = source.read_l0_entry(block_idx);
                let blk_size = block_data_size(&source.stream, offset as usize);
                let block_bytes = &source.stream[offset as usize..offset as usize + blk_size];

                let count = u16::from_le_bytes(block_bytes[0..2].try_into().unwrap());
                if stream.len() > u32::MAX as usize {
                    return Err(io::Error::new(
                        io::ErrorKind::InvalidData,
                        "posting list stream exceeds u32::MAX bytes during concatenation",
                    ));
                }
                let new_offset = stream.len() as u32;

                // Rewrite the header's base doc id, then copy the rest of the
                // block (bit widths and packed payload) verbatim.
                stream.write_u16::<LittleEndian>(count)?;
                stream.write_u32::<LittleEndian>(first_doc + doc_offset)?;
                stream.extend_from_slice(&block_bytes[6..]);

                let new_last = last_doc + doc_offset;
                write_l0(
                    &mut l0_buf,
                    first_doc + doc_offset,
                    new_last,
                    new_offset,
                    max_weight,
                );
                l0_count += 1;
                total_docs += count as u32;

                if l0_count.is_multiple_of(L1_INTERVAL) {
                    l1_docs.push(new_last);
                }
            }
        }

        if !l0_count.is_multiple_of(L1_INTERVAL) && l0_count > 0 {
            let (_, last_doc, _, _) = read_l0(&l0_buf, l0_count - 1);
            l1_docs.push(last_doc);
        }

        Ok(Self {
            stream: OwnedBytes::new(stream),
            l0_bytes: OwnedBytes::new(l0_buf),
            l0_count,
            l1_docs,
            doc_count: total_docs,
            max_tf,
        })
    }

    /// Streaming merge over serialized posting lists: reads each source's
    /// footer, rewrites block headers with the source's doc id offset, and
    /// writes the merged serialized form straight to `writer` without
    /// materializing a `BlockPostingList`.
    ///
    /// Returns the merged doc count and the total number of bytes written.
    pub fn concatenate_streaming<W: Write>(
        sources: &[(&[u8], u32)],
        writer: &mut W,
    ) -> io::Result<(u32, usize)> {
        struct SourceMeta {
            /// Index into `sources`; kept explicitly because sources too
            /// short to hold a footer are skipped.
            src_idx: usize,
            stream_len: usize,
            l0_count: usize,
        }

        let mut metas: Vec<SourceMeta> = Vec::with_capacity(sources.len());
        let mut total_docs = 0u32;
        let mut merged_max_tf = 0u32;

        for (src_idx, (raw, _)) in sources.iter().enumerate() {
            if raw.len() < FOOTER_SIZE {
                continue;
            }
            let f = raw.len() - FOOTER_SIZE;
            let stream_len = u64::from_le_bytes(raw[f..f + 8].try_into().unwrap()) as usize;
            let l0_count = u32::from_le_bytes(raw[f + 8..f + 12].try_into().unwrap()) as usize;
            let doc_count = u32::from_le_bytes(raw[f + 16..f + 20].try_into().unwrap());
            let max_tf = u32::from_le_bytes(raw[f + 20..f + 24].try_into().unwrap());
            total_docs += doc_count;
            merged_max_tf = merged_max_tf.max(max_tf);
            metas.push(SourceMeta {
                src_idx,
                stream_len,
                l0_count,
            });
        }

        let mut out_l0: Vec<u8> = Vec::new();
        let mut out_l1_docs: Vec<u32> = Vec::new();
        let mut out_l0_count = 0usize;
        let mut stream_written = 0u64;
        let mut patch_buf = [0u8; 8];

        // Iterate via the recorded source index: `metas` can be shorter than
        // `sources` when truncated sources were skipped above, so positional
        // enumeration would pair metadata with the wrong source.
        for meta in &metas {
            let (raw, doc_offset) = &sources[meta.src_idx];
            let l0_base = meta.stream_len;
            let src_stream = &raw[..meta.stream_len];

            for i in 0..meta.l0_count {
                let (first_doc, last_doc, offset, max_weight) = read_l0(&raw[l0_base..], i);

                let blk_size = block_data_size(src_stream, offset as usize);
                let block = &src_stream[offset as usize..offset as usize + blk_size];

                let new_last = last_doc + doc_offset;
                if stream_written > u32::MAX as u64 {
                    return Err(io::Error::new(
                        io::ErrorKind::InvalidData,
                        "posting list stream exceeds u32::MAX bytes during streaming merge",
                    ));
                }
                write_l0(
                    &mut out_l0,
                    first_doc + doc_offset,
                    new_last,
                    stream_written as u32,
                    max_weight,
                );
                out_l0_count += 1;

                if out_l0_count.is_multiple_of(L1_INTERVAL) {
                    out_l1_docs.push(new_last);
                }

                // Patch the base doc id in the 8-byte header, then copy the
                // rest of the block verbatim.
                patch_buf.copy_from_slice(&block[0..8]);
                let blk_first = u32::from_le_bytes(patch_buf[2..6].try_into().unwrap());
                patch_buf[2..6].copy_from_slice(&(blk_first + doc_offset).to_le_bytes());
                writer.write_all(&patch_buf)?;
                writer.write_all(&block[8..])?;

                stream_written += blk_size as u64;
            }
        }

        if !out_l0_count.is_multiple_of(L1_INTERVAL) && out_l0_count > 0 {
            let (_, last_doc, _, _) = read_l0(&out_l0, out_l0_count - 1);
            out_l1_docs.push(last_doc);
        }

        writer.write_all(&out_l0)?;
        for &doc in &out_l1_docs {
            writer.write_u32::<LittleEndian>(doc)?;
        }

        writer.write_u64::<LittleEndian>(stream_written)?;
        writer.write_u32::<LittleEndian>(out_l0_count as u32)?;
        writer.write_u32::<LittleEndian>(out_l1_docs.len() as u32)?;
        writer.write_u32::<LittleEndian>(total_docs)?;
        writer.write_u32::<LittleEndian>(merged_max_tf)?;

        let l1_bytes_len = out_l1_docs.len() * L1_SIZE;
        let total_bytes = stream_written as usize + out_l0.len() + l1_bytes_len + FOOTER_SIZE;
        Ok((total_docs, total_bytes))
    }

    /// Decodes the block at `block_idx` into `doc_ids` and `tfs`, replacing
    /// their contents. Returns `false` if `block_idx` is out of range.
    pub fn decode_block_into(
        &self,
        block_idx: usize,
        doc_ids: &mut Vec<u32>,
        tfs: &mut Vec<u32>,
    ) -> bool {
        if block_idx >= self.l0_count {
            return false;
        }

        let (_, _, offset, _) = self.read_l0_entry(block_idx);
        let pos = offset as usize;
        let blk_size = block_data_size(&self.stream, pos);
        let block_data = &self.stream[pos..pos + blk_size];

        // 8-byte header: count, base doc id, and the two bit widths.
        let count = u16::from_le_bytes(block_data[0..2].try_into().unwrap()) as usize;
        let first_doc = u32::from_le_bytes(block_data[2..6].try_into().unwrap());
        let doc_id_bits = block_data[6];
        let tf_bits = block_data[7];

        doc_ids.clear();
        doc_ids.resize(count, 0);
        doc_ids[0] = first_doc;

        let doc_rounded = simd::RoundedBitWidth::from_u8(doc_id_bits);
        let deltas_bytes = if count > 1 {
            (count - 1) * doc_rounded.bytes_per_value()
        } else {
            0
        };

        if count > 1 {
            simd::unpack_rounded(
                &block_data[8..8 + deltas_bytes],
                doc_rounded,
                &mut doc_ids[1..],
                count - 1,
            );
            // Turn deltas back into absolute doc ids with a prefix sum.
            for i in 1..count {
                doc_ids[i] += doc_ids[i - 1];
            }
        }

        tfs.clear();
        tfs.resize(count, 0);
        let tf_rounded = simd::RoundedBitWidth::from_u8(tf_bits);
        let tfs_start = 8 + deltas_bytes;
        simd::unpack_rounded(
            &block_data[tfs_start..tfs_start + count * tf_rounded.bytes_per_value()],
            tf_rounded,
            tfs,
            count,
        );

        true
    }

    /// First doc id stored in `block_idx`, if the index is valid.
    #[inline]
    pub fn block_first_doc(&self, block_idx: usize) -> Option<DocId> {
        if block_idx >= self.l0_count {
            return None;
        }
        let (first_doc, _, _, _) = self.read_l0_entry(block_idx);
        Some(first_doc)
    }

    /// Last doc id stored in `block_idx`, if the index is valid.
    #[inline]
    pub fn block_last_doc(&self, block_idx: usize) -> Option<DocId> {
        if block_idx >= self.l0_count {
            return None;
        }
        let (_, last_doc, _, _) = self.read_l0_entry(block_idx);
        Some(last_doc)
    }

    /// Finds the first block at or after `from_block` whose last doc id is
    /// `>= target`, using the L1 entries to narrow the search to one group
    /// of `L1_INTERVAL` blocks before scanning L0 entries.
    pub fn seek_block(&self, target: DocId, from_block: usize) -> Option<usize> {
        if from_block >= self.l0_count {
            return None;
        }

        let from_l1 = from_block / L1_INTERVAL;

        // L1 probe: first group whose last doc id can contain the target.
        let l1_idx = if !self.l1_docs.is_empty() {
            let idx = from_l1 + simd::find_first_ge_u32(&self.l1_docs[from_l1..], target);
            if idx >= self.l1_docs.len() {
                return None;
            }
            idx
        } else {
            return None;
        };

        // L0 scan within the selected group.
        let start = (l1_idx * L1_INTERVAL).max(from_block);
        let end = ((l1_idx + 1) * L1_INTERVAL).min(self.l0_count);
        let count = end - start;

        let mut last_docs = [u32::MAX; L1_INTERVAL];
        for (j, idx) in (start..end).enumerate() {
            let (_, ld, _, _) = read_l0(&self.l0_bytes, idx);
            last_docs[j] = ld;
        }
        let within = simd::find_first_ge_u32(&last_docs[..count], target);
        let block_idx = start + within;

        if block_idx < self.l0_count {
            Some(block_idx)
        } else {
            None
        }
    }

    /// Borrowing iterator over this list.
    pub fn iterator(&self) -> BlockPostingIterator<'_> {
        BlockPostingIterator::new(self)
    }

    /// Owning iterator, for when the list itself must be moved.
    pub fn into_iterator(self) -> BlockPostingIterator<'static> {
        BlockPostingIterator::owned(self)
    }
}

/// Cursor over a `BlockPostingList` that decodes one block at a time into
/// scratch buffers.
pub struct BlockPostingIterator<'a> {
    block_list: std::borrow::Cow<'a, BlockPostingList>,
    current_block: usize,
    block_doc_ids: Vec<u32>,
    block_tfs: Vec<u32>,
    position_in_block: usize,
    exhausted: bool,
}

impl<'a> BlockPostingIterator<'a> {
    fn new(block_list: &'a BlockPostingList) -> Self {
        let exhausted = block_list.l0_count == 0;
        let mut iter = Self {
            block_list: std::borrow::Cow::Borrowed(block_list),
            current_block: 0,
            block_doc_ids: Vec::with_capacity(BLOCK_SIZE),
            block_tfs: Vec::with_capacity(BLOCK_SIZE),
            position_in_block: 0,
            exhausted,
        };
        if !iter.exhausted {
            iter.load_block(0);
        }
        iter
    }

    fn owned(block_list: BlockPostingList) -> BlockPostingIterator<'static> {
        let exhausted = block_list.l0_count == 0;
        let mut iter = BlockPostingIterator {
            block_list: std::borrow::Cow::Owned(block_list),
            current_block: 0,
            block_doc_ids: Vec::with_capacity(BLOCK_SIZE),
            block_tfs: Vec::with_capacity(BLOCK_SIZE),
            position_in_block: 0,
            exhausted,
        };
        if !iter.exhausted {
            iter.load_block(0);
        }
        iter
    }

    /// Decodes `block_idx` into the scratch buffers, or marks the iterator
    /// exhausted when past the last block.
    fn load_block(&mut self, block_idx: usize) {
        if block_idx >= self.block_list.l0_count {
            self.exhausted = true;
            return;
        }

        self.current_block = block_idx;
        self.position_in_block = 0;

        self.block_list
            .decode_block_into(block_idx, &mut self.block_doc_ids, &mut self.block_tfs);
    }

    pub fn doc(&self) -> DocId {
        if self.exhausted {
            TERMINATED
        } else if self.position_in_block < self.block_doc_ids.len() {
            self.block_doc_ids[self.position_in_block]
        } else {
            TERMINATED
        }
    }

    pub fn term_freq(&self) -> u32 {
        if self.exhausted || self.position_in_block >= self.block_tfs.len() {
            0
        } else {
            self.block_tfs[self.position_in_block]
        }
    }

    pub fn advance(&mut self) -> DocId {
        if self.exhausted {
            return TERMINATED;
        }

        self.position_in_block += 1;
        if self.position_in_block >= self.block_doc_ids.len() {
            self.load_block(self.current_block + 1);
        }
        self.doc()
    }

    /// Seeks to the first posting with `doc_id >= target`, skipping whole
    /// blocks via the skip data before scanning within the target block.
    pub fn seek(&mut self, target: DocId) -> DocId {
        if self.exhausted {
            return TERMINATED;
        }

        let block_idx = match self.block_list.seek_block(target, self.current_block) {
            Some(idx) => idx,
            None => {
                self.exhausted = true;
                return TERMINATED;
            }
        };

        if block_idx != self.current_block {
            self.load_block(block_idx);
        }

        let remaining = &self.block_doc_ids[self.position_in_block..];
        let pos = simd::find_first_ge_u32(remaining, target);
        self.position_in_block += pos;

        if self.position_in_block >= self.block_doc_ids.len() {
            self.load_block(self.current_block + 1);
        }
        self.doc()
    }

    /// Jumps to the first posting of the next block, skipping the rest of
    /// the current one.
    pub fn skip_to_next_block(&mut self) -> DocId {
        if self.exhausted {
            return TERMINATED;
        }
        self.load_block(self.current_block + 1);
        self.doc()
    }

    #[inline]
    pub fn current_block_idx(&self) -> usize {
        self.current_block
    }

    #[inline]
    pub fn num_blocks(&self) -> usize {
        self.block_list.l0_count
    }

    /// Maximum term frequency of the block the cursor is currently in.
    #[inline]
    pub fn current_block_max_tf(&self) -> u32 {
        if self.exhausted || self.current_block >= self.block_list.l0_count {
            0
        } else {
            let (_, _, _, max_weight) = self.block_list.read_l0_entry(self.current_block);
            max_weight as u32
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_posting_list_basic() {
        let mut list = PostingList::new();
        list.push(1, 2);
        list.push(5, 1);
        list.push(10, 3);

        assert_eq!(list.len(), 3);

        let mut iter = PostingListIterator::new(&list);
        assert_eq!(iter.doc(), 1);
        assert_eq!(iter.term_freq(), 2);

        assert_eq!(iter.advance(), 5);
        assert_eq!(iter.term_freq(), 1);

        assert_eq!(iter.advance(), 10);
        assert_eq!(iter.term_freq(), 3);

        assert_eq!(iter.advance(), TERMINATED);
    }

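    // Illustrative test (added): unlike `push`, `add` merges a repeated doc id
    // by summing term frequencies instead of appending a new posting.
    #[test]
    fn test_posting_list_add_merges_duplicates() {
        let mut list = PostingList::new();
        list.add(4, 1);
        list.add(4, 2);
        list.add(9, 1);

        assert_eq!(list.len(), 2);
        let postings: Vec<&Posting> = list.iter().collect();
        assert_eq!(postings[0].doc_id, 4);
        assert_eq!(postings[0].term_freq, 3);
        assert_eq!(postings[1].doc_id, 9);
    }
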
    #[test]
    fn test_posting_list_serialization() {
        let mut list = PostingList::new();
        for i in 0..100 {
            list.push(i * 3, (i % 5) + 1);
        }

        let mut buffer = Vec::new();
        list.serialize(&mut buffer).unwrap();

        let deserialized = PostingList::deserialize(&mut &buffer[..]).unwrap();
        assert_eq!(deserialized.len(), list.len());

        for (a, b) in list.iter().zip(deserialized.iter()) {
            assert_eq!(a, b);
        }
    }

    #[test]
    fn test_posting_list_seek() {
        let mut list = PostingList::new();
        for i in 0..100 {
            list.push(i * 2, 1);
        }

        let mut iter = PostingListIterator::new(&list);

        assert_eq!(iter.seek(50), 50);
        assert_eq!(iter.seek(51), 52);
        assert_eq!(iter.seek(200), TERMINATED);
    }

    #[test]
    fn test_block_posting_list() {
        let mut list = PostingList::new();
        for i in 0..500 {
            list.push(i * 2, (i % 10) + 1);
        }

        let block_list = BlockPostingList::from_posting_list(&list).unwrap();
        assert_eq!(block_list.doc_count(), 500);

        let mut iter = block_list.iterator();
        assert_eq!(iter.doc(), 0);
        assert_eq!(iter.term_freq(), 1);

        assert_eq!(iter.seek(500), 500);
        assert_eq!(iter.seek(998), 998);
        assert_eq!(iter.seek(1000), TERMINATED);
    }

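    // Illustrative test (added): exercise `decode_block_into` directly on the
    // first block and on an out-of-range block index.
    #[test]
    fn test_decode_block_into_first_block() {
        let mut list = PostingList::new();
        for i in 0..300u32 {
            list.push(i * 2, (i % 10) + 1);
        }
        let block_list = BlockPostingList::from_posting_list(&list).unwrap();

        let mut doc_ids = Vec::new();
        let mut tfs = Vec::new();
        assert!(block_list.decode_block_into(0, &mut doc_ids, &mut tfs));
        assert_eq!(doc_ids.len(), BLOCK_SIZE);
        assert_eq!(doc_ids[0], 0);
        assert_eq!(doc_ids[BLOCK_SIZE - 1], (BLOCK_SIZE as u32 - 1) * 2);
        assert_eq!(tfs[0], 1);

        // Out-of-range indices report failure instead of panicking.
        assert!(!block_list.decode_block_into(block_list.num_blocks(), &mut doc_ids, &mut tfs));
    }
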
    #[test]
    fn test_block_posting_list_serialization() {
        let mut list = PostingList::new();
        for i in 0..300 {
            list.push(i * 3, i + 1);
        }

        let block_list = BlockPostingList::from_posting_list(&list).unwrap();

        let mut buffer = Vec::new();
        block_list.serialize(&mut buffer).unwrap();

        let deserialized = BlockPostingList::deserialize(&buffer[..]).unwrap();
        assert_eq!(deserialized.doc_count(), block_list.doc_count());

        let mut iter1 = block_list.iterator();
        let mut iter2 = deserialized.iterator();

        while iter1.doc() != TERMINATED {
            assert_eq!(iter1.doc(), iter2.doc());
            assert_eq!(iter1.term_freq(), iter2.term_freq());
            iter1.advance();
            iter2.advance();
        }
        assert_eq!(iter2.doc(), TERMINATED);
    }

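    // Illustrative test (added): the owning iterator obtained via
    // `into_iterator` decodes the same postings as the borrowing one.
    #[test]
    fn test_into_iterator_owned() {
        let mut list = PostingList::new();
        for i in 0..200u32 {
            list.push(i * 4, i % 6 + 1);
        }
        let block_list = BlockPostingList::from_posting_list(&list).unwrap();

        let mut borrowed = block_list.iterator();
        let mut owned = block_list.clone().into_iterator();
        while borrowed.doc() != TERMINATED {
            assert_eq!(borrowed.doc(), owned.doc());
            assert_eq!(borrowed.term_freq(), owned.term_freq());
            borrowed.advance();
            owned.advance();
        }
        assert_eq!(owned.doc(), TERMINATED);
    }
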
    /// Drains a block posting list into (doc_id, term_freq) pairs.
    fn collect_postings(bpl: &BlockPostingList) -> Vec<(u32, u32)> {
        let mut result = Vec::new();
        let mut it = bpl.iterator();
        while it.doc() != TERMINATED {
            result.push((it.doc(), it.term_freq()));
            it.advance();
        }
        result
    }

    /// Builds a compressed list from sorted (doc_id, term_freq) pairs.
    fn build_bpl(postings: &[(u32, u32)]) -> BlockPostingList {
        let mut pl = PostingList::new();
        for &(doc_id, tf) in postings {
            pl.push(doc_id, tf);
        }
        BlockPostingList::from_posting_list(&pl).unwrap()
    }

    fn serialize_bpl(bpl: &BlockPostingList) -> Vec<u8> {
        let mut buf = Vec::new();
        bpl.serialize(&mut buf).unwrap();
        buf
    }

    #[test]
    fn test_concatenate_blocks_two_segments() {
        // Segment A: even doc ids.
        let a: Vec<(u32, u32)> = (0..100).map(|i| (i * 2, i + 1)).collect();
        let bpl_a = build_bpl(&a);

        // Segment B: multiples of three, merged with a doc id offset of 200.
        let b: Vec<(u32, u32)> = (0..100).map(|i| (i * 3, i + 2)).collect();
        let bpl_b = build_bpl(&b);

        let merged =
            BlockPostingList::concatenate_blocks(&[(bpl_a.clone(), 0), (bpl_b.clone(), 200)])
                .unwrap();

        assert_eq!(merged.doc_count(), 200);

        let postings = collect_postings(&merged);
        assert_eq!(postings.len(), 200);

        // The first 100 postings come from A unchanged...
        for (i, p) in postings.iter().enumerate().take(100) {
            assert_eq!(*p, (i as u32 * 2, i as u32 + 1));
        }
        // ...the rest from B with the offset applied.
        for i in 0..100 {
            assert_eq!(postings[100 + i], (i as u32 * 3 + 200, i as u32 + 2));
        }
    }

    #[test]
    fn test_concatenate_streaming_matches_blocks() {
        let seg_a: Vec<(u32, u32)> = (0..250).map(|i| (i * 2, (i % 7) + 1)).collect();
        let seg_b: Vec<(u32, u32)> = (0..180).map(|i| (i * 5, (i % 3) + 1)).collect();
        let seg_c: Vec<(u32, u32)> = (0..90).map(|i| (i * 10, (i % 11) + 1)).collect();

        let bpl_a = build_bpl(&seg_a);
        let bpl_b = build_bpl(&seg_b);
        let bpl_c = build_bpl(&seg_c);

        let offset_b = 1000u32;
        let offset_c = 2000u32;

        // Reference: the in-memory block concatenation.
        let ref_merged = BlockPostingList::concatenate_blocks(&[
            (bpl_a.clone(), 0),
            (bpl_b.clone(), offset_b),
            (bpl_c.clone(), offset_c),
        ])
        .unwrap();
        let mut ref_buf = Vec::new();
        ref_merged.serialize(&mut ref_buf).unwrap();

        // Candidate: the streaming merge over serialized sources.
        let bytes_a = serialize_bpl(&bpl_a);
        let bytes_b = serialize_bpl(&bpl_b);
        let bytes_c = serialize_bpl(&bpl_c);

        let sources: Vec<(&[u8], u32)> =
            vec![(&bytes_a, 0), (&bytes_b, offset_b), (&bytes_c, offset_c)];
        let mut stream_buf = Vec::new();
        let (doc_count, bytes_written) =
            BlockPostingList::concatenate_streaming(&sources, &mut stream_buf).unwrap();

        assert_eq!(doc_count, 520);
        assert_eq!(bytes_written, stream_buf.len());

        let ref_postings = collect_postings(&BlockPostingList::deserialize(&ref_buf).unwrap());
        let stream_postings =
            collect_postings(&BlockPostingList::deserialize(&stream_buf).unwrap());

        assert_eq!(ref_postings.len(), stream_postings.len());
        for (i, (r, s)) in ref_postings.iter().zip(stream_postings.iter()).enumerate() {
            assert_eq!(r, s, "mismatch at posting {}", i);
        }
    }

    #[test]
    fn test_multi_round_merge() {
        // Four segments with identical doc ids but distinct tf patterns.
        let segments: Vec<Vec<(u32, u32)>> = (0..4)
            .map(|seg| (0..200).map(|i| (i * 3, (i + seg * 7) % 10 + 1)).collect())
            .collect();

        let bpls: Vec<BlockPostingList> = segments.iter().map(|s| build_bpl(s)).collect();
        let serialized: Vec<Vec<u8>> = bpls.iter().map(serialize_bpl).collect();

        // Round 1: merge segments 0+1 and 2+3.
        let mut merged_01 = Vec::new();
        let sources_01: Vec<(&[u8], u32)> = vec![(&serialized[0], 0), (&serialized[1], 600)];
        let (dc_01, _) =
            BlockPostingList::concatenate_streaming(&sources_01, &mut merged_01).unwrap();
        assert_eq!(dc_01, 400);

        let mut merged_23 = Vec::new();
        let sources_23: Vec<(&[u8], u32)> = vec![(&serialized[2], 0), (&serialized[3], 600)];
        let (dc_23, _) =
            BlockPostingList::concatenate_streaming(&sources_23, &mut merged_23).unwrap();
        assert_eq!(dc_23, 400);

        // Round 2: merge the two intermediate results.
        let mut final_merged = Vec::new();
        let sources_final: Vec<(&[u8], u32)> = vec![(&merged_01, 0), (&merged_23, 1200)];
        let (dc_final, _) =
            BlockPostingList::concatenate_streaming(&sources_final, &mut final_merged).unwrap();
        assert_eq!(dc_final, 800);

        let final_bpl = BlockPostingList::deserialize(&final_merged).unwrap();
        let postings = collect_postings(&final_bpl);
        assert_eq!(postings.len(), 800);

        // Segment boundaries after both rounds of offsets.
        assert_eq!(postings[0].0, 0);
        assert_eq!(postings[199].0, 597);
        assert_eq!(postings[200].0, 600);
        assert_eq!(postings[399].0, 1197);
        assert_eq!(postings[400].0, 1200);
        assert_eq!(postings[799].0, 2397);

        for seg in 0u32..4 {
            for i in 0u32..200 {
                let idx = (seg * 200 + i) as usize;
                assert_eq!(
                    postings[idx].1,
                    (i + seg * 7) % 10 + 1,
                    "seg{} tf[{}]",
                    seg,
                    i
                );
            }
        }

        let mut it = final_bpl.iterator();
        assert_eq!(it.seek(600), 600);
        assert_eq!(it.seek(1200), 1200);
        assert_eq!(it.seek(2397), 2397);
        assert_eq!(it.seek(2398), TERMINATED);
    }

    #[test]
    fn test_large_scale_merge() {
        // Five segments, each large enough to span many compressed blocks.
        let num_segments = 5;
        let docs_per_segment = 2000;
        let docs_gap = 3;

        let segments: Vec<Vec<(u32, u32)>> = (0..num_segments)
            .map(|seg| {
                (0..docs_per_segment)
                    .map(|i| (i as u32 * docs_gap, (i as u32 + seg as u32) % 20 + 1))
                    .collect()
            })
            .collect();

        let bpls: Vec<BlockPostingList> = segments.iter().map(|s| build_bpl(s)).collect();

        for bpl in &bpls {
            assert!(
                bpl.num_blocks() >= 15,
                "expected >=15 blocks, got {}",
                bpl.num_blocks()
            );
        }

        let serialized: Vec<Vec<u8>> = bpls.iter().map(serialize_bpl).collect();

        // Offsets chosen so the segments' doc id ranges do not overlap.
        let max_doc_per_seg = (docs_per_segment as u32 - 1) * docs_gap;
        let offsets: Vec<u32> = (0..num_segments)
            .map(|i| i as u32 * (max_doc_per_seg + 1))
            .collect();

        let sources: Vec<(&[u8], u32)> = serialized
            .iter()
            .zip(offsets.iter())
            .map(|(b, o)| (b.as_slice(), *o))
            .collect();

        let mut merged = Vec::new();
        let (doc_count, _) =
            BlockPostingList::concatenate_streaming(&sources, &mut merged).unwrap();
        assert_eq!(doc_count, (num_segments * docs_per_segment) as u32);

        let merged_bpl = BlockPostingList::deserialize(&merged).unwrap();
        let postings = collect_postings(&merged_bpl);
        assert_eq!(postings.len(), num_segments * docs_per_segment);

        // Doc ids must be increasing, except possibly at segment boundaries.
        for i in 1..postings.len() {
            assert!(
                postings[i].0 > postings[i - 1].0 || (i % docs_per_segment == 0),
                "doc_id not increasing at {}: {} vs {}",
                i,
                postings[i - 1].0,
                postings[i].0,
            );
        }

        // Each segment's first doc id must be reachable via seek.
        let mut it = merged_bpl.iterator();
        for (seg, &expected_first) in offsets.iter().enumerate() {
            assert_eq!(
                it.seek(expected_first),
                expected_first,
                "seek to segment {} start",
                seg
            );
        }
    }

    #[test]
    fn test_merge_edge_cases() {
        // Two single-posting lists.
        let bpl_a = build_bpl(&[(0, 5)]);
        let bpl_b = build_bpl(&[(0, 3)]);

        let merged =
            BlockPostingList::concatenate_blocks(&[(bpl_a.clone(), 0), (bpl_b.clone(), 1)])
                .unwrap();
        assert_eq!(merged.doc_count(), 2);
        let p = collect_postings(&merged);
        assert_eq!(p, vec![(0, 5), (1, 3)]);

        // A list that fills exactly one block.
        let exact_block: Vec<(u32, u32)> = (0..BLOCK_SIZE as u32).map(|i| (i, i % 5 + 1)).collect();
        let bpl_exact = build_bpl(&exact_block);
        assert_eq!(bpl_exact.num_blocks(), 1);

        let bytes = serialize_bpl(&bpl_exact);
        let mut out = Vec::new();
        let sources: Vec<(&[u8], u32)> = vec![(&bytes, 0), (&bytes, BLOCK_SIZE as u32)];
        let (dc, _) = BlockPostingList::concatenate_streaming(&sources, &mut out).unwrap();
        assert_eq!(dc, BLOCK_SIZE as u32 * 2);

        let merged = BlockPostingList::deserialize(&out).unwrap();
        let postings = collect_postings(&merged);
        assert_eq!(postings.len(), BLOCK_SIZE * 2);
        assert_eq!(postings[BLOCK_SIZE].0, BLOCK_SIZE as u32);

        // One posting past a block boundary forces a second block.
        let over_block: Vec<(u32, u32)> = (0..BLOCK_SIZE as u32 + 1).map(|i| (i * 2, 1)).collect();
        let bpl_over = build_bpl(&over_block);
        assert_eq!(bpl_over.num_blocks(), 2);
    }

    #[test]
    fn test_streaming_roundtrip_single_source() {
        let docs: Vec<(u32, u32)> = (0..500).map(|i| (i * 7, i % 15 + 1)).collect();
        let bpl = build_bpl(&docs);
        let direct = serialize_bpl(&bpl);

        let sources: Vec<(&[u8], u32)> = vec![(&direct, 0)];
        let mut streamed = Vec::new();
        BlockPostingList::concatenate_streaming(&sources, &mut streamed).unwrap();

        let p1 = collect_postings(&BlockPostingList::deserialize(&direct).unwrap());
        let p2 = collect_postings(&BlockPostingList::deserialize(&streamed).unwrap());
        assert_eq!(p1, p2);
    }

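    // Illustrative test (added): sources shorter than the footer are skipped,
    // as implied by the `continue` in `concatenate_streaming`, and the
    // remaining sources still merge with their own offsets.
    #[test]
    fn test_streaming_skips_truncated_source() {
        let docs: Vec<(u32, u32)> = (0..300).map(|i| (i * 2, i % 9 + 1)).collect();
        let bpl = build_bpl(&docs);
        let bytes = serialize_bpl(&bpl);

        let truncated: &[u8] = &[0u8; 4];
        let sources: Vec<(&[u8], u32)> = vec![(truncated, 0), (&bytes, 0), (&bytes, 1000)];
        let mut out = Vec::new();
        let (doc_count, _) = BlockPostingList::concatenate_streaming(&sources, &mut out).unwrap();
        assert_eq!(doc_count, 600);

        let merged = BlockPostingList::deserialize(&out).unwrap();
        let postings = collect_postings(&merged);
        assert_eq!(postings.len(), 600);
        assert_eq!(postings[300].0, 1000);
    }
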
    #[test]
    fn test_max_tf_preserved_through_merge() {
        // Segment A: tf spike of 50 at one document.
        let mut a = Vec::new();
        for i in 0..200 {
            a.push((i * 2, if i == 100 { 50 } else { 1 }));
        }
        let bpl_a = build_bpl(&a);
        assert_eq!(bpl_a.max_tf(), 50);

        // Segment B: smaller spike of 30.
        let mut b = Vec::new();
        for i in 0..200 {
            b.push((i * 2, if i == 50 { 30 } else { 2 }));
        }
        let bpl_b = build_bpl(&b);
        assert_eq!(bpl_b.max_tf(), 30);

        let bytes_a = serialize_bpl(&bpl_a);
        let bytes_b = serialize_bpl(&bpl_b);
        let sources: Vec<(&[u8], u32)> = vec![(&bytes_a, 0), (&bytes_b, 1000)];
        let mut out = Vec::new();
        BlockPostingList::concatenate_streaming(&sources, &mut out).unwrap();

        let merged = BlockPostingList::deserialize(&out).unwrap();
        assert_eq!(merged.max_tf(), 50);
        assert_eq!(merged.doc_count(), 400);
    }

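    // Illustrative test (added): per-block max term frequencies are exposed
    // both through `block_max_tf` and through the iterator's
    // `current_block_max_tf` as it moves between blocks.
    #[test]
    fn test_block_max_tf_accessors() {
        let docs: Vec<(u32, u32)> = (0..256u32).map(|i| (i, if i < 128 { 7 } else { 3 })).collect();
        let bpl = build_bpl(&docs);
        assert_eq!(bpl.num_blocks(), 2);
        assert_eq!(bpl.block_max_tf(0), Some(7));
        assert_eq!(bpl.block_max_tf(1), Some(3));
        assert_eq!(bpl.block_max_tf(2), None);

        let mut it = bpl.iterator();
        assert_eq!(it.current_block_max_tf(), 7);
        it.skip_to_next_block();
        assert_eq!(it.current_block_max_tf(), 3);
    }
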
    #[test]
    fn test_l0_l1_counts() {
        // A single partial block still gets one L1 entry.
        let bpl = build_bpl(&(0..50u32).map(|i| (i, 1)).collect::<Vec<_>>());
        assert_eq!(bpl.num_blocks(), 1);
        assert_eq!(bpl.l1_docs.len(), 1);

        // Exactly one full L1 group.
        let n = BLOCK_SIZE * L1_INTERVAL;
        let bpl = build_bpl(&(0..n as u32).map(|i| (i * 2, 1)).collect::<Vec<_>>());
        assert_eq!(bpl.num_blocks(), L1_INTERVAL);
        assert_eq!(bpl.l1_docs.len(), 1);

        // One posting past a full group adds a block and a second L1 entry.
        let n = BLOCK_SIZE * L1_INTERVAL + 1;
        let bpl = build_bpl(&(0..n as u32).map(|i| (i * 2, 1)).collect::<Vec<_>>());
        assert_eq!(bpl.num_blocks(), L1_INTERVAL + 1);
        assert_eq!(bpl.l1_docs.len(), 2);

        // Three full groups.
        let n = BLOCK_SIZE * L1_INTERVAL * 3;
        let bpl = build_bpl(&(0..n as u32).map(|i| (i, 1)).collect::<Vec<_>>());
        assert_eq!(bpl.num_blocks(), L1_INTERVAL * 3);
        assert_eq!(bpl.l1_docs.len(), 3);
    }

    #[test]
    fn test_l1_last_doc_values() {
        let n = BLOCK_SIZE * 20;
        let docs: Vec<(u32, u32)> = (0..n as u32).map(|i| (i * 3, 1)).collect();
        let bpl = build_bpl(&docs);
        assert_eq!(bpl.num_blocks(), 20);
        assert_eq!(bpl.l1_docs.len(), 3);

        // Each L1 entry equals the last doc of the final block in its group.
        let expected_l1_0 = bpl.block_last_doc(7).unwrap();
        assert_eq!(bpl.l1_docs[0], expected_l1_0);

        let expected_l1_1 = bpl.block_last_doc(15).unwrap();
        assert_eq!(bpl.l1_docs[1], expected_l1_1);

        let expected_l1_2 = bpl.block_last_doc(19).unwrap();
        assert_eq!(bpl.l1_docs[2], expected_l1_2);
    }

    #[test]
    fn test_seek_block_basic() {
        let n = BLOCK_SIZE * 20;
        let docs: Vec<(u32, u32)> = (0..n as u32).map(|i| (i * 10, 1)).collect();
        let bpl = build_bpl(&docs);

        assert_eq!(bpl.seek_block(0, 0), Some(0));

        // Seeking to each block's first doc lands on that block.
        for blk in 0..20 {
            let first = bpl.block_first_doc(blk).unwrap();
            assert_eq!(
                bpl.seek_block(first, 0),
                Some(blk),
                "seek to block {} first_doc",
                blk
            );
        }

        // Seeking to each block's last doc also lands on that block.
        for blk in 0..20 {
            let last = bpl.block_last_doc(blk).unwrap();
            assert_eq!(
                bpl.seek_block(last, 0),
                Some(blk),
                "seek to block {} last_doc",
                blk
            );
        }

        // Past the final doc there is nothing to find.
        let max_doc = bpl.block_last_doc(19).unwrap();
        assert_eq!(bpl.seek_block(max_doc + 1, 0), None);

        // Seeking never goes backwards: starting from block 11, a target in
        // block 10 resolves to block 11, whose last doc already covers it.
        let mid_doc = bpl.block_first_doc(10).unwrap();
        assert_eq!(bpl.seek_block(mid_doc, 10), Some(10));
        assert_eq!(bpl.seek_block(mid_doc, 11), Some(11));
    }

    #[test]
    fn test_seek_block_across_l1_boundaries() {
        let n = BLOCK_SIZE * 24;
        let docs: Vec<(u32, u32)> = (0..n as u32).map(|i| (i * 5, 1)).collect();
        let bpl = build_bpl(&docs);
        assert_eq!(bpl.l1_docs.len(), 3);

        for group in 0..3 {
            let blk = group * L1_INTERVAL;
            let target = bpl.block_first_doc(blk).unwrap();
            assert_eq!(
                bpl.seek_block(target, 0),
                Some(blk),
                "seek to group {} block {}",
                group,
                blk
            );
        }

        let target = bpl.block_first_doc(20).unwrap() + 1;
        assert_eq!(bpl.seek_block(target, 0), Some(20));
    }

    #[test]
    fn test_block_data_size_helper() {
        let docs: Vec<(u32, u32)> = (0..500u32).map(|i| (i * 7, (i % 20) + 1)).collect();
        let bpl = build_bpl(&docs);

        for blk in 0..bpl.num_blocks() {
            let (_, _, offset, _) = bpl.read_l0_entry(blk);
            let computed_size = block_data_size(&bpl.stream, offset as usize);

            // Interior blocks: size must equal the distance to the next
            // block's offset; the last block must end at the stream's end.
            if blk + 1 < bpl.num_blocks() {
                let (_, _, next_offset, _) = bpl.read_l0_entry(blk + 1);
                assert_eq!(
                    computed_size,
                    (next_offset - offset) as usize,
                    "block_data_size mismatch at block {}",
                    blk
                );
            } else {
                assert_eq!(
                    offset as usize + computed_size,
                    bpl.stream.len(),
                    "last block size mismatch"
                );
            }
        }
    }

    #[test]
    fn test_l0_entry_roundtrip() {
        let docs: Vec<(u32, u32)> = (0..1000u32).map(|i| (i * 3, (i % 10) + 1)).collect();
        let bpl = build_bpl(&docs);

        let bytes = serialize_bpl(&bpl);
        let bpl2 = BlockPostingList::deserialize(&bytes).unwrap();

        assert_eq!(bpl.num_blocks(), bpl2.num_blocks());
        for blk in 0..bpl.num_blocks() {
            assert_eq!(
                bpl.read_l0_entry(blk),
                bpl2.read_l0_entry(blk),
                "L0 entry mismatch at block {}",
                blk
            );
        }

        assert_eq!(bpl.l1_docs, bpl2.l1_docs);
    }

    #[test]
    fn test_zero_copy_deserialize_matches() {
        let docs: Vec<(u32, u32)> = (0..2000u32).map(|i| (i * 2, (i % 5) + 1)).collect();
        let bpl = build_bpl(&docs);
        let bytes = serialize_bpl(&bpl);

        let copied = BlockPostingList::deserialize(&bytes).unwrap();
        let zero_copy =
            BlockPostingList::deserialize_zero_copy(OwnedBytes::new(bytes.clone())).unwrap();

        assert_eq!(copied.l0_count, zero_copy.l0_count);
        assert_eq!(copied.l1_docs, zero_copy.l1_docs);
        assert_eq!(copied.doc_count, zero_copy.doc_count);
        assert_eq!(copied.max_tf, zero_copy.max_tf);

        let p1 = collect_postings(&copied);
        let p2 = collect_postings(&zero_copy);
        assert_eq!(p1, p2);
    }

    #[test]
    fn test_l1_preserved_through_streaming_merge() {
        let seg_a = build_bpl(&(0..1000u32).map(|i| (i * 2, 1)).collect::<Vec<_>>());
        let seg_b = build_bpl(&(0..800u32).map(|i| (i * 3, 2)).collect::<Vec<_>>());
        let seg_c = build_bpl(&(0..500u32).map(|i| (i * 5, 3)).collect::<Vec<_>>());

        let bytes_a = serialize_bpl(&seg_a);
        let bytes_b = serialize_bpl(&seg_b);
        let bytes_c = serialize_bpl(&seg_c);

        let sources: Vec<(&[u8], u32)> = vec![(&bytes_a, 0), (&bytes_b, 10000), (&bytes_c, 20000)];
        let mut out = Vec::new();
        BlockPostingList::concatenate_streaming(&sources, &mut out).unwrap();

        let merged = BlockPostingList::deserialize(&out).unwrap();
        let expected_l1_count = merged.num_blocks().div_ceil(L1_INTERVAL);
        assert_eq!(merged.l1_docs.len(), expected_l1_count);

        // Every L1 entry must equal the last doc of the final block in its group.
        for (i, &l1_doc) in merged.l1_docs.iter().enumerate() {
            let last_block_in_group = ((i + 1) * L1_INTERVAL - 1).min(merged.num_blocks() - 1);
            let expected = merged.block_last_doc(last_block_in_group).unwrap();
            assert_eq!(l1_doc, expected, "L1[{}] mismatch", i);
        }

        // And every block must remain reachable through seek_block.
        for blk in 0..merged.num_blocks() {
            let first = merged.block_first_doc(blk).unwrap();
            assert_eq!(merged.seek_block(first, 0), Some(blk));
        }
    }

    #[test]
    fn test_seek_block_single_block() {
        let bpl = build_bpl(&[(0, 1), (10, 2), (20, 3)]);
        assert_eq!(bpl.num_blocks(), 1);
        assert_eq!(bpl.l1_docs.len(), 1);

        assert_eq!(bpl.seek_block(0, 0), Some(0));
        assert_eq!(bpl.seek_block(10, 0), Some(0));
        assert_eq!(bpl.seek_block(20, 0), Some(0));
        assert_eq!(bpl.seek_block(21, 0), None);
    }

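    // Illustrative test (added): an empty posting list produces an exhausted
    // block iterator straight away.
    #[test]
    fn test_empty_block_posting_list() {
        let bpl = BlockPostingList::from_posting_list(&PostingList::new()).unwrap();
        assert_eq!(bpl.doc_count(), 0);
        assert_eq!(bpl.num_blocks(), 0);

        let mut it = bpl.iterator();
        assert_eq!(it.doc(), TERMINATED);
        assert_eq!(it.advance(), TERMINATED);
        assert_eq!(it.seek(5), TERMINATED);
    }
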
    #[test]
    fn test_footer_size() {
        let docs: Vec<(u32, u32)> = (0..500u32).map(|i| (i * 2, 1)).collect();
        let bpl = build_bpl(&docs);
        let bytes = serialize_bpl(&bpl);

        let expected =
            bpl.stream.len() + bpl.l0_count * L0_SIZE + bpl.l1_docs.len() * L1_SIZE + FOOTER_SIZE;
        assert_eq!(bytes.len(), expected);
    }

    #[test]
    fn test_seek_block_from_block_skips_earlier() {
        let n = BLOCK_SIZE * 16;
        let docs: Vec<(u32, u32)> = (0..n as u32).map(|i| (i * 3, 1)).collect();
        let bpl = build_bpl(&docs);

        // The target lives in block 5, but the search starts at block 8,
        // so the result can never be earlier than block 8.
        let target_in_5 = bpl.block_first_doc(5).unwrap() + 1;
        let result = bpl.seek_block(target_in_5, 8);
        assert!(result.is_some());
        assert!(result.unwrap() >= 8);
    }
}