use byteorder::{LittleEndian, WriteBytesExt};
use std::io::{self, Read, Write};

use super::posting_common::{read_vint, write_vint};
use crate::DocId;
use crate::directories::OwnedBytes;
use crate::structures::simd;

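/// A single posting: a document id plus the term frequency within that document.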
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Posting {
    pub doc_id: DocId,
    pub term_freq: u32,
}

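/// An in-memory posting list, kept sorted by doc id, used while building a segment.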
#[derive(Debug, Clone, Default)]
pub struct PostingList {
    postings: Vec<Posting>,
}

impl PostingList {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            postings: Vec::with_capacity(capacity),
        }
    }

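    /// Appends a posting. Doc ids must arrive in strictly increasing order.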
    pub fn push(&mut self, doc_id: DocId, term_freq: u32) {
        debug_assert!(
            self.postings.is_empty() || self.postings.last().unwrap().doc_id < doc_id,
            "Postings must be added in sorted order"
        );
        self.postings.push(Posting { doc_id, term_freq });
    }

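    /// Adds a posting, accumulating the term frequency when `doc_id` matches the
    /// last entry (the same term occurring several times in one document).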
    pub fn add(&mut self, doc_id: DocId, term_freq: u32) {
        if let Some(last) = self.postings.last_mut()
            && last.doc_id == doc_id
        {
            last.term_freq += term_freq;
            return;
        }
        self.postings.push(Posting { doc_id, term_freq });
    }

    pub fn doc_count(&self) -> u32 {
        self.postings.len() as u32
    }

    pub fn len(&self) -> usize {
        self.postings.len()
    }

    pub fn is_empty(&self) -> bool {
        self.postings.is_empty()
    }

    pub fn iter(&self) -> impl Iterator<Item = &Posting> {
        self.postings.iter()
    }

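    /// Serializes as a vint count followed by (doc id delta, term freq) vint pairs.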
    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
        write_vint(writer, self.postings.len() as u64)?;

        let mut prev_doc_id = 0u32;
        for posting in &self.postings {
            let delta = posting.doc_id - prev_doc_id;
            write_vint(writer, delta as u64)?;
            write_vint(writer, posting.term_freq as u64)?;
            prev_doc_id = posting.doc_id;
        }

        Ok(())
    }

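    /// Reads back the vint-encoded form written by [`Self::serialize`].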
    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
        let count = read_vint(reader)? as usize;
        let mut postings = Vec::with_capacity(count);

        let mut prev_doc_id = 0u32;
        for _ in 0..count {
            let delta = read_vint(reader)? as u32;
            let term_freq = read_vint(reader)? as u32;
            let doc_id = prev_doc_id + delta;
            postings.push(Posting { doc_id, term_freq });
            prev_doc_id = doc_id;
        }

        Ok(Self { postings })
    }
}

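/// A cursor over an in-memory [`PostingList`] with `doc`/`advance`/`seek` semantics.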
pub struct PostingListIterator<'a> {
    postings: &'a [Posting],
    position: usize,
}

impl<'a> PostingListIterator<'a> {
    pub fn new(posting_list: &'a PostingList) -> Self {
        Self {
            postings: &posting_list.postings,
            position: 0,
        }
    }

    pub fn doc(&self) -> DocId {
        if self.position < self.postings.len() {
            self.postings[self.position].doc_id
        } else {
            TERMINATED
        }
    }

    pub fn term_freq(&self) -> u32 {
        if self.position < self.postings.len() {
            self.postings[self.position].term_freq
        } else {
            0
        }
    }

    pub fn advance(&mut self) -> DocId {
        self.position += 1;
        self.doc()
    }

    pub fn seek(&mut self, target: DocId) -> DocId {
        let remaining = &self.postings[self.position..];
        let offset = remaining.partition_point(|p| p.doc_id < target);
        self.position += offset;
        self.doc()
    }

    pub fn size_hint(&self) -> usize {
        self.postings.len().saturating_sub(self.position)
    }
}

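/// Sentinel doc id returned by iterators once they are exhausted.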
pub const TERMINATED: DocId = DocId::MAX;

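/// Maximum number of postings per compressed block.
///
/// Each block starts with an 8-byte header (count as u16, base doc id as u32,
/// doc-delta bit width as u8, tf bit width as u8), followed by `count - 1`
/// bit-packed doc id deltas and `count` bit-packed term frequencies.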
pub const BLOCK_SIZE: usize = 128;

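/// Number of blocks summarized by one L1 skip entry.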
const L1_INTERVAL: usize = 8;

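/// Size in bytes of one L0 entry: first_doc (u32), last_doc (u32), offset (u32),
/// max_weight (f32).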
const L0_SIZE: usize = 16;

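/// Size in bytes of one L1 entry: the last doc id (u32) of a block group.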
const L1_SIZE: usize = 4;

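/// Size in bytes of the serialized footer: stream_len (u64), l0_count (u32),
/// l1_count (u32), doc_count (u32), max_tf (u32).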
const FOOTER_SIZE: usize = 24;

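/// Decodes the L0 entry at `idx` from the raw L0 byte region.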
#[inline]
fn read_l0(bytes: &[u8], idx: usize) -> (u32, u32, u32, f32) {
    let p = idx * L0_SIZE;
    let first_doc = u32::from_le_bytes(bytes[p..p + 4].try_into().unwrap());
    let last_doc = u32::from_le_bytes(bytes[p + 4..p + 8].try_into().unwrap());
    let offset = u32::from_le_bytes(bytes[p + 8..p + 12].try_into().unwrap());
    let max_weight = f32::from_le_bytes(bytes[p + 12..p + 16].try_into().unwrap());
    (first_doc, last_doc, offset, max_weight)
}

#[inline]
fn write_l0(buf: &mut Vec<u8>, first_doc: u32, last_doc: u32, offset: u32, max_weight: f32) {
    buf.extend_from_slice(&first_doc.to_le_bytes());
    buf.extend_from_slice(&last_doc.to_le_bytes());
    buf.extend_from_slice(&offset.to_le_bytes());
    buf.extend_from_slice(&max_weight.to_le_bytes());
}

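/// Computes the total byte size of the block starting at `pos` from its header alone.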
#[inline]
fn block_data_size(stream: &[u8], pos: usize) -> usize {
    let count = u16::from_le_bytes(stream[pos..pos + 2].try_into().unwrap()) as usize;
    let doc_rounded = simd::RoundedBitWidth::from_u8(stream[pos + 6]);
    let tf_rounded = simd::RoundedBitWidth::from_u8(stream[pos + 7]);
    let delta_bytes = if count > 1 {
        (count - 1) * doc_rounded.bytes_per_value()
    } else {
        0
    };
    8 + delta_bytes + count * tf_rounded.bytes_per_value()
}

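/// A block-compressed posting list with a two-level (L0/L1) skip index.
///
/// A minimal usage sketch (not a doctest; paths and error handling are
/// illustrative, adjust to the actual crate layout):
///
/// ```ignore
/// let mut list = PostingList::new();
/// list.push(3, 1);
/// list.push(17, 2);
///
/// // Compress, round-trip through bytes, then iterate.
/// let block_list = BlockPostingList::from_posting_list(&list).unwrap();
/// let mut bytes = Vec::new();
/// block_list.serialize(&mut bytes).unwrap();
/// let reloaded = BlockPostingList::deserialize(&bytes).unwrap();
///
/// let mut it = reloaded.iterator();
/// assert_eq!(it.doc(), 3);
/// assert_eq!(it.seek(17), 17);
/// assert_eq!(it.advance(), TERMINATED);
/// ```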
#[derive(Debug, Clone)]
pub struct BlockPostingList {
    /// Packed block data: one header plus bit-packed deltas and tfs per block.
    stream: OwnedBytes,
    /// L0 skip entries, one per block: (first_doc, last_doc, offset, max_weight).
    l0_bytes: OwnedBytes,
    /// Number of L0 entries (blocks).
    l0_count: usize,
    /// L1 skip list: last doc id of every group of `L1_INTERVAL` blocks.
    l1_docs: Vec<u32>,
    /// Total number of documents across all blocks.
    doc_count: u32,
    /// Maximum term frequency across the whole list.
    max_tf: u32,
}

impl BlockPostingList {
    #[inline]
    fn read_l0_entry(&self, idx: usize) -> (u32, u32, u32, f32) {
        read_l0(&self.l0_bytes, idx)
    }

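    /// Builds the block-compressed form: postings are split into blocks of up to
    /// `BLOCK_SIZE`, doc ids are delta-encoded and bit-packed per block, and an
    /// L0 entry (plus an L1 entry every `L1_INTERVAL` blocks) is recorded.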
    pub fn from_posting_list(list: &PostingList) -> io::Result<Self> {
        let mut stream: Vec<u8> = Vec::new();
        let mut l0_buf: Vec<u8> = Vec::new();
        let mut l1_docs: Vec<u32> = Vec::new();
        let mut l0_count = 0usize;
        let mut max_tf = 0u32;

        let postings = &list.postings;
        let mut i = 0;

        // Scratch buffers reused for each block's doc-id deltas and term frequencies.
        let mut deltas = Vec::with_capacity(BLOCK_SIZE);
        let mut tf_buf = Vec::with_capacity(BLOCK_SIZE);

        while i < postings.len() {
            let block_start = stream.len() as u32;
            let block_end = (i + BLOCK_SIZE).min(postings.len());
            let block = &postings[i..block_end];
            let count = block.len();

            let block_max_tf = block.iter().map(|p| p.term_freq).max().unwrap_or(0);
            max_tf = max_tf.max(block_max_tf);

            let base_doc_id = block.first().unwrap().doc_id;
            let last_doc_id = block.last().unwrap().doc_id;

            // Delta-encode doc ids against the previous entry in the block.
            deltas.clear();
            let mut prev = base_doc_id;
            for posting in block.iter().skip(1) {
                deltas.push(posting.doc_id - prev);
                prev = posting.doc_id;
            }
            let max_delta = deltas.iter().copied().max().unwrap_or(0);
            let doc_id_bits = simd::round_bit_width(simd::bits_needed(max_delta));

            tf_buf.clear();
            tf_buf.extend(block.iter().map(|p| p.term_freq));
            let tf_bits = simd::round_bit_width(simd::bits_needed(block_max_tf));

            // Block header: count, base doc id, and the two packed bit widths.
            stream.write_u16::<LittleEndian>(count as u16)?;
            stream.write_u32::<LittleEndian>(base_doc_id)?;
            stream.push(doc_id_bits);
            stream.push(tf_bits);

            if count > 1 {
                let rounded = simd::RoundedBitWidth::from_u8(doc_id_bits);
                let byte_count = (count - 1) * rounded.bytes_per_value();
                let start = stream.len();
                stream.resize(start + byte_count, 0);
                simd::pack_rounded(&deltas, rounded, &mut stream[start..]);
            }

            {
                let rounded = simd::RoundedBitWidth::from_u8(tf_bits);
                let byte_count = count * rounded.bytes_per_value();
                let start = stream.len();
                stream.resize(start + byte_count, 0);
                simd::pack_rounded(&tf_buf, rounded, &mut stream[start..]);
            }

            write_l0(
                &mut l0_buf,
                base_doc_id,
                last_doc_id,
                block_start,
                block_max_tf as f32,
            );
            l0_count += 1;

            if l0_count.is_multiple_of(L1_INTERVAL) {
                l1_docs.push(last_doc_id);
            }

            i = block_end;
        }

        // A trailing partial group still gets an L1 entry for its last block.
        if !l0_count.is_multiple_of(L1_INTERVAL) && l0_count > 0 {
            let (_, last_doc, _, _) = read_l0(&l0_buf, l0_count - 1);
            l1_docs.push(last_doc);
        }

        Ok(Self {
            stream: OwnedBytes::new(stream),
            l0_bytes: OwnedBytes::new(l0_buf),
            l0_count,
            l1_docs,
            doc_count: postings.len() as u32,
            max_tf,
        })
    }

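    /// Writes `[blocks][L0 entries][L1 doc ids][footer]`; see `FOOTER_SIZE` for the
    /// footer fields. The footer sits at the end so readers can locate the sections
    /// from the tail of the buffer.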
    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
        writer.write_all(&self.stream)?;
        writer.write_all(&self.l0_bytes)?;
        for &doc in &self.l1_docs {
            writer.write_u32::<LittleEndian>(doc)?;
        }

        // Fixed-size footer; see `FOOTER_SIZE`.
        writer.write_u64::<LittleEndian>(self.stream.len() as u64)?;
        writer.write_u32::<LittleEndian>(self.l0_count as u32)?;
        writer.write_u32::<LittleEndian>(self.l1_docs.len() as u32)?;
        writer.write_u32::<LittleEndian>(self.doc_count)?;
        writer.write_u32::<LittleEndian>(self.max_tf)?;

        Ok(())
    }

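    /// Parses the serialized form, copying the block and L0 regions into owned buffers.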
    pub fn deserialize(raw: &[u8]) -> io::Result<Self> {
        if raw.len() < FOOTER_SIZE {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "posting data too short",
            ));
        }

        let f = raw.len() - FOOTER_SIZE;
        let stream_len = u64::from_le_bytes(raw[f..f + 8].try_into().unwrap()) as usize;
        let l0_count = u32::from_le_bytes(raw[f + 8..f + 12].try_into().unwrap()) as usize;
        let l1_count = u32::from_le_bytes(raw[f + 12..f + 16].try_into().unwrap()) as usize;
        let doc_count = u32::from_le_bytes(raw[f + 16..f + 20].try_into().unwrap());
        let max_tf = u32::from_le_bytes(raw[f + 20..f + 24].try_into().unwrap());

        let l0_start = stream_len;
        let l0_end = l0_start + l0_count * L0_SIZE;
        let l1_start = l0_end;

        let l1_docs = Self::extract_l1_docs(&raw[l1_start..], l1_count);

        Ok(Self {
            stream: OwnedBytes::new(raw[..stream_len].to_vec()),
            l0_bytes: OwnedBytes::new(raw[l0_start..l0_end].to_vec()),
            l0_count,
            l1_docs,
            doc_count,
            max_tf,
        })
    }

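    /// Like [`Self::deserialize`], but slices the shared `OwnedBytes` instead of copying.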
    pub fn deserialize_zero_copy(raw: OwnedBytes) -> io::Result<Self> {
        if raw.len() < FOOTER_SIZE {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "posting data too short",
            ));
        }

        let f = raw.len() - FOOTER_SIZE;
        let stream_len = u64::from_le_bytes(raw[f..f + 8].try_into().unwrap()) as usize;
        let l0_count = u32::from_le_bytes(raw[f + 8..f + 12].try_into().unwrap()) as usize;
        let l1_count = u32::from_le_bytes(raw[f + 12..f + 16].try_into().unwrap()) as usize;
        let doc_count = u32::from_le_bytes(raw[f + 16..f + 20].try_into().unwrap());
        let max_tf = u32::from_le_bytes(raw[f + 20..f + 24].try_into().unwrap());

        let l0_start = stream_len;
        let l0_end = l0_start + l0_count * L0_SIZE;
        let l1_start = l0_end;

        let l1_docs = Self::extract_l1_docs(&raw[l1_start..], l1_count);

        Ok(Self {
            stream: raw.slice(0..stream_len),
            l0_bytes: raw.slice(l0_start..l0_end),
            l0_count,
            l1_docs,
            doc_count,
            max_tf,
        })
    }

    fn extract_l1_docs(bytes: &[u8], count: usize) -> Vec<u32> {
        let mut docs = Vec::with_capacity(count);
        for i in 0..count {
            let p = i * L1_SIZE;
            docs.push(u32::from_le_bytes(bytes[p..p + 4].try_into().unwrap()));
        }
        docs
    }

    pub fn doc_count(&self) -> u32 {
        self.doc_count
    }

    pub fn max_tf(&self) -> u32 {
        self.max_tf
    }

    pub fn num_blocks(&self) -> usize {
        self.l0_count
    }

    pub fn block_max_tf(&self, block_idx: usize) -> Option<u32> {
        if block_idx >= self.l0_count {
            return None;
        }
        let (_, _, _, max_weight) = self.read_l0_entry(block_idx);
        Some(max_weight as u32)
    }

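    /// Concatenates already-decoded lists, shifting every doc id by the per-source
    /// offset. Blocks are copied wholesale; only the base doc id in each block
    /// header and the L0/L1 entries are rewritten.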
    pub fn concatenate_blocks(sources: &[(BlockPostingList, u32)]) -> io::Result<Self> {
        let mut stream: Vec<u8> = Vec::new();
        let mut l0_buf: Vec<u8> = Vec::new();
        let mut l1_docs: Vec<u32> = Vec::new();
        let mut l0_count = 0usize;
        let mut total_docs = 0u32;
        let mut max_tf = 0u32;

        for (source, doc_offset) in sources {
            max_tf = max_tf.max(source.max_tf);
            for block_idx in 0..source.num_blocks() {
                let (first_doc, last_doc, offset, max_weight) = source.read_l0_entry(block_idx);
                let blk_size = block_data_size(&source.stream, offset as usize);
                let block_bytes = &source.stream[offset as usize..offset as usize + blk_size];

                let count = u16::from_le_bytes(block_bytes[0..2].try_into().unwrap());
                let new_offset = stream.len() as u32;

                // Rewrite the header's base doc id, then copy the rest of the block verbatim.
                stream.write_u16::<LittleEndian>(count)?;
                stream.write_u32::<LittleEndian>(first_doc + doc_offset)?;
                stream.extend_from_slice(&block_bytes[6..]);

                let new_last = last_doc + doc_offset;
                write_l0(
                    &mut l0_buf,
                    first_doc + doc_offset,
                    new_last,
                    new_offset,
                    max_weight,
                );
                l0_count += 1;
                total_docs += count as u32;

                if l0_count.is_multiple_of(L1_INTERVAL) {
                    l1_docs.push(new_last);
                }
            }
        }

        if !l0_count.is_multiple_of(L1_INTERVAL) && l0_count > 0 {
            let (_, last_doc, _, _) = read_l0(&l0_buf, l0_count - 1);
            l1_docs.push(last_doc);
        }

        Ok(Self {
            stream: OwnedBytes::new(stream),
            l0_bytes: OwnedBytes::new(l0_buf),
            l0_count,
            l1_docs,
            doc_count: total_docs,
            max_tf,
        })
    }

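    /// Merges serialized posting lists directly from their raw bytes into `writer`
    /// without decompressing any block, patching each block's base doc id in place.
    /// Returns the merged doc count and the total number of bytes written.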
    pub fn concatenate_streaming<W: Write>(
        sources: &[(&[u8], u32)],
        writer: &mut W,
    ) -> io::Result<(u32, usize)> {
        struct SourceMeta {
            stream_len: usize,
            l0_count: usize,
        }

        // First pass over the footers: gather stream lengths, block counts, and totals.
        let mut metas: Vec<SourceMeta> = Vec::with_capacity(sources.len());
        let mut total_docs = 0u32;
        let mut merged_max_tf = 0u32;

        for (raw, _) in sources {
            if raw.len() < FOOTER_SIZE {
                continue;
            }
            let f = raw.len() - FOOTER_SIZE;
            let stream_len = u64::from_le_bytes(raw[f..f + 8].try_into().unwrap()) as usize;
            let l0_count = u32::from_le_bytes(raw[f + 8..f + 12].try_into().unwrap()) as usize;
            let doc_count = u32::from_le_bytes(raw[f + 16..f + 20].try_into().unwrap());
            let max_tf = u32::from_le_bytes(raw[f + 20..f + 24].try_into().unwrap());
            total_docs += doc_count;
            merged_max_tf = merged_max_tf.max(max_tf);
            metas.push(SourceMeta {
                stream_len,
                l0_count,
            });
        }

        // Second pass: stream the patched blocks out while rebuilding L0/L1 in memory.
        let mut out_l0: Vec<u8> = Vec::new();
        let mut out_l1_docs: Vec<u32> = Vec::new();
        let mut out_l0_count = 0usize;
        let mut stream_written = 0u64;
        let mut patch_buf = [0u8; 8];

        for (src_idx, meta) in metas.iter().enumerate() {
            let (raw, doc_offset) = &sources[src_idx];
            let l0_base = meta.stream_len;
            let src_stream = &raw[..meta.stream_len];

            for i in 0..meta.l0_count {
                let (first_doc, last_doc, offset, max_weight) = read_l0(&raw[l0_base..], i);

                let blk_size = block_data_size(src_stream, offset as usize);
                let block = &src_stream[offset as usize..offset as usize + blk_size];

                let new_last = last_doc + doc_offset;
                write_l0(
                    &mut out_l0,
                    first_doc + doc_offset,
                    new_last,
                    stream_written as u32,
                    max_weight,
                );
                out_l0_count += 1;

                if out_l0_count.is_multiple_of(L1_INTERVAL) {
                    out_l1_docs.push(new_last);
                }

                // Patch the base doc id inside the 8-byte header, then copy the payload.
                patch_buf.copy_from_slice(&block[0..8]);
                let blk_first = u32::from_le_bytes(patch_buf[2..6].try_into().unwrap());
                patch_buf[2..6].copy_from_slice(&(blk_first + doc_offset).to_le_bytes());
                writer.write_all(&patch_buf)?;
                writer.write_all(&block[8..])?;

                stream_written += blk_size as u64;
            }
        }

        if !out_l0_count.is_multiple_of(L1_INTERVAL) && out_l0_count > 0 {
            let (_, last_doc, _, _) = read_l0(&out_l0, out_l0_count - 1);
            out_l1_docs.push(last_doc);
        }

        writer.write_all(&out_l0)?;
        for &doc in &out_l1_docs {
            writer.write_u32::<LittleEndian>(doc)?;
        }

        writer.write_u64::<LittleEndian>(stream_written)?;
        writer.write_u32::<LittleEndian>(out_l0_count as u32)?;
        writer.write_u32::<LittleEndian>(out_l1_docs.len() as u32)?;
        writer.write_u32::<LittleEndian>(total_docs)?;
        writer.write_u32::<LittleEndian>(merged_max_tf)?;

        let l1_bytes_len = out_l1_docs.len() * L1_SIZE;
        let total_bytes = stream_written as usize + out_l0.len() + l1_bytes_len + FOOTER_SIZE;
        Ok((total_docs, total_bytes))
    }

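    /// Decodes one block into the provided scratch vectors, returning `false` if
    /// `block_idx` is out of range. Doc ids come out as absolute values.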
    pub fn decode_block_into(
        &self,
        block_idx: usize,
        doc_ids: &mut Vec<u32>,
        tfs: &mut Vec<u32>,
    ) -> bool {
        if block_idx >= self.l0_count {
            return false;
        }

        let (_, _, offset, _) = self.read_l0_entry(block_idx);
        let pos = offset as usize;
        let blk_size = block_data_size(&self.stream, pos);
        let block_data = &self.stream[pos..pos + blk_size];

        let count = u16::from_le_bytes(block_data[0..2].try_into().unwrap()) as usize;
        let first_doc = u32::from_le_bytes(block_data[2..6].try_into().unwrap());
        let doc_id_bits = block_data[6];
        let tf_bits = block_data[7];

        doc_ids.clear();
        doc_ids.resize(count, 0);
        doc_ids[0] = first_doc;

        let doc_rounded = simd::RoundedBitWidth::from_u8(doc_id_bits);
        let deltas_bytes = if count > 1 {
            (count - 1) * doc_rounded.bytes_per_value()
        } else {
            0
        };

        if count > 1 {
            simd::unpack_rounded(
                &block_data[8..8 + deltas_bytes],
                doc_rounded,
                &mut doc_ids[1..],
                count - 1,
            );
            // Prefix-sum the deltas back into absolute doc ids.
            for i in 1..count {
                doc_ids[i] += doc_ids[i - 1];
            }
        }

        tfs.clear();
        tfs.resize(count, 0);
        let tf_rounded = simd::RoundedBitWidth::from_u8(tf_bits);
        let tfs_start = 8 + deltas_bytes;
        simd::unpack_rounded(
            &block_data[tfs_start..tfs_start + count * tf_rounded.bytes_per_value()],
            tf_rounded,
            tfs,
            count,
        );

        true
    }

    #[inline]
    pub fn block_first_doc(&self, block_idx: usize) -> Option<DocId> {
        if block_idx >= self.l0_count {
            return None;
        }
        let (first_doc, _, _, _) = self.read_l0_entry(block_idx);
        Some(first_doc)
    }

    #[inline]
    pub fn block_last_doc(&self, block_idx: usize) -> Option<DocId> {
        if block_idx >= self.l0_count {
            return None;
        }
        let (_, last_doc, _, _) = self.read_l0_entry(block_idx);
        Some(last_doc)
    }

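    /// Finds the first block at or after `from_block` whose last doc id is >= `target`,
    /// using the L1 entries to pick a group of blocks and a SIMD scan within it.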
    pub fn seek_block(&self, target: DocId, from_block: usize) -> Option<usize> {
        if from_block >= self.l0_count {
            return None;
        }

        let from_l1 = from_block / L1_INTERVAL;

        // L1: find the first group whose last doc id is >= target.
        let l1_idx = if !self.l1_docs.is_empty() {
            let idx = from_l1 + simd::find_first_ge_u32(&self.l1_docs[from_l1..], target);
            if idx >= self.l1_docs.len() {
                return None;
            }
            idx
        } else {
            return None;
        };

        // L0: scan the group's per-block last doc ids.
        let start = (l1_idx * L1_INTERVAL).max(from_block);
        let end = ((l1_idx + 1) * L1_INTERVAL).min(self.l0_count);
        let count = end - start;

        let mut last_docs = [u32::MAX; L1_INTERVAL];
        for (j, idx) in (start..end).enumerate() {
            let (_, ld, _, _) = read_l0(&self.l0_bytes, idx);
            last_docs[j] = ld;
        }
        let within = simd::find_first_ge_u32(&last_docs[..count], target);
        let block_idx = start + within;

        if block_idx < self.l0_count {
            Some(block_idx)
        } else {
            None
        }
    }

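    /// Returns a borrowing iterator over the list.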
    pub fn iterator(&self) -> BlockPostingIterator<'_> {
        BlockPostingIterator::new(self)
    }

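    /// Consumes the list into a self-contained (`'static`) iterator.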
    pub fn into_iterator(self) -> BlockPostingIterator<'static> {
        BlockPostingIterator::owned(self)
    }
}

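/// Cursor over a [`BlockPostingList`] that decodes one block at a time; the
/// `Cow` lets it either borrow the list or own it (see `into_iterator`).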
pub struct BlockPostingIterator<'a> {
    block_list: std::borrow::Cow<'a, BlockPostingList>,
    current_block: usize,
    block_doc_ids: Vec<u32>,
    block_tfs: Vec<u32>,
    position_in_block: usize,
    exhausted: bool,
}

impl<'a> BlockPostingIterator<'a> {
    fn new(block_list: &'a BlockPostingList) -> Self {
        let exhausted = block_list.l0_count == 0;
        let mut iter = Self {
            block_list: std::borrow::Cow::Borrowed(block_list),
            current_block: 0,
            block_doc_ids: Vec::with_capacity(BLOCK_SIZE),
            block_tfs: Vec::with_capacity(BLOCK_SIZE),
            position_in_block: 0,
            exhausted,
        };
        if !iter.exhausted {
            iter.load_block(0);
        }
        iter
    }

    fn owned(block_list: BlockPostingList) -> BlockPostingIterator<'static> {
        let exhausted = block_list.l0_count == 0;
        let mut iter = BlockPostingIterator {
            block_list: std::borrow::Cow::Owned(block_list),
            current_block: 0,
            block_doc_ids: Vec::with_capacity(BLOCK_SIZE),
            block_tfs: Vec::with_capacity(BLOCK_SIZE),
            position_in_block: 0,
            exhausted,
        };
        if !iter.exhausted {
            iter.load_block(0);
        }
        iter
    }

    fn load_block(&mut self, block_idx: usize) {
        if block_idx >= self.block_list.l0_count {
            self.exhausted = true;
            return;
        }

        self.current_block = block_idx;
        self.position_in_block = 0;

        self.block_list
            .decode_block_into(block_idx, &mut self.block_doc_ids, &mut self.block_tfs);
    }

    pub fn doc(&self) -> DocId {
        if self.exhausted {
            TERMINATED
        } else if self.position_in_block < self.block_doc_ids.len() {
            self.block_doc_ids[self.position_in_block]
        } else {
            TERMINATED
        }
    }

    pub fn term_freq(&self) -> u32 {
        if self.exhausted || self.position_in_block >= self.block_tfs.len() {
            0
        } else {
            self.block_tfs[self.position_in_block]
        }
    }

    pub fn advance(&mut self) -> DocId {
        if self.exhausted {
            return TERMINATED;
        }

        self.position_in_block += 1;
        if self.position_in_block >= self.block_doc_ids.len() {
            self.load_block(self.current_block + 1);
        }
        self.doc()
    }

    pub fn seek(&mut self, target: DocId) -> DocId {
        if self.exhausted {
            return TERMINATED;
        }

        let block_idx = match self.block_list.seek_block(target, self.current_block) {
            Some(idx) => idx,
            None => {
                self.exhausted = true;
                return TERMINATED;
            }
        };

        if block_idx != self.current_block {
            self.load_block(block_idx);
        }

        // Finish with a SIMD scan inside the decoded block.
        let remaining = &self.block_doc_ids[self.position_in_block..];
        let pos = crate::structures::simd::find_first_ge_u32(remaining, target);
        self.position_in_block += pos;

        if self.position_in_block >= self.block_doc_ids.len() {
            self.load_block(self.current_block + 1);
        }
        self.doc()
    }

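    /// Jumps to the first document of the next block, e.g. when block-level
    /// bounds (see `current_block_max_tf`) rule out the rest of the current one.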
    pub fn skip_to_next_block(&mut self) -> DocId {
        if self.exhausted {
            return TERMINATED;
        }
        self.load_block(self.current_block + 1);
        self.doc()
    }

    #[inline]
    pub fn current_block_idx(&self) -> usize {
        self.current_block
    }

    #[inline]
    pub fn num_blocks(&self) -> usize {
        self.block_list.l0_count
    }

    #[inline]
    pub fn current_block_max_tf(&self) -> u32 {
        if self.exhausted || self.current_block >= self.block_list.l0_count {
            0
        } else {
            let (_, _, _, max_weight) = self.block_list.read_l0_entry(self.current_block);
            max_weight as u32
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_posting_list_basic() {
        let mut list = PostingList::new();
        list.push(1, 2);
        list.push(5, 1);
        list.push(10, 3);

        assert_eq!(list.len(), 3);

        let mut iter = PostingListIterator::new(&list);
        assert_eq!(iter.doc(), 1);
        assert_eq!(iter.term_freq(), 2);

        assert_eq!(iter.advance(), 5);
        assert_eq!(iter.term_freq(), 1);

        assert_eq!(iter.advance(), 10);
        assert_eq!(iter.term_freq(), 3);

        assert_eq!(iter.advance(), TERMINATED);
    }

    #[test]
    fn test_posting_list_serialization() {
        let mut list = PostingList::new();
        for i in 0..100 {
            list.push(i * 3, (i % 5) + 1);
        }

        let mut buffer = Vec::new();
        list.serialize(&mut buffer).unwrap();

        let deserialized = PostingList::deserialize(&mut &buffer[..]).unwrap();
        assert_eq!(deserialized.len(), list.len());

        for (a, b) in list.iter().zip(deserialized.iter()) {
            assert_eq!(a, b);
        }
    }

    #[test]
    fn test_posting_list_seek() {
        let mut list = PostingList::new();
        for i in 0..100 {
            list.push(i * 2, 1);
        }

        let mut iter = PostingListIterator::new(&list);

        assert_eq!(iter.seek(50), 50);
        assert_eq!(iter.seek(51), 52);
        assert_eq!(iter.seek(200), TERMINATED);
    }

    #[test]
    fn test_block_posting_list() {
        let mut list = PostingList::new();
        for i in 0..500 {
            list.push(i * 2, (i % 10) + 1);
        }

        let block_list = BlockPostingList::from_posting_list(&list).unwrap();
        assert_eq!(block_list.doc_count(), 500);

        let mut iter = block_list.iterator();
        assert_eq!(iter.doc(), 0);
        assert_eq!(iter.term_freq(), 1);

        assert_eq!(iter.seek(500), 500);
        assert_eq!(iter.seek(998), 998);
        assert_eq!(iter.seek(1000), TERMINATED);
    }

    #[test]
    fn test_block_posting_list_serialization() {
        let mut list = PostingList::new();
        for i in 0..300 {
            list.push(i * 3, i + 1);
        }

        let block_list = BlockPostingList::from_posting_list(&list).unwrap();

        let mut buffer = Vec::new();
        block_list.serialize(&mut buffer).unwrap();

        let deserialized = BlockPostingList::deserialize(&buffer[..]).unwrap();
        assert_eq!(deserialized.doc_count(), block_list.doc_count());

        let mut iter1 = block_list.iterator();
        let mut iter2 = deserialized.iterator();

        while iter1.doc() != TERMINATED {
            assert_eq!(iter1.doc(), iter2.doc());
            assert_eq!(iter1.term_freq(), iter2.term_freq());
            iter1.advance();
            iter2.advance();
        }
        assert_eq!(iter2.doc(), TERMINATED);
    }

    fn collect_postings(bpl: &BlockPostingList) -> Vec<(u32, u32)> {
        let mut result = Vec::new();
        let mut it = bpl.iterator();
        while it.doc() != TERMINATED {
            result.push((it.doc(), it.term_freq()));
            it.advance();
        }
        result
    }

    fn build_bpl(postings: &[(u32, u32)]) -> BlockPostingList {
        let mut pl = PostingList::new();
        for &(doc_id, tf) in postings {
            pl.push(doc_id, tf);
        }
        BlockPostingList::from_posting_list(&pl).unwrap()
    }

    fn serialize_bpl(bpl: &BlockPostingList) -> Vec<u8> {
        let mut buf = Vec::new();
        bpl.serialize(&mut buf).unwrap();
        buf
    }

    #[test]
    fn test_concatenate_blocks_two_segments() {
        let a: Vec<(u32, u32)> = (0..100).map(|i| (i * 2, i + 1)).collect();
        let bpl_a = build_bpl(&a);

        let b: Vec<(u32, u32)> = (0..100).map(|i| (i * 3, i + 2)).collect();
        let bpl_b = build_bpl(&b);

        let merged =
            BlockPostingList::concatenate_blocks(&[(bpl_a.clone(), 0), (bpl_b.clone(), 200)])
                .unwrap();

        assert_eq!(merged.doc_count(), 200);

        let postings = collect_postings(&merged);
        assert_eq!(postings.len(), 200);

        // Segment A keeps its doc ids; segment B is rebased by 200.
        for (i, p) in postings.iter().enumerate().take(100) {
            assert_eq!(*p, (i as u32 * 2, i as u32 + 1));
        }
        for i in 0..100 {
            assert_eq!(postings[100 + i], (i as u32 * 3 + 200, i as u32 + 2));
        }
    }

    #[test]
    fn test_concatenate_streaming_matches_blocks() {
        let seg_a: Vec<(u32, u32)> = (0..250).map(|i| (i * 2, (i % 7) + 1)).collect();
        let seg_b: Vec<(u32, u32)> = (0..180).map(|i| (i * 5, (i % 3) + 1)).collect();
        let seg_c: Vec<(u32, u32)> = (0..90).map(|i| (i * 10, (i % 11) + 1)).collect();

        let bpl_a = build_bpl(&seg_a);
        let bpl_b = build_bpl(&seg_b);
        let bpl_c = build_bpl(&seg_c);

        let offset_b = 1000u32;
        let offset_c = 2000u32;

        let ref_merged = BlockPostingList::concatenate_blocks(&[
            (bpl_a.clone(), 0),
            (bpl_b.clone(), offset_b),
            (bpl_c.clone(), offset_c),
        ])
        .unwrap();
        let mut ref_buf = Vec::new();
        ref_merged.serialize(&mut ref_buf).unwrap();

        let bytes_a = serialize_bpl(&bpl_a);
        let bytes_b = serialize_bpl(&bpl_b);
        let bytes_c = serialize_bpl(&bpl_c);

        let sources: Vec<(&[u8], u32)> =
            vec![(&bytes_a, 0), (&bytes_b, offset_b), (&bytes_c, offset_c)];
        let mut stream_buf = Vec::new();
        let (doc_count, bytes_written) =
            BlockPostingList::concatenate_streaming(&sources, &mut stream_buf).unwrap();

        assert_eq!(doc_count, 520);
        assert_eq!(bytes_written, stream_buf.len());

        let ref_postings = collect_postings(&BlockPostingList::deserialize(&ref_buf).unwrap());
        let stream_postings =
            collect_postings(&BlockPostingList::deserialize(&stream_buf).unwrap());

        assert_eq!(ref_postings.len(), stream_postings.len());
        for (i, (r, s)) in ref_postings.iter().zip(stream_postings.iter()).enumerate() {
            assert_eq!(r, s, "mismatch at posting {}", i);
        }
    }

    #[test]
    fn test_multi_round_merge() {
        let segments: Vec<Vec<(u32, u32)>> = (0..4)
            .map(|seg| (0..200).map(|i| (i * 3, (i + seg * 7) % 10 + 1)).collect())
            .collect();

        let bpls: Vec<BlockPostingList> = segments.iter().map(|s| build_bpl(s)).collect();
        let serialized: Vec<Vec<u8>> = bpls.iter().map(serialize_bpl).collect();

        let mut merged_01 = Vec::new();
        let sources_01: Vec<(&[u8], u32)> = vec![(&serialized[0], 0), (&serialized[1], 600)];
        let (dc_01, _) =
            BlockPostingList::concatenate_streaming(&sources_01, &mut merged_01).unwrap();
        assert_eq!(dc_01, 400);

        let mut merged_23 = Vec::new();
        let sources_23: Vec<(&[u8], u32)> = vec![(&serialized[2], 0), (&serialized[3], 600)];
        let (dc_23, _) =
            BlockPostingList::concatenate_streaming(&sources_23, &mut merged_23).unwrap();
        assert_eq!(dc_23, 400);

        let mut final_merged = Vec::new();
        let sources_final: Vec<(&[u8], u32)> = vec![(&merged_01, 0), (&merged_23, 1200)];
        let (dc_final, _) =
            BlockPostingList::concatenate_streaming(&sources_final, &mut final_merged).unwrap();
        assert_eq!(dc_final, 800);

        let final_bpl = BlockPostingList::deserialize(&final_merged).unwrap();
        let postings = collect_postings(&final_bpl);
        assert_eq!(postings.len(), 800);

        // Final doc id layout: seg0 0..=597, seg1 600..=1197, seg2 1200..=1797, seg3 1800..=2397.
        assert_eq!(postings[0].0, 0);
        assert_eq!(postings[199].0, 597);
        assert_eq!(postings[200].0, 600);
        assert_eq!(postings[399].0, 1197);
        assert_eq!(postings[400].0, 1200);
        assert_eq!(postings[799].0, 2397);

        for seg in 0u32..4 {
            for i in 0u32..200 {
                let idx = (seg * 200 + i) as usize;
                assert_eq!(
                    postings[idx].1,
                    (i + seg * 7) % 10 + 1,
                    "seg{} tf[{}]",
                    seg,
                    i
                );
            }
        }

        let mut it = final_bpl.iterator();
        assert_eq!(it.seek(600), 600);
        assert_eq!(it.seek(1200), 1200);
        assert_eq!(it.seek(2397), 2397);
        assert_eq!(it.seek(2398), TERMINATED);
    }

    #[test]
    fn test_large_scale_merge() {
        let num_segments = 5;
        let docs_per_segment = 2000;
        let docs_gap = 3;

        let segments: Vec<Vec<(u32, u32)>> = (0..num_segments)
            .map(|seg| {
                (0..docs_per_segment)
                    .map(|i| (i as u32 * docs_gap, (i as u32 + seg as u32) % 20 + 1))
                    .collect()
            })
            .collect();

        let bpls: Vec<BlockPostingList> = segments.iter().map(|s| build_bpl(s)).collect();

        for bpl in &bpls {
            assert!(
                bpl.num_blocks() >= 15,
                "expected >=15 blocks, got {}",
                bpl.num_blocks()
            );
        }

        let serialized: Vec<Vec<u8>> = bpls.iter().map(serialize_bpl).collect();

        let max_doc_per_seg = (docs_per_segment as u32 - 1) * docs_gap;
        let offsets: Vec<u32> = (0..num_segments)
            .map(|i| i as u32 * (max_doc_per_seg + 1))
            .collect();

        let sources: Vec<(&[u8], u32)> = serialized
            .iter()
            .zip(offsets.iter())
            .map(|(b, o)| (b.as_slice(), *o))
            .collect();

        let mut merged = Vec::new();
        let (doc_count, _) =
            BlockPostingList::concatenate_streaming(&sources, &mut merged).unwrap();
        assert_eq!(doc_count, (num_segments * docs_per_segment) as u32);

        let merged_bpl = BlockPostingList::deserialize(&merged).unwrap();
        let postings = collect_postings(&merged_bpl);
        assert_eq!(postings.len(), num_segments * docs_per_segment);

        for i in 1..postings.len() {
            assert!(
                postings[i].0 > postings[i - 1].0 || (i % docs_per_segment == 0),
                "doc_id not increasing at {}: {} vs {}",
                i,
                postings[i - 1].0,
                postings[i].0,
            );
        }

        let mut it = merged_bpl.iterator();
        for (seg, &expected_first) in offsets.iter().enumerate() {
            assert_eq!(
                it.seek(expected_first),
                expected_first,
                "seek to segment {} start",
                seg
            );
        }
    }

    #[test]
    fn test_merge_edge_cases() {
        // Two single-posting lists.
        let bpl_a = build_bpl(&[(0, 5)]);
        let bpl_b = build_bpl(&[(0, 3)]);

        let merged =
            BlockPostingList::concatenate_blocks(&[(bpl_a.clone(), 0), (bpl_b.clone(), 1)])
                .unwrap();
        assert_eq!(merged.doc_count(), 2);
        let p = collect_postings(&merged);
        assert_eq!(p, vec![(0, 5), (1, 3)]);

        // Exactly one full block.
        let exact_block: Vec<(u32, u32)> = (0..BLOCK_SIZE as u32).map(|i| (i, i % 5 + 1)).collect();
        let bpl_exact = build_bpl(&exact_block);
        assert_eq!(bpl_exact.num_blocks(), 1);

        let bytes = serialize_bpl(&bpl_exact);
        let mut out = Vec::new();
        let sources: Vec<(&[u8], u32)> = vec![(&bytes, 0), (&bytes, BLOCK_SIZE as u32)];
        let (dc, _) = BlockPostingList::concatenate_streaming(&sources, &mut out).unwrap();
        assert_eq!(dc, BLOCK_SIZE as u32 * 2);

        let merged = BlockPostingList::deserialize(&out).unwrap();
        let postings = collect_postings(&merged);
        assert_eq!(postings.len(), BLOCK_SIZE * 2);
        assert_eq!(postings[BLOCK_SIZE].0, BLOCK_SIZE as u32);

        // One posting past a full block spills into a second block.
        let over_block: Vec<(u32, u32)> = (0..BLOCK_SIZE as u32 + 1).map(|i| (i * 2, 1)).collect();
        let bpl_over = build_bpl(&over_block);
        assert_eq!(bpl_over.num_blocks(), 2);
    }

    #[test]
    fn test_streaming_roundtrip_single_source() {
        let docs: Vec<(u32, u32)> = (0..500).map(|i| (i * 7, i % 15 + 1)).collect();
        let bpl = build_bpl(&docs);
        let direct = serialize_bpl(&bpl);

        let sources: Vec<(&[u8], u32)> = vec![(&direct, 0)];
        let mut streamed = Vec::new();
        BlockPostingList::concatenate_streaming(&sources, &mut streamed).unwrap();

        let p1 = collect_postings(&BlockPostingList::deserialize(&direct).unwrap());
        let p2 = collect_postings(&BlockPostingList::deserialize(&streamed).unwrap());
        assert_eq!(p1, p2);
    }

    #[test]
    fn test_max_tf_preserved_through_merge() {
        let mut a = Vec::new();
        for i in 0..200 {
            a.push((i * 2, if i == 100 { 50 } else { 1 }));
        }
        let bpl_a = build_bpl(&a);
        assert_eq!(bpl_a.max_tf(), 50);

        let mut b = Vec::new();
        for i in 0..200 {
            b.push((i * 2, if i == 50 { 30 } else { 2 }));
        }
        let bpl_b = build_bpl(&b);
        assert_eq!(bpl_b.max_tf(), 30);

        let bytes_a = serialize_bpl(&bpl_a);
        let bytes_b = serialize_bpl(&bpl_b);
        let sources: Vec<(&[u8], u32)> = vec![(&bytes_a, 0), (&bytes_b, 1000)];
        let mut out = Vec::new();
        BlockPostingList::concatenate_streaming(&sources, &mut out).unwrap();

        let merged = BlockPostingList::deserialize(&out).unwrap();
        assert_eq!(merged.max_tf(), 50);
        assert_eq!(merged.doc_count(), 400);
    }

    #[test]
    fn test_l0_l1_counts() {
        // Fewer postings than one block: one L0 entry, one trailing L1 entry.
        let bpl = build_bpl(&(0..50u32).map(|i| (i, 1)).collect::<Vec<_>>());
        assert_eq!(bpl.num_blocks(), 1);
        assert_eq!(bpl.l1_docs.len(), 1);

        // Exactly one full L1 group.
        let n = BLOCK_SIZE * L1_INTERVAL;
        let bpl = build_bpl(&(0..n as u32).map(|i| (i * 2, 1)).collect::<Vec<_>>());
        assert_eq!(bpl.num_blocks(), L1_INTERVAL);
        assert_eq!(bpl.l1_docs.len(), 1);

        // One posting past a full group starts a new block and a new L1 entry.
        let n = BLOCK_SIZE * L1_INTERVAL + 1;
        let bpl = build_bpl(&(0..n as u32).map(|i| (i * 2, 1)).collect::<Vec<_>>());
        assert_eq!(bpl.num_blocks(), L1_INTERVAL + 1);
        assert_eq!(bpl.l1_docs.len(), 2);

        // Three full groups.
        let n = BLOCK_SIZE * L1_INTERVAL * 3;
        let bpl = build_bpl(&(0..n as u32).map(|i| (i, 1)).collect::<Vec<_>>());
        assert_eq!(bpl.num_blocks(), L1_INTERVAL * 3);
        assert_eq!(bpl.l1_docs.len(), 3);
    }

    #[test]
    fn test_l1_last_doc_values() {
        let n = BLOCK_SIZE * 20;
        let docs: Vec<(u32, u32)> = (0..n as u32).map(|i| (i * 3, 1)).collect();
        let bpl = build_bpl(&docs);
        assert_eq!(bpl.num_blocks(), 20);
        assert_eq!(bpl.l1_docs.len(), 3);

        // L1 entries cover blocks [0..8), [8..16), and the trailing [16..20).
        let expected_l1_0 = bpl.block_last_doc(7).unwrap();
        assert_eq!(bpl.l1_docs[0], expected_l1_0);

        let expected_l1_1 = bpl.block_last_doc(15).unwrap();
        assert_eq!(bpl.l1_docs[1], expected_l1_1);

        let expected_l1_2 = bpl.block_last_doc(19).unwrap();
        assert_eq!(bpl.l1_docs[2], expected_l1_2);
    }

    #[test]
    fn test_seek_block_basic() {
        let n = BLOCK_SIZE * 20;
        let docs: Vec<(u32, u32)> = (0..n as u32).map(|i| (i * 10, 1)).collect();
        let bpl = build_bpl(&docs);

        assert_eq!(bpl.seek_block(0, 0), Some(0));

        for blk in 0..20 {
            let first = bpl.block_first_doc(blk).unwrap();
            assert_eq!(
                bpl.seek_block(first, 0),
                Some(blk),
                "seek to block {} first_doc",
                blk
            );
        }

        for blk in 0..20 {
            let last = bpl.block_last_doc(blk).unwrap();
            assert_eq!(
                bpl.seek_block(last, 0),
                Some(blk),
                "seek to block {} last_doc",
                blk
            );
        }

        let max_doc = bpl.block_last_doc(19).unwrap();
        assert_eq!(bpl.seek_block(max_doc + 1, 0), None);

        // Starting past the target's block clamps forward to `from_block`.
        let mid_doc = bpl.block_first_doc(10).unwrap();
        assert_eq!(bpl.seek_block(mid_doc, 10), Some(10));
        assert_eq!(bpl.seek_block(mid_doc, 11), Some(11));
    }

    #[test]
    fn test_seek_block_across_l1_boundaries() {
        let n = BLOCK_SIZE * 24;
        let docs: Vec<(u32, u32)> = (0..n as u32).map(|i| (i * 5, 1)).collect();
        let bpl = build_bpl(&docs);
        assert_eq!(bpl.l1_docs.len(), 3);

        // Seek to the first block of each L1 group.
        for group in 0..3 {
            let blk = group * L1_INTERVAL;
            let target = bpl.block_first_doc(blk).unwrap();
            assert_eq!(
                bpl.seek_block(target, 0),
                Some(blk),
                "seek to group {} block {}",
                group,
                blk
            );
        }

        let target = bpl.block_first_doc(20).unwrap() + 1;
        assert_eq!(bpl.seek_block(target, 0), Some(20));
    }

    #[test]
    fn test_block_data_size_helper() {
        let docs: Vec<(u32, u32)> = (0..500u32).map(|i| (i * 7, (i % 20) + 1)).collect();
        let bpl = build_bpl(&docs);

        for blk in 0..bpl.num_blocks() {
            let (_, _, offset, _) = bpl.read_l0_entry(blk);
            let computed_size = block_data_size(&bpl.stream, offset as usize);

            // Interior blocks: size must match the distance to the next block's offset.
            if blk + 1 < bpl.num_blocks() {
                let (_, _, next_offset, _) = bpl.read_l0_entry(blk + 1);
                assert_eq!(
                    computed_size,
                    (next_offset - offset) as usize,
                    "block_data_size mismatch at block {}",
                    blk
                );
            } else {
                // Last block: size must reach exactly the end of the stream.
                assert_eq!(
                    offset as usize + computed_size,
                    bpl.stream.len(),
                    "last block size mismatch"
                );
            }
        }
    }

    #[test]
    fn test_l0_entry_roundtrip() {
        let docs: Vec<(u32, u32)> = (0..1000u32).map(|i| (i * 3, (i % 10) + 1)).collect();
        let bpl = build_bpl(&docs);

        let bytes = serialize_bpl(&bpl);
        let bpl2 = BlockPostingList::deserialize(&bytes).unwrap();

        assert_eq!(bpl.num_blocks(), bpl2.num_blocks());
        for blk in 0..bpl.num_blocks() {
            assert_eq!(
                bpl.read_l0_entry(blk),
                bpl2.read_l0_entry(blk),
                "L0 entry mismatch at block {}",
                blk
            );
        }

        assert_eq!(bpl.l1_docs, bpl2.l1_docs);
    }

    #[test]
    fn test_zero_copy_deserialize_matches() {
        let docs: Vec<(u32, u32)> = (0..2000u32).map(|i| (i * 2, (i % 5) + 1)).collect();
        let bpl = build_bpl(&docs);
        let bytes = serialize_bpl(&bpl);

        let copied = BlockPostingList::deserialize(&bytes).unwrap();
        let zero_copy =
            BlockPostingList::deserialize_zero_copy(OwnedBytes::new(bytes.clone())).unwrap();

        assert_eq!(copied.l0_count, zero_copy.l0_count);
        assert_eq!(copied.l1_docs, zero_copy.l1_docs);
        assert_eq!(copied.doc_count, zero_copy.doc_count);
        assert_eq!(copied.max_tf, zero_copy.max_tf);

        let p1 = collect_postings(&copied);
        let p2 = collect_postings(&zero_copy);
        assert_eq!(p1, p2);
    }

    #[test]
    fn test_l1_preserved_through_streaming_merge() {
        let seg_a = build_bpl(&(0..1000u32).map(|i| (i * 2, 1)).collect::<Vec<_>>());
        let seg_b = build_bpl(&(0..800u32).map(|i| (i * 3, 2)).collect::<Vec<_>>());
        let seg_c = build_bpl(&(0..500u32).map(|i| (i * 5, 3)).collect::<Vec<_>>());

        let bytes_a = serialize_bpl(&seg_a);
        let bytes_b = serialize_bpl(&seg_b);
        let bytes_c = serialize_bpl(&seg_c);

        let sources: Vec<(&[u8], u32)> = vec![(&bytes_a, 0), (&bytes_b, 10000), (&bytes_c, 20000)];
        let mut out = Vec::new();
        BlockPostingList::concatenate_streaming(&sources, &mut out).unwrap();

        let merged = BlockPostingList::deserialize(&out).unwrap();
        let expected_l1_count = merged.num_blocks().div_ceil(L1_INTERVAL);
        assert_eq!(merged.l1_docs.len(), expected_l1_count);

        // Every L1 entry must equal the last doc id of the last block in its group.
        for (i, &l1_doc) in merged.l1_docs.iter().enumerate() {
            let last_block_in_group = ((i + 1) * L1_INTERVAL - 1).min(merged.num_blocks() - 1);
            let expected = merged.block_last_doc(last_block_in_group).unwrap();
            assert_eq!(l1_doc, expected, "L1[{}] mismatch", i);
        }

        for blk in 0..merged.num_blocks() {
            let first = merged.block_first_doc(blk).unwrap();
            assert_eq!(merged.seek_block(first, 0), Some(blk));
        }
    }

    #[test]
    fn test_seek_block_single_block() {
        let bpl = build_bpl(&[(0, 1), (10, 2), (20, 3)]);
        assert_eq!(bpl.num_blocks(), 1);
        assert_eq!(bpl.l1_docs.len(), 1);

        assert_eq!(bpl.seek_block(0, 0), Some(0));
        assert_eq!(bpl.seek_block(10, 0), Some(0));
        assert_eq!(bpl.seek_block(20, 0), Some(0));
        assert_eq!(bpl.seek_block(21, 0), None);
    }

    #[test]
    fn test_footer_size() {
        let docs: Vec<(u32, u32)> = (0..500u32).map(|i| (i * 2, 1)).collect();
        let bpl = build_bpl(&docs);
        let bytes = serialize_bpl(&bpl);

        let expected =
            bpl.stream.len() + bpl.l0_count * L0_SIZE + bpl.l1_docs.len() * L1_SIZE + FOOTER_SIZE;
        assert_eq!(bytes.len(), expected);
    }

    #[test]
    fn test_seek_block_from_block_skips_earlier() {
        let n = BLOCK_SIZE * 16;
        let docs: Vec<(u32, u32)> = (0..n as u32).map(|i| (i * 3, 1)).collect();
        let bpl = build_bpl(&docs);

        // The target lives in block 5, but the search starts at block 8: the
        // result is clamped forward rather than going backwards.
        let target_in_5 = bpl.block_first_doc(5).unwrap() + 1;
        let result = bpl.seek_block(target_in_5, 8);
        assert!(result.is_some());
        assert!(result.unwrap() >= 8);
    }
}