1use byteorder::{LittleEndian, WriteBytesExt};
9use std::io::{self, Read, Write};
10
11use super::posting_common::{read_vint, write_vint};
12use crate::DocId;
13use crate::directories::OwnedBytes;
14use crate::structures::simd;
15
/// A single posting: one (document id, term frequency) pair.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Posting {
    pub doc_id: DocId,
    pub term_freq: u32,
}
22
/// An uncompressed, in-memory posting list, kept sorted by ascending doc id.
#[derive(Debug, Clone, Default)]
pub struct PostingList {
    // Invariant: strictly increasing `doc_id` (enforced in `push`, maintained by `add`).
    postings: Vec<Posting>,
}
28
29impl PostingList {
30 pub fn new() -> Self {
31 Self::default()
32 }
33
34 pub fn with_capacity(capacity: usize) -> Self {
35 Self {
36 postings: Vec::with_capacity(capacity),
37 }
38 }
39
40 pub fn push(&mut self, doc_id: DocId, term_freq: u32) {
42 debug_assert!(
43 self.postings.is_empty() || self.postings.last().unwrap().doc_id < doc_id,
44 "Postings must be added in sorted order"
45 );
46 self.postings.push(Posting { doc_id, term_freq });
47 }
48
49 pub fn add(&mut self, doc_id: DocId, term_freq: u32) {
51 if let Some(last) = self.postings.last_mut()
52 && last.doc_id == doc_id
53 {
54 last.term_freq += term_freq;
55 return;
56 }
57 self.postings.push(Posting { doc_id, term_freq });
58 }
59
60 pub fn doc_count(&self) -> u32 {
62 self.postings.len() as u32
63 }
64
65 pub fn len(&self) -> usize {
66 self.postings.len()
67 }
68
69 pub fn is_empty(&self) -> bool {
70 self.postings.is_empty()
71 }
72
73 pub fn iter(&self) -> impl Iterator<Item = &Posting> {
74 self.postings.iter()
75 }
76
77 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
79 write_vint(writer, self.postings.len() as u64)?;
81
82 let mut prev_doc_id = 0u32;
83 for posting in &self.postings {
84 let delta = posting.doc_id - prev_doc_id;
86 write_vint(writer, delta as u64)?;
87 write_vint(writer, posting.term_freq as u64)?;
88 prev_doc_id = posting.doc_id;
89 }
90
91 Ok(())
92 }
93
94 pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
96 let count = read_vint(reader)? as usize;
97 let mut postings = Vec::with_capacity(count);
98
99 let mut prev_doc_id = 0u32;
100 for _ in 0..count {
101 let delta = read_vint(reader)? as u32;
102 let term_freq = read_vint(reader)? as u32;
103 let doc_id = prev_doc_id + delta;
104 postings.push(Posting { doc_id, term_freq });
105 prev_doc_id = doc_id;
106 }
107
108 Ok(Self { postings })
109 }
110}
111
/// Cursor over an uncompressed `PostingList` with doc/advance/seek access.
pub struct PostingListIterator<'a> {
    postings: &'a [Posting],
    // Index of the current posting; `>= postings.len()` means exhausted.
    position: usize,
}
117
118impl<'a> PostingListIterator<'a> {
119 pub fn new(posting_list: &'a PostingList) -> Self {
120 Self {
121 postings: &posting_list.postings,
122 position: 0,
123 }
124 }
125
126 pub fn doc(&self) -> DocId {
128 if self.position < self.postings.len() {
129 self.postings[self.position].doc_id
130 } else {
131 TERMINATED
132 }
133 }
134
135 pub fn term_freq(&self) -> u32 {
137 if self.position < self.postings.len() {
138 self.postings[self.position].term_freq
139 } else {
140 0
141 }
142 }
143
144 pub fn advance(&mut self) -> DocId {
146 self.position += 1;
147 self.doc()
148 }
149
150 pub fn seek(&mut self, target: DocId) -> DocId {
152 let remaining = &self.postings[self.position..];
153 let offset = remaining.partition_point(|p| p.doc_id < target);
154 self.position += offset;
155 self.doc()
156 }
157
158 pub fn size_hint(&self) -> usize {
160 self.postings.len().saturating_sub(self.position)
161 }
162}
163
/// Sentinel doc id returned by iterators once they are exhausted.
pub const TERMINATED: DocId = DocId::MAX;

/// Maximum number of postings packed into a single compressed block.
pub const BLOCK_SIZE: usize = 128;

/// One L1 skip entry is emitted for every `L1_INTERVAL` L0 entries (blocks).
const L1_INTERVAL: usize = 8;

/// Byte size of one L0 skip entry:
/// first_doc (u32) + last_doc (u32) + stream offset (u32) + max weight (f32).
const L0_SIZE: usize = 16;

/// Byte size of one L1 skip entry (the last doc id of the block group, u32).
const L1_SIZE: usize = 4;

/// Byte size of the serialized footer:
/// stream_len (u64) + l0_count (u32) + l1_count (u32) + doc_count (u32) + max_tf (u32).
const FOOTER_SIZE: usize = 24;
189
190#[inline]
194fn read_l0(bytes: &[u8], idx: usize) -> (u32, u32, u32, f32) {
195 let b = &bytes[idx * L0_SIZE..][..L0_SIZE];
196 let first_doc = u32::from_le_bytes([b[0], b[1], b[2], b[3]]);
197 let last_doc = u32::from_le_bytes([b[4], b[5], b[6], b[7]]);
198 let offset = u32::from_le_bytes([b[8], b[9], b[10], b[11]]);
199 let max_weight = f32::from_le_bytes([b[12], b[13], b[14], b[15]]);
200 (first_doc, last_doc, offset, max_weight)
201}
202
/// Appends one 16-byte L0 skip entry to `buf`: first/last doc id, the block's
/// byte offset in the stream, and the block's maximum weight — all little-endian.
#[inline]
fn write_l0(buf: &mut Vec<u8>, first_doc: u32, last_doc: u32, offset: u32, max_weight: f32) {
    for word in [first_doc, last_doc, offset] {
        buf.extend_from_slice(&word.to_le_bytes());
    }
    buf.extend_from_slice(&max_weight.to_le_bytes());
}
211
212#[inline]
217fn block_data_size(stream: &[u8], pos: usize) -> usize {
218 let count = u16::from_le_bytes(stream[pos..pos + 2].try_into().unwrap()) as usize;
219 let doc_rounded = simd::RoundedBitWidth::from_u8(stream[pos + 6]);
220 let tf_rounded = simd::RoundedBitWidth::from_u8(stream[pos + 7]);
221 let delta_bytes = if count > 1 {
222 (count - 1) * doc_rounded.bytes_per_value()
223 } else {
224 0
225 };
226 8 + delta_bytes + count * tf_rounded.bytes_per_value()
227}
228
/// A block-compressed posting list with two-level skip data.
#[derive(Debug, Clone)]
pub struct BlockPostingList {
    // Concatenated compressed blocks: per block an 8-byte header followed by
    // bit-packed doc-id deltas and bit-packed term frequencies.
    stream: OwnedBytes,
    // Packed L0 skip entries, one 16-byte entry per block
    // (first doc, last doc, stream offset, max weight).
    l0_bytes: OwnedBytes,
    // Number of blocks (= number of L0 entries).
    l0_count: usize,
    // L1 skip entries: the last doc id of each group of blocks.
    l1_docs: Vec<u32>,
    // Total number of postings across all blocks.
    doc_count: u32,
    // Maximum term frequency over all postings.
    max_tf: u32,
}
246
247impl BlockPostingList {
248 #[inline]
250 fn read_l0_entry(&self, idx: usize) -> (u32, u32, u32, f32) {
251 read_l0(&self.l0_bytes, idx)
252 }
253
254 pub fn from_posting_list(list: &PostingList) -> io::Result<Self> {
263 let mut stream: Vec<u8> = Vec::new();
264 let mut l0_buf: Vec<u8> = Vec::new();
265 let mut l1_docs: Vec<u32> = Vec::new();
266 let mut l0_count = 0usize;
267 let mut max_tf = 0u32;
268
269 let postings = &list.postings;
270 let mut i = 0;
271
272 let mut deltas = Vec::with_capacity(BLOCK_SIZE);
274 let mut tf_buf = Vec::with_capacity(BLOCK_SIZE);
275
276 while i < postings.len() {
277 if stream.len() > u32::MAX as usize {
278 return Err(io::Error::new(
279 io::ErrorKind::InvalidData,
280 "posting list stream exceeds u32::MAX bytes",
281 ));
282 }
283 let block_start = stream.len() as u32;
284 let block_end = (i + BLOCK_SIZE).min(postings.len());
285 let block = &postings[i..block_end];
286 let count = block.len();
287
288 let block_max_tf = block.iter().map(|p| p.term_freq).max().unwrap_or(0);
290 max_tf = max_tf.max(block_max_tf);
291
292 let base_doc_id = block.first().unwrap().doc_id;
293 let last_doc_id = block.last().unwrap().doc_id;
294
295 deltas.clear();
297 let mut prev = base_doc_id;
298 for posting in block.iter().skip(1) {
299 deltas.push(posting.doc_id - prev);
300 prev = posting.doc_id;
301 }
302 let max_delta = deltas.iter().copied().max().unwrap_or(0);
303 let doc_id_bits = simd::round_bit_width(simd::bits_needed(max_delta));
304
305 tf_buf.clear();
307 tf_buf.extend(block.iter().map(|p| p.term_freq));
308 let tf_bits = simd::round_bit_width(simd::bits_needed(block_max_tf));
309
310 stream.write_u16::<LittleEndian>(count as u16)?;
312 stream.write_u32::<LittleEndian>(base_doc_id)?;
313 stream.push(doc_id_bits);
314 stream.push(tf_bits);
315
316 if count > 1 {
318 let rounded = simd::RoundedBitWidth::from_u8(doc_id_bits);
319 let byte_count = (count - 1) * rounded.bytes_per_value();
320 let start = stream.len();
321 stream.resize(start + byte_count, 0);
322 simd::pack_rounded(&deltas, rounded, &mut stream[start..]);
323 }
324
325 {
327 let rounded = simd::RoundedBitWidth::from_u8(tf_bits);
328 let byte_count = count * rounded.bytes_per_value();
329 let start = stream.len();
330 stream.resize(start + byte_count, 0);
331 simd::pack_rounded(&tf_buf, rounded, &mut stream[start..]);
332 }
333
334 write_l0(
336 &mut l0_buf,
337 base_doc_id,
338 last_doc_id,
339 block_start,
340 block_max_tf as f32,
341 );
342 l0_count += 1;
343
344 if l0_count.is_multiple_of(L1_INTERVAL) {
346 l1_docs.push(last_doc_id);
347 }
348
349 i = block_end;
350 }
351
352 if !l0_count.is_multiple_of(L1_INTERVAL) && l0_count > 0 {
354 let (_, last_doc, _, _) = read_l0(&l0_buf, l0_count - 1);
355 l1_docs.push(last_doc);
356 }
357
358 Ok(Self {
359 stream: OwnedBytes::new(stream),
360 l0_bytes: OwnedBytes::new(l0_buf),
361 l0_count,
362 l1_docs,
363 doc_count: postings.len() as u32,
364 max_tf,
365 })
366 }
367
368 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
378 writer.write_all(&self.stream)?;
379 writer.write_all(&self.l0_bytes)?;
380 for &doc in &self.l1_docs {
381 writer.write_u32::<LittleEndian>(doc)?;
382 }
383
384 writer.write_u64::<LittleEndian>(self.stream.len() as u64)?;
386 writer.write_u32::<LittleEndian>(self.l0_count as u32)?;
387 writer.write_u32::<LittleEndian>(self.l1_docs.len() as u32)?;
388 writer.write_u32::<LittleEndian>(self.doc_count)?;
389 writer.write_u32::<LittleEndian>(self.max_tf)?;
390
391 Ok(())
392 }
393
394 pub fn deserialize(raw: &[u8]) -> io::Result<Self> {
396 if raw.len() < FOOTER_SIZE {
397 return Err(io::Error::new(
398 io::ErrorKind::InvalidData,
399 "posting data too short",
400 ));
401 }
402
403 let f = raw.len() - FOOTER_SIZE;
404 let stream_len = u64::from_le_bytes(raw[f..f + 8].try_into().unwrap()) as usize;
405 let l0_count = u32::from_le_bytes(raw[f + 8..f + 12].try_into().unwrap()) as usize;
406 let l1_count = u32::from_le_bytes(raw[f + 12..f + 16].try_into().unwrap()) as usize;
407 let doc_count = u32::from_le_bytes(raw[f + 16..f + 20].try_into().unwrap());
408 let max_tf = u32::from_le_bytes(raw[f + 20..f + 24].try_into().unwrap());
409
410 let l0_start = stream_len;
411 let l0_end = l0_start + l0_count * L0_SIZE;
412 let l1_start = l0_end;
413
414 let l1_docs = Self::extract_l1_docs(&raw[l1_start..], l1_count);
415
416 Ok(Self {
417 stream: OwnedBytes::new(raw[..stream_len].to_vec()),
418 l0_bytes: OwnedBytes::new(raw[l0_start..l0_end].to_vec()),
419 l0_count,
420 l1_docs,
421 doc_count,
422 max_tf,
423 })
424 }
425
426 pub fn deserialize_zero_copy(raw: OwnedBytes) -> io::Result<Self> {
430 if raw.len() < FOOTER_SIZE {
431 return Err(io::Error::new(
432 io::ErrorKind::InvalidData,
433 "posting data too short",
434 ));
435 }
436
437 let f = raw.len() - FOOTER_SIZE;
438 let stream_len = u64::from_le_bytes(raw[f..f + 8].try_into().unwrap()) as usize;
439 let l0_count = u32::from_le_bytes(raw[f + 8..f + 12].try_into().unwrap()) as usize;
440 let l1_count = u32::from_le_bytes(raw[f + 12..f + 16].try_into().unwrap()) as usize;
441 let doc_count = u32::from_le_bytes(raw[f + 16..f + 20].try_into().unwrap());
442 let max_tf = u32::from_le_bytes(raw[f + 20..f + 24].try_into().unwrap());
443
444 let l0_start = stream_len;
445 let l0_end = l0_start + l0_count * L0_SIZE;
446 let l1_start = l0_end;
447
448 let l1_docs = Self::extract_l1_docs(&raw[l1_start..], l1_count);
449
450 Ok(Self {
451 stream: raw.slice(0..stream_len),
452 l0_bytes: raw.slice(l0_start..l0_end),
453 l0_count,
454 l1_docs,
455 doc_count,
456 max_tf,
457 })
458 }
459
460 fn extract_l1_docs(bytes: &[u8], count: usize) -> Vec<u32> {
462 let mut docs = Vec::with_capacity(count);
463 for i in 0..count {
464 let p = i * L1_SIZE;
465 docs.push(u32::from_le_bytes(bytes[p..p + 4].try_into().unwrap()));
466 }
467 docs
468 }
469
470 pub fn doc_count(&self) -> u32 {
471 self.doc_count
472 }
473
474 pub fn max_tf(&self) -> u32 {
476 self.max_tf
477 }
478
479 pub fn num_blocks(&self) -> usize {
481 self.l0_count
482 }
483
484 pub fn block_max_tf(&self, block_idx: usize) -> Option<u32> {
486 if block_idx >= self.l0_count {
487 return None;
488 }
489 let (_, _, _, max_weight) = self.read_l0_entry(block_idx);
490 Some(max_weight as u32)
491 }
492
493 pub fn concatenate_blocks(sources: &[(BlockPostingList, u32)]) -> io::Result<Self> {
496 let mut stream: Vec<u8> = Vec::new();
497 let mut l0_buf: Vec<u8> = Vec::new();
498 let mut l1_docs: Vec<u32> = Vec::new();
499 let mut l0_count = 0usize;
500 let mut total_docs = 0u32;
501 let mut max_tf = 0u32;
502
503 for (source, doc_offset) in sources {
504 max_tf = max_tf.max(source.max_tf);
505 for block_idx in 0..source.num_blocks() {
506 let (first_doc, last_doc, offset, max_weight) = source.read_l0_entry(block_idx);
507 let blk_size = block_data_size(&source.stream, offset as usize);
508 let block_bytes = &source.stream[offset as usize..offset as usize + blk_size];
509
510 let count = u16::from_le_bytes(block_bytes[0..2].try_into().unwrap());
511 if stream.len() > u32::MAX as usize {
512 return Err(io::Error::new(
513 io::ErrorKind::InvalidData,
514 "posting list stream exceeds u32::MAX bytes during concatenation",
515 ));
516 }
517 let new_offset = stream.len() as u32;
518
519 stream.write_u16::<LittleEndian>(count)?;
521 stream.write_u32::<LittleEndian>(first_doc + doc_offset)?;
522 stream.extend_from_slice(&block_bytes[6..]);
523
524 let new_last = last_doc + doc_offset;
525 write_l0(
526 &mut l0_buf,
527 first_doc + doc_offset,
528 new_last,
529 new_offset,
530 max_weight,
531 );
532 l0_count += 1;
533 total_docs += count as u32;
534
535 if l0_count.is_multiple_of(L1_INTERVAL) {
536 l1_docs.push(new_last);
537 }
538 }
539 }
540
541 if !l0_count.is_multiple_of(L1_INTERVAL) && l0_count > 0 {
543 let (_, last_doc, _, _) = read_l0(&l0_buf, l0_count - 1);
544 l1_docs.push(last_doc);
545 }
546
547 Ok(Self {
548 stream: OwnedBytes::new(stream),
549 l0_bytes: OwnedBytes::new(l0_buf),
550 l0_count,
551 l1_docs,
552 doc_count: total_docs,
553 max_tf,
554 })
555 }
556
557 pub fn concatenate_streaming<W: Write>(
568 sources: &[(&[u8], u32)], writer: &mut W,
570 ) -> io::Result<(u32, usize)> {
571 struct SourceMeta {
572 stream_len: usize,
573 l0_count: usize,
574 }
575
576 let mut metas: Vec<SourceMeta> = Vec::with_capacity(sources.len());
577 let mut total_docs = 0u32;
578 let mut merged_max_tf = 0u32;
579
580 for (raw, _) in sources {
581 if raw.len() < FOOTER_SIZE {
582 continue;
583 }
584 let f = raw.len() - FOOTER_SIZE;
585 let stream_len = u64::from_le_bytes(raw[f..f + 8].try_into().unwrap()) as usize;
586 let l0_count = u32::from_le_bytes(raw[f + 8..f + 12].try_into().unwrap()) as usize;
587 let doc_count = u32::from_le_bytes(raw[f + 16..f + 20].try_into().unwrap());
589 let max_tf = u32::from_le_bytes(raw[f + 20..f + 24].try_into().unwrap());
590 total_docs += doc_count;
591 merged_max_tf = merged_max_tf.max(max_tf);
592 metas.push(SourceMeta {
593 stream_len,
594 l0_count,
595 });
596 }
597
598 let mut out_l0: Vec<u8> = Vec::new();
601 let mut out_l1_docs: Vec<u32> = Vec::new();
602 let mut out_l0_count = 0usize;
603 let mut stream_written = 0u64;
604 let mut patch_buf = [0u8; 8];
605
606 for (src_idx, meta) in metas.iter().enumerate() {
607 let (raw, doc_offset) = &sources[src_idx];
608 let l0_base = meta.stream_len; let src_stream = &raw[..meta.stream_len];
610
611 for i in 0..meta.l0_count {
612 let (first_doc, last_doc, offset, max_weight) = read_l0(&raw[l0_base..], i);
614
615 let blk_size = block_data_size(src_stream, offset as usize);
617 let block = &src_stream[offset as usize..offset as usize + blk_size];
618
619 let new_last = last_doc + doc_offset;
621 if stream_written > u32::MAX as u64 {
622 return Err(io::Error::new(
623 io::ErrorKind::InvalidData,
624 "posting list stream exceeds u32::MAX bytes during streaming merge",
625 ));
626 }
627 write_l0(
628 &mut out_l0,
629 first_doc + doc_offset,
630 new_last,
631 stream_written as u32,
632 max_weight,
633 );
634 out_l0_count += 1;
635
636 if out_l0_count.is_multiple_of(L1_INTERVAL) {
638 out_l1_docs.push(new_last);
639 }
640
641 patch_buf.copy_from_slice(&block[0..8]);
643 let blk_first = u32::from_le_bytes(patch_buf[2..6].try_into().unwrap());
644 patch_buf[2..6].copy_from_slice(&(blk_first + doc_offset).to_le_bytes());
645 writer.write_all(&patch_buf)?;
646 writer.write_all(&block[8..])?;
647
648 stream_written += blk_size as u64;
649 }
650 }
651
652 if !out_l0_count.is_multiple_of(L1_INTERVAL) && out_l0_count > 0 {
654 let (_, last_doc, _, _) = read_l0(&out_l0, out_l0_count - 1);
655 out_l1_docs.push(last_doc);
656 }
657
658 writer.write_all(&out_l0)?;
660 for &doc in &out_l1_docs {
661 writer.write_u32::<LittleEndian>(doc)?;
662 }
663
664 writer.write_u64::<LittleEndian>(stream_written)?;
665 writer.write_u32::<LittleEndian>(out_l0_count as u32)?;
666 writer.write_u32::<LittleEndian>(out_l1_docs.len() as u32)?;
667 writer.write_u32::<LittleEndian>(total_docs)?;
668 writer.write_u32::<LittleEndian>(merged_max_tf)?;
669
670 let l1_bytes_len = out_l1_docs.len() * L1_SIZE;
671 let total_bytes = stream_written as usize + out_l0.len() + l1_bytes_len + FOOTER_SIZE;
672 Ok((total_docs, total_bytes))
673 }
674
675 pub fn decode_block_into(
682 &self,
683 block_idx: usize,
684 doc_ids: &mut Vec<u32>,
685 tfs: &mut Vec<u32>,
686 ) -> bool {
687 if let Some((offset, tf_start, count)) = self.decode_block_doc_ids_only(block_idx, doc_ids)
688 {
689 self.decode_block_tfs_deferred(offset, tf_start, count, tfs);
690 true
691 } else {
692 false
693 }
694 }
695
696 pub fn decode_block_doc_ids_only(
701 &self,
702 block_idx: usize,
703 doc_ids: &mut Vec<u32>,
704 ) -> Option<(usize, usize, usize)> {
705 if block_idx >= self.l0_count {
706 return None;
707 }
708
709 let (_, _, offset, _) = self.read_l0_entry(block_idx);
710 let pos = offset as usize;
711 let blk_size = block_data_size(&self.stream, pos);
712 let block_data = &self.stream[pos..pos + blk_size];
713
714 let count = u16::from_le_bytes(block_data[0..2].try_into().unwrap()) as usize;
716 let first_doc = u32::from_le_bytes(block_data[2..6].try_into().unwrap());
717 let doc_id_bits = block_data[6];
718
719 doc_ids.clear();
720 doc_ids.resize(count, 0);
721 doc_ids[0] = first_doc;
722
723 let doc_rounded = simd::RoundedBitWidth::from_u8(doc_id_bits);
724 let deltas_bytes = if count > 1 {
725 (count - 1) * doc_rounded.bytes_per_value()
726 } else {
727 0
728 };
729
730 if count > 1 {
731 simd::unpack_rounded(
732 &block_data[8..8 + deltas_bytes],
733 doc_rounded,
734 &mut doc_ids[1..],
735 count - 1,
736 );
737 for i in 1..count {
738 doc_ids[i] += doc_ids[i - 1];
739 }
740 }
741
742 let tfs_start = 8 + deltas_bytes;
743 Some((pos, tfs_start, count))
744 }
745
746 pub fn decode_block_tfs_deferred(
750 &self,
751 block_offset: usize,
752 tf_start: usize,
753 count: usize,
754 tfs: &mut Vec<u32>,
755 ) {
756 let blk_size = block_data_size(&self.stream, block_offset);
757 let block_data = &self.stream[block_offset..block_offset + blk_size];
758 let tf_bits = block_data[7];
759 let tf_rounded = simd::RoundedBitWidth::from_u8(tf_bits);
760
761 tfs.clear();
762 tfs.resize(count, 0);
763 simd::unpack_rounded(
764 &block_data[tf_start..tf_start + count * tf_rounded.bytes_per_value()],
765 tf_rounded,
766 tfs,
767 count,
768 );
769 }
770
771 #[inline]
773 pub fn block_first_doc(&self, block_idx: usize) -> Option<DocId> {
774 if block_idx >= self.l0_count {
775 return None;
776 }
777 let (first_doc, _, _, _) = self.read_l0_entry(block_idx);
778 Some(first_doc)
779 }
780
781 #[inline]
783 pub fn block_last_doc(&self, block_idx: usize) -> Option<DocId> {
784 if block_idx >= self.l0_count {
785 return None;
786 }
787 let (_, last_doc, _, _) = self.read_l0_entry(block_idx);
788 Some(last_doc)
789 }
790
791 pub fn seek_block(&self, target: DocId, from_block: usize) -> Option<usize> {
799 if from_block >= self.l0_count {
800 return None;
801 }
802
803 let from_l1 = from_block / L1_INTERVAL;
804
805 let l1_idx = if !self.l1_docs.is_empty() {
807 let idx = from_l1 + simd::find_first_ge_u32(&self.l1_docs[from_l1..], target);
808 if idx >= self.l1_docs.len() {
809 return None;
810 }
811 idx
812 } else {
813 return None;
814 };
815
816 let start = (l1_idx * L1_INTERVAL).max(from_block);
818 let end = ((l1_idx + 1) * L1_INTERVAL).min(self.l0_count);
819 let count = end - start;
820
821 let mut last_docs = [u32::MAX; L1_INTERVAL];
822 for (j, idx) in (start..end).enumerate() {
823 let (_, ld, _, _) = read_l0(&self.l0_bytes, idx);
824 last_docs[j] = ld;
825 }
826 let within = simd::find_first_ge_u32(&last_docs[..count], target);
827 let block_idx = start + within;
828
829 if block_idx < self.l0_count {
830 Some(block_idx)
831 } else {
832 None
833 }
834 }
835
836 pub fn iterator(&self) -> BlockPostingIterator<'_> {
838 BlockPostingIterator::new(self)
839 }
840
841 pub fn into_iterator(self) -> BlockPostingIterator<'static> {
843 BlockPostingIterator::owned(self)
844 }
845}
846
/// Block-at-a-time cursor over a `BlockPostingList`, decoding one block of
/// doc ids and term frequencies at a time.
pub struct BlockPostingIterator<'a> {
    // Borrowed or owned source list, so iterators can outlive the caller's
    // reference when constructed via `into_iterator`.
    block_list: std::borrow::Cow<'a, BlockPostingList>,
    current_block: usize,
    // Decoded doc ids of the current block.
    block_doc_ids: Vec<u32>,
    // Decoded term frequencies of the current block.
    block_tfs: Vec<u32>,
    // Cursor within the decoded block.
    position_in_block: usize,
    exhausted: bool,
}
861
862impl<'a> BlockPostingIterator<'a> {
863 fn new(block_list: &'a BlockPostingList) -> Self {
864 let exhausted = block_list.l0_count == 0;
865 let mut iter = Self {
866 block_list: std::borrow::Cow::Borrowed(block_list),
867 current_block: 0,
868 block_doc_ids: Vec::with_capacity(BLOCK_SIZE),
869 block_tfs: Vec::with_capacity(BLOCK_SIZE),
870 position_in_block: 0,
871 exhausted,
872 };
873 if !iter.exhausted {
874 iter.load_block(0);
875 }
876 iter
877 }
878
879 fn owned(block_list: BlockPostingList) -> BlockPostingIterator<'static> {
880 let exhausted = block_list.l0_count == 0;
881 let mut iter = BlockPostingIterator {
882 block_list: std::borrow::Cow::Owned(block_list),
883 current_block: 0,
884 block_doc_ids: Vec::with_capacity(BLOCK_SIZE),
885 block_tfs: Vec::with_capacity(BLOCK_SIZE),
886 position_in_block: 0,
887 exhausted,
888 };
889 if !iter.exhausted {
890 iter.load_block(0);
891 }
892 iter
893 }
894
895 fn load_block(&mut self, block_idx: usize) {
896 if block_idx >= self.block_list.l0_count {
897 self.exhausted = true;
898 return;
899 }
900
901 self.current_block = block_idx;
902 self.position_in_block = 0;
903
904 self.block_list
905 .decode_block_into(block_idx, &mut self.block_doc_ids, &mut self.block_tfs);
906 }
907
908 pub fn doc(&self) -> DocId {
909 if self.exhausted {
910 TERMINATED
911 } else if self.position_in_block < self.block_doc_ids.len() {
912 self.block_doc_ids[self.position_in_block]
913 } else {
914 TERMINATED
915 }
916 }
917
918 pub fn term_freq(&self) -> u32 {
919 if self.exhausted || self.position_in_block >= self.block_tfs.len() {
920 0
921 } else {
922 self.block_tfs[self.position_in_block]
923 }
924 }
925
926 pub fn advance(&mut self) -> DocId {
927 if self.exhausted {
928 return TERMINATED;
929 }
930
931 self.position_in_block += 1;
932 if self.position_in_block >= self.block_doc_ids.len() {
933 self.load_block(self.current_block + 1);
934 }
935 self.doc()
936 }
937
938 pub fn seek(&mut self, target: DocId) -> DocId {
939 if self.exhausted {
940 return TERMINATED;
941 }
942
943 let block_idx = match self.block_list.seek_block(target, self.current_block) {
945 Some(idx) => idx,
946 None => {
947 self.exhausted = true;
948 return TERMINATED;
949 }
950 };
951
952 if block_idx != self.current_block {
953 self.load_block(block_idx);
954 }
955
956 let remaining = &self.block_doc_ids[self.position_in_block..];
958 let pos = crate::structures::simd::find_first_ge_u32(remaining, target);
959 self.position_in_block += pos;
960
961 if self.position_in_block >= self.block_doc_ids.len() {
962 self.load_block(self.current_block + 1);
963 }
964 self.doc()
965 }
966
967 pub fn skip_to_next_block(&mut self) -> DocId {
971 if self.exhausted {
972 return TERMINATED;
973 }
974 self.load_block(self.current_block + 1);
975 self.doc()
976 }
977
978 #[inline]
980 pub fn current_block_idx(&self) -> usize {
981 self.current_block
982 }
983
984 #[inline]
986 pub fn num_blocks(&self) -> usize {
987 self.block_list.l0_count
988 }
989
990 #[inline]
992 pub fn current_block_max_tf(&self) -> u32 {
993 if self.exhausted || self.current_block >= self.block_list.l0_count {
994 0
995 } else {
996 let (_, _, _, max_weight) = self.block_list.read_l0_entry(self.current_block);
997 max_weight as u32
998 }
999 }
1000}
1001
1002#[cfg(test)]
1003mod tests {
1004 use super::*;
1005
1006 #[test]
1007 fn test_posting_list_basic() {
1008 let mut list = PostingList::new();
1009 list.push(1, 2);
1010 list.push(5, 1);
1011 list.push(10, 3);
1012
1013 assert_eq!(list.len(), 3);
1014
1015 let mut iter = PostingListIterator::new(&list);
1016 assert_eq!(iter.doc(), 1);
1017 assert_eq!(iter.term_freq(), 2);
1018
1019 assert_eq!(iter.advance(), 5);
1020 assert_eq!(iter.term_freq(), 1);
1021
1022 assert_eq!(iter.advance(), 10);
1023 assert_eq!(iter.term_freq(), 3);
1024
1025 assert_eq!(iter.advance(), TERMINATED);
1026 }
1027
1028 #[test]
1029 fn test_posting_list_serialization() {
1030 let mut list = PostingList::new();
1031 for i in 0..100 {
1032 list.push(i * 3, (i % 5) + 1);
1033 }
1034
1035 let mut buffer = Vec::new();
1036 list.serialize(&mut buffer).unwrap();
1037
1038 let deserialized = PostingList::deserialize(&mut &buffer[..]).unwrap();
1039 assert_eq!(deserialized.len(), list.len());
1040
1041 for (a, b) in list.iter().zip(deserialized.iter()) {
1042 assert_eq!(a, b);
1043 }
1044 }
1045
1046 #[test]
1047 fn test_posting_list_seek() {
1048 let mut list = PostingList::new();
1049 for i in 0..100 {
1050 list.push(i * 2, 1);
1051 }
1052
1053 let mut iter = PostingListIterator::new(&list);
1054
1055 assert_eq!(iter.seek(50), 50);
1056 assert_eq!(iter.seek(51), 52);
1057 assert_eq!(iter.seek(200), TERMINATED);
1058 }
1059
1060 #[test]
1061 fn test_block_posting_list() {
1062 let mut list = PostingList::new();
1063 for i in 0..500 {
1064 list.push(i * 2, (i % 10) + 1);
1065 }
1066
1067 let block_list = BlockPostingList::from_posting_list(&list).unwrap();
1068 assert_eq!(block_list.doc_count(), 500);
1069
1070 let mut iter = block_list.iterator();
1071 assert_eq!(iter.doc(), 0);
1072 assert_eq!(iter.term_freq(), 1);
1073
1074 assert_eq!(iter.seek(500), 500);
1076 assert_eq!(iter.seek(998), 998);
1077 assert_eq!(iter.seek(1000), TERMINATED);
1078 }
1079
1080 #[test]
1081 fn test_block_posting_list_serialization() {
1082 let mut list = PostingList::new();
1083 for i in 0..300 {
1084 list.push(i * 3, i + 1);
1085 }
1086
1087 let block_list = BlockPostingList::from_posting_list(&list).unwrap();
1088
1089 let mut buffer = Vec::new();
1090 block_list.serialize(&mut buffer).unwrap();
1091
1092 let deserialized = BlockPostingList::deserialize(&buffer[..]).unwrap();
1093 assert_eq!(deserialized.doc_count(), block_list.doc_count());
1094
1095 let mut iter1 = block_list.iterator();
1097 let mut iter2 = deserialized.iterator();
1098
1099 while iter1.doc() != TERMINATED {
1100 assert_eq!(iter1.doc(), iter2.doc());
1101 assert_eq!(iter1.term_freq(), iter2.term_freq());
1102 iter1.advance();
1103 iter2.advance();
1104 }
1105 assert_eq!(iter2.doc(), TERMINATED);
1106 }
1107
1108 fn collect_postings(bpl: &BlockPostingList) -> Vec<(u32, u32)> {
1110 let mut result = Vec::new();
1111 let mut it = bpl.iterator();
1112 while it.doc() != TERMINATED {
1113 result.push((it.doc(), it.term_freq()));
1114 it.advance();
1115 }
1116 result
1117 }
1118
1119 fn build_bpl(postings: &[(u32, u32)]) -> BlockPostingList {
1121 let mut pl = PostingList::new();
1122 for &(doc_id, tf) in postings {
1123 pl.push(doc_id, tf);
1124 }
1125 BlockPostingList::from_posting_list(&pl).unwrap()
1126 }
1127
1128 fn serialize_bpl(bpl: &BlockPostingList) -> Vec<u8> {
1130 let mut buf = Vec::new();
1131 bpl.serialize(&mut buf).unwrap();
1132 buf
1133 }
1134
1135 #[test]
1136 fn test_concatenate_blocks_two_segments() {
1137 let a: Vec<(u32, u32)> = (0..100).map(|i| (i * 2, i + 1)).collect();
1139 let bpl_a = build_bpl(&a);
1140
1141 let b: Vec<(u32, u32)> = (0..100).map(|i| (i * 3, i + 2)).collect();
1143 let bpl_b = build_bpl(&b);
1144
1145 let merged =
1147 BlockPostingList::concatenate_blocks(&[(bpl_a.clone(), 0), (bpl_b.clone(), 200)])
1148 .unwrap();
1149
1150 assert_eq!(merged.doc_count(), 200);
1151
1152 let postings = collect_postings(&merged);
1153 assert_eq!(postings.len(), 200);
1154
1155 for (i, p) in postings.iter().enumerate().take(100) {
1157 assert_eq!(*p, (i as u32 * 2, i as u32 + 1));
1158 }
1159 for i in 0..100 {
1161 assert_eq!(postings[100 + i], (i as u32 * 3 + 200, i as u32 + 2));
1162 }
1163 }
1164
1165 #[test]
1166 fn test_concatenate_streaming_matches_blocks() {
1167 let seg_a: Vec<(u32, u32)> = (0..250).map(|i| (i * 2, (i % 7) + 1)).collect();
1169 let seg_b: Vec<(u32, u32)> = (0..180).map(|i| (i * 5, (i % 3) + 1)).collect();
1170 let seg_c: Vec<(u32, u32)> = (0..90).map(|i| (i * 10, (i % 11) + 1)).collect();
1171
1172 let bpl_a = build_bpl(&seg_a);
1173 let bpl_b = build_bpl(&seg_b);
1174 let bpl_c = build_bpl(&seg_c);
1175
1176 let offset_b = 1000u32;
1177 let offset_c = 2000u32;
1178
1179 let ref_merged = BlockPostingList::concatenate_blocks(&[
1181 (bpl_a.clone(), 0),
1182 (bpl_b.clone(), offset_b),
1183 (bpl_c.clone(), offset_c),
1184 ])
1185 .unwrap();
1186 let mut ref_buf = Vec::new();
1187 ref_merged.serialize(&mut ref_buf).unwrap();
1188
1189 let bytes_a = serialize_bpl(&bpl_a);
1191 let bytes_b = serialize_bpl(&bpl_b);
1192 let bytes_c = serialize_bpl(&bpl_c);
1193
1194 let sources: Vec<(&[u8], u32)> =
1195 vec![(&bytes_a, 0), (&bytes_b, offset_b), (&bytes_c, offset_c)];
1196 let mut stream_buf = Vec::new();
1197 let (doc_count, bytes_written) =
1198 BlockPostingList::concatenate_streaming(&sources, &mut stream_buf).unwrap();
1199
1200 assert_eq!(doc_count, 520); assert_eq!(bytes_written, stream_buf.len());
1202
1203 let ref_postings = collect_postings(&BlockPostingList::deserialize(&ref_buf).unwrap());
1205 let stream_postings =
1206 collect_postings(&BlockPostingList::deserialize(&stream_buf).unwrap());
1207
1208 assert_eq!(ref_postings.len(), stream_postings.len());
1209 for (i, (r, s)) in ref_postings.iter().zip(stream_postings.iter()).enumerate() {
1210 assert_eq!(r, s, "mismatch at posting {}", i);
1211 }
1212 }
1213
1214 #[test]
1215 fn test_multi_round_merge() {
1216 let segments: Vec<Vec<(u32, u32)>> = (0..4)
1223 .map(|seg| (0..200).map(|i| (i * 3, (i + seg * 7) % 10 + 1)).collect())
1224 .collect();
1225
1226 let bpls: Vec<BlockPostingList> = segments.iter().map(|s| build_bpl(s)).collect();
1227 let serialized: Vec<Vec<u8>> = bpls.iter().map(serialize_bpl).collect();
1228
1229 let mut merged_01 = Vec::new();
1231 let sources_01: Vec<(&[u8], u32)> = vec![(&serialized[0], 0), (&serialized[1], 600)];
1232 let (dc_01, _) =
1233 BlockPostingList::concatenate_streaming(&sources_01, &mut merged_01).unwrap();
1234 assert_eq!(dc_01, 400);
1235
1236 let mut merged_23 = Vec::new();
1237 let sources_23: Vec<(&[u8], u32)> = vec![(&serialized[2], 0), (&serialized[3], 600)];
1238 let (dc_23, _) =
1239 BlockPostingList::concatenate_streaming(&sources_23, &mut merged_23).unwrap();
1240 assert_eq!(dc_23, 400);
1241
1242 let mut final_merged = Vec::new();
1244 let sources_final: Vec<(&[u8], u32)> = vec![(&merged_01, 0), (&merged_23, 1200)];
1245 let (dc_final, _) =
1246 BlockPostingList::concatenate_streaming(&sources_final, &mut final_merged).unwrap();
1247 assert_eq!(dc_final, 800);
1248
1249 let final_bpl = BlockPostingList::deserialize(&final_merged).unwrap();
1251 let postings = collect_postings(&final_bpl);
1252 assert_eq!(postings.len(), 800);
1253
1254 assert_eq!(postings[0].0, 0); assert_eq!(postings[199].0, 597); assert_eq!(postings[200].0, 600); assert_eq!(postings[399].0, 1197); assert_eq!(postings[400].0, 1200); assert_eq!(postings[799].0, 2397); for seg in 0u32..4 {
1267 for i in 0u32..200 {
1268 let idx = (seg * 200 + i) as usize;
1269 assert_eq!(
1270 postings[idx].1,
1271 (i + seg * 7) % 10 + 1,
1272 "seg{} tf[{}]",
1273 seg,
1274 i
1275 );
1276 }
1277 }
1278
1279 let mut it = final_bpl.iterator();
1281 assert_eq!(it.seek(600), 600);
1282 assert_eq!(it.seek(1200), 1200);
1283 assert_eq!(it.seek(2397), 2397);
1284 assert_eq!(it.seek(2398), TERMINATED);
1285 }
1286
1287 #[test]
1288 fn test_large_scale_merge() {
1289 let num_segments = 5;
1292 let docs_per_segment = 2000;
1293 let docs_gap = 3; let segments: Vec<Vec<(u32, u32)>> = (0..num_segments)
1296 .map(|seg| {
1297 (0..docs_per_segment)
1298 .map(|i| (i as u32 * docs_gap, (i as u32 + seg as u32) % 20 + 1))
1299 .collect()
1300 })
1301 .collect();
1302
1303 let bpls: Vec<BlockPostingList> = segments.iter().map(|s| build_bpl(s)).collect();
1304
1305 for bpl in &bpls {
1307 assert!(
1308 bpl.num_blocks() >= 15,
1309 "expected >=15 blocks, got {}",
1310 bpl.num_blocks()
1311 );
1312 }
1313
1314 let serialized: Vec<Vec<u8>> = bpls.iter().map(serialize_bpl).collect();
1315
1316 let max_doc_per_seg = (docs_per_segment as u32 - 1) * docs_gap;
1318 let offsets: Vec<u32> = (0..num_segments)
1319 .map(|i| i as u32 * (max_doc_per_seg + 1))
1320 .collect();
1321
1322 let sources: Vec<(&[u8], u32)> = serialized
1323 .iter()
1324 .zip(offsets.iter())
1325 .map(|(b, o)| (b.as_slice(), *o))
1326 .collect();
1327
1328 let mut merged = Vec::new();
1329 let (doc_count, _) =
1330 BlockPostingList::concatenate_streaming(&sources, &mut merged).unwrap();
1331 assert_eq!(doc_count, (num_segments * docs_per_segment) as u32);
1332
1333 let merged_bpl = BlockPostingList::deserialize(&merged).unwrap();
1335 let postings = collect_postings(&merged_bpl);
1336 assert_eq!(postings.len(), num_segments * docs_per_segment);
1337
1338 for i in 1..postings.len() {
1340 assert!(
1341 postings[i].0 > postings[i - 1].0 || (i % docs_per_segment == 0), "doc_id not increasing at {}: {} vs {}",
1343 i,
1344 postings[i - 1].0,
1345 postings[i].0,
1346 );
1347 }
1348
1349 let mut it = merged_bpl.iterator();
1351 for (seg, &expected_first) in offsets.iter().enumerate() {
1352 assert_eq!(
1353 it.seek(expected_first),
1354 expected_first,
1355 "seek to segment {} start",
1356 seg
1357 );
1358 }
1359 }
1360
1361 #[test]
1362 fn test_merge_edge_cases() {
1363 let bpl_a = build_bpl(&[(0, 5)]);
1365 let bpl_b = build_bpl(&[(0, 3)]);
1366
1367 let merged =
1368 BlockPostingList::concatenate_blocks(&[(bpl_a.clone(), 0), (bpl_b.clone(), 1)])
1369 .unwrap();
1370 assert_eq!(merged.doc_count(), 2);
1371 let p = collect_postings(&merged);
1372 assert_eq!(p, vec![(0, 5), (1, 3)]);
1373
1374 let exact_block: Vec<(u32, u32)> = (0..BLOCK_SIZE as u32).map(|i| (i, i % 5 + 1)).collect();
1376 let bpl_exact = build_bpl(&exact_block);
1377 assert_eq!(bpl_exact.num_blocks(), 1);
1378
1379 let bytes = serialize_bpl(&bpl_exact);
1380 let mut out = Vec::new();
1381 let sources: Vec<(&[u8], u32)> = vec![(&bytes, 0), (&bytes, BLOCK_SIZE as u32)];
1382 let (dc, _) = BlockPostingList::concatenate_streaming(&sources, &mut out).unwrap();
1383 assert_eq!(dc, BLOCK_SIZE as u32 * 2);
1384
1385 let merged = BlockPostingList::deserialize(&out).unwrap();
1386 let postings = collect_postings(&merged);
1387 assert_eq!(postings.len(), BLOCK_SIZE * 2);
1388 assert_eq!(postings[BLOCK_SIZE].0, BLOCK_SIZE as u32);
1390
1391 let over_block: Vec<(u32, u32)> = (0..BLOCK_SIZE as u32 + 1).map(|i| (i * 2, 1)).collect();
1393 let bpl_over = build_bpl(&over_block);
1394 assert_eq!(bpl_over.num_blocks(), 2);
1395 }
1396
1397 #[test]
1398 fn test_streaming_roundtrip_single_source() {
1399 let docs: Vec<(u32, u32)> = (0..500).map(|i| (i * 7, i % 15 + 1)).collect();
1401 let bpl = build_bpl(&docs);
1402 let direct = serialize_bpl(&bpl);
1403
1404 let sources: Vec<(&[u8], u32)> = vec![(&direct, 0)];
1405 let mut streamed = Vec::new();
1406 BlockPostingList::concatenate_streaming(&sources, &mut streamed).unwrap();
1407
1408 let p1 = collect_postings(&BlockPostingList::deserialize(&direct).unwrap());
1410 let p2 = collect_postings(&BlockPostingList::deserialize(&streamed).unwrap());
1411 assert_eq!(p1, p2);
1412 }
1413
1414 #[test]
1415 fn test_max_tf_preserved_through_merge() {
1416 let mut a = Vec::new();
1418 for i in 0..200 {
1419 a.push((i * 2, if i == 100 { 50 } else { 1 }));
1420 }
1421 let bpl_a = build_bpl(&a);
1422 assert_eq!(bpl_a.max_tf(), 50);
1423
1424 let mut b = Vec::new();
1426 for i in 0..200 {
1427 b.push((i * 2, if i == 50 { 30 } else { 2 }));
1428 }
1429 let bpl_b = build_bpl(&b);
1430 assert_eq!(bpl_b.max_tf(), 30);
1431
1432 let bytes_a = serialize_bpl(&bpl_a);
1434 let bytes_b = serialize_bpl(&bpl_b);
1435 let sources: Vec<(&[u8], u32)> = vec![(&bytes_a, 0), (&bytes_b, 1000)];
1436 let mut out = Vec::new();
1437 BlockPostingList::concatenate_streaming(&sources, &mut out).unwrap();
1438
1439 let merged = BlockPostingList::deserialize(&out).unwrap();
1440 assert_eq!(merged.max_tf(), 50);
1441 assert_eq!(merged.doc_count(), 400);
1442 }
1443
1444 #[test]
1447 fn test_l0_l1_counts() {
1448 let bpl = build_bpl(&(0..50u32).map(|i| (i, 1)).collect::<Vec<_>>());
1450 assert_eq!(bpl.num_blocks(), 1);
1451 assert_eq!(bpl.l1_docs.len(), 1);
1452
1453 let n = BLOCK_SIZE * L1_INTERVAL;
1455 let bpl = build_bpl(&(0..n as u32).map(|i| (i * 2, 1)).collect::<Vec<_>>());
1456 assert_eq!(bpl.num_blocks(), L1_INTERVAL);
1457 assert_eq!(bpl.l1_docs.len(), 1);
1458
1459 let n = BLOCK_SIZE * L1_INTERVAL + 1;
1461 let bpl = build_bpl(&(0..n as u32).map(|i| (i * 2, 1)).collect::<Vec<_>>());
1462 assert_eq!(bpl.num_blocks(), L1_INTERVAL + 1);
1463 assert_eq!(bpl.l1_docs.len(), 2);
1464
1465 let n = BLOCK_SIZE * L1_INTERVAL * 3;
1467 let bpl = build_bpl(&(0..n as u32).map(|i| (i, 1)).collect::<Vec<_>>());
1468 assert_eq!(bpl.num_blocks(), L1_INTERVAL * 3);
1469 assert_eq!(bpl.l1_docs.len(), 3);
1470 }
1471
1472 #[test]
1473 fn test_l1_last_doc_values() {
1474 let n = BLOCK_SIZE * 20;
1476 let docs: Vec<(u32, u32)> = (0..n as u32).map(|i| (i * 3, 1)).collect();
1477 let bpl = build_bpl(&docs);
1478 assert_eq!(bpl.num_blocks(), 20);
1479 assert_eq!(bpl.l1_docs.len(), 3); let expected_l1_0 = bpl.block_last_doc(7).unwrap();
1483 assert_eq!(bpl.l1_docs[0], expected_l1_0);
1484
1485 let expected_l1_1 = bpl.block_last_doc(15).unwrap();
1487 assert_eq!(bpl.l1_docs[1], expected_l1_1);
1488
1489 let expected_l1_2 = bpl.block_last_doc(19).unwrap();
1491 assert_eq!(bpl.l1_docs[2], expected_l1_2);
1492 }
1493
1494 #[test]
1495 fn test_seek_block_basic() {
1496 let n = BLOCK_SIZE * 20;
1498 let docs: Vec<(u32, u32)> = (0..n as u32).map(|i| (i * 10, 1)).collect();
1499 let bpl = build_bpl(&docs);
1500
1501 assert_eq!(bpl.seek_block(0, 0), Some(0));
1503
1504 for blk in 0..20 {
1506 let first = bpl.block_first_doc(blk).unwrap();
1507 assert_eq!(
1508 bpl.seek_block(first, 0),
1509 Some(blk),
1510 "seek to block {} first_doc",
1511 blk
1512 );
1513 }
1514
1515 for blk in 0..20 {
1517 let last = bpl.block_last_doc(blk).unwrap();
1518 assert_eq!(
1519 bpl.seek_block(last, 0),
1520 Some(blk),
1521 "seek to block {} last_doc",
1522 blk
1523 );
1524 }
1525
1526 let max_doc = bpl.block_last_doc(19).unwrap();
1528 assert_eq!(bpl.seek_block(max_doc + 1, 0), None);
1529
1530 let mid_doc = bpl.block_first_doc(10).unwrap();
1532 assert_eq!(bpl.seek_block(mid_doc, 10), Some(10));
1533 assert_eq!(
1534 bpl.seek_block(mid_doc, 11),
1535 Some(11).or(bpl.seek_block(mid_doc, 11))
1536 );
1537 }
1538
1539 #[test]
1540 fn test_seek_block_across_l1_boundaries() {
1541 let n = BLOCK_SIZE * 24;
1543 let docs: Vec<(u32, u32)> = (0..n as u32).map(|i| (i * 5, 1)).collect();
1544 let bpl = build_bpl(&docs);
1545 assert_eq!(bpl.l1_docs.len(), 3);
1546
1547 for group in 0..3 {
1549 let blk = group * L1_INTERVAL;
1550 let target = bpl.block_first_doc(blk).unwrap();
1551 assert_eq!(
1552 bpl.seek_block(target, 0),
1553 Some(blk),
1554 "seek to group {} block {}",
1555 group,
1556 blk
1557 );
1558 }
1559
1560 let target = bpl.block_first_doc(20).unwrap() + 1;
1562 assert_eq!(bpl.seek_block(target, 0), Some(20));
1563 }
1564
1565 #[test]
1566 fn test_block_data_size_helper() {
1567 let docs: Vec<(u32, u32)> = (0..500u32).map(|i| (i * 7, (i % 20) + 1)).collect();
1569 let bpl = build_bpl(&docs);
1570
1571 for blk in 0..bpl.num_blocks() {
1572 let (_, _, offset, _) = bpl.read_l0_entry(blk);
1573 let computed_size = block_data_size(&bpl.stream, offset as usize);
1574
1575 if blk + 1 < bpl.num_blocks() {
1578 let (_, _, next_offset, _) = bpl.read_l0_entry(blk + 1);
1579 assert_eq!(
1580 computed_size,
1581 (next_offset - offset) as usize,
1582 "block_data_size mismatch at block {}",
1583 blk
1584 );
1585 } else {
1586 assert_eq!(
1588 offset as usize + computed_size,
1589 bpl.stream.len(),
1590 "last block size mismatch"
1591 );
1592 }
1593 }
1594 }
1595
1596 #[test]
1597 fn test_l0_entry_roundtrip() {
1598 let docs: Vec<(u32, u32)> = (0..1000u32).map(|i| (i * 3, (i % 10) + 1)).collect();
1600 let bpl = build_bpl(&docs);
1601
1602 let bytes = serialize_bpl(&bpl);
1603 let bpl2 = BlockPostingList::deserialize(&bytes).unwrap();
1604
1605 assert_eq!(bpl.num_blocks(), bpl2.num_blocks());
1606 for blk in 0..bpl.num_blocks() {
1607 assert_eq!(
1608 bpl.read_l0_entry(blk),
1609 bpl2.read_l0_entry(blk),
1610 "L0 entry mismatch at block {}",
1611 blk
1612 );
1613 }
1614
1615 assert_eq!(bpl.l1_docs, bpl2.l1_docs);
1617 }
1618
1619 #[test]
1620 fn test_zero_copy_deserialize_matches() {
1621 let docs: Vec<(u32, u32)> = (0..2000u32).map(|i| (i * 2, (i % 5) + 1)).collect();
1622 let bpl = build_bpl(&docs);
1623 let bytes = serialize_bpl(&bpl);
1624
1625 let copied = BlockPostingList::deserialize(&bytes).unwrap();
1626 let zero_copy =
1627 BlockPostingList::deserialize_zero_copy(OwnedBytes::new(bytes.clone())).unwrap();
1628
1629 assert_eq!(copied.l0_count, zero_copy.l0_count);
1631 assert_eq!(copied.l1_docs, zero_copy.l1_docs);
1632 assert_eq!(copied.doc_count, zero_copy.doc_count);
1633 assert_eq!(copied.max_tf, zero_copy.max_tf);
1634
1635 let p1 = collect_postings(&copied);
1637 let p2 = collect_postings(&zero_copy);
1638 assert_eq!(p1, p2);
1639 }
1640
1641 #[test]
1642 fn test_l1_preserved_through_streaming_merge() {
1643 let seg_a = build_bpl(&(0..1000u32).map(|i| (i * 2, 1)).collect::<Vec<_>>());
1645 let seg_b = build_bpl(&(0..800u32).map(|i| (i * 3, 2)).collect::<Vec<_>>());
1646 let seg_c = build_bpl(&(0..500u32).map(|i| (i * 5, 3)).collect::<Vec<_>>());
1647
1648 let bytes_a = serialize_bpl(&seg_a);
1649 let bytes_b = serialize_bpl(&seg_b);
1650 let bytes_c = serialize_bpl(&seg_c);
1651
1652 let sources: Vec<(&[u8], u32)> = vec![(&bytes_a, 0), (&bytes_b, 10000), (&bytes_c, 20000)];
1653 let mut out = Vec::new();
1654 BlockPostingList::concatenate_streaming(&sources, &mut out).unwrap();
1655
1656 let merged = BlockPostingList::deserialize(&out).unwrap();
1657 let expected_l1_count = merged.num_blocks().div_ceil(L1_INTERVAL);
1658 assert_eq!(merged.l1_docs.len(), expected_l1_count);
1659
1660 for (i, &l1_doc) in merged.l1_docs.iter().enumerate() {
1662 let last_block_in_group = ((i + 1) * L1_INTERVAL - 1).min(merged.num_blocks() - 1);
1663 let expected = merged.block_last_doc(last_block_in_group).unwrap();
1664 assert_eq!(l1_doc, expected, "L1[{}] mismatch", i);
1665 }
1666
1667 for blk in 0..merged.num_blocks() {
1669 let first = merged.block_first_doc(blk).unwrap();
1670 assert_eq!(merged.seek_block(first, 0), Some(blk));
1671 }
1672 }
1673
1674 #[test]
1675 fn test_seek_block_single_block() {
1676 let bpl = build_bpl(&[(0, 1), (10, 2), (20, 3)]);
1678 assert_eq!(bpl.num_blocks(), 1);
1679 assert_eq!(bpl.l1_docs.len(), 1);
1680
1681 assert_eq!(bpl.seek_block(0, 0), Some(0));
1682 assert_eq!(bpl.seek_block(10, 0), Some(0));
1683 assert_eq!(bpl.seek_block(20, 0), Some(0));
1684 assert_eq!(bpl.seek_block(21, 0), None);
1685 }
1686
1687 #[test]
1688 fn test_footer_size() {
1689 let docs: Vec<(u32, u32)> = (0..500u32).map(|i| (i * 2, 1)).collect();
1691 let bpl = build_bpl(&docs);
1692 let bytes = serialize_bpl(&bpl);
1693
1694 let expected =
1695 bpl.stream.len() + bpl.l0_count * L0_SIZE + bpl.l1_docs.len() * L1_SIZE + FOOTER_SIZE;
1696 assert_eq!(bytes.len(), expected);
1697 }
1698
1699 #[test]
1700 fn test_seek_block_from_block_skips_earlier() {
1701 let n = BLOCK_SIZE * 16;
1703 let docs: Vec<(u32, u32)> = (0..n as u32).map(|i| (i * 3, 1)).collect();
1704 let bpl = build_bpl(&docs);
1705
1706 let target_in_5 = bpl.block_first_doc(5).unwrap() + 1;
1708 let result = bpl.seek_block(target_in_5, 8);
1711 assert!(result.is_some());
1712 assert!(result.unwrap() >= 8);
1713 }
1714}