use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use std::io::{self, Cursor, Read, Write};

use super::config::WeightQuantization;
use crate::DocId;
use crate::structures::postings::TERMINATED;
use crate::structures::simd;

pub const BLOCK_SIZE: usize = 128;

#[derive(Debug, Clone, Copy)]
pub struct BlockHeader {
    pub count: u16,
    pub doc_id_bits: u8,
    pub ordinal_bits: u8,
    pub weight_quant: WeightQuantization,
    pub first_doc_id: DocId,
    pub max_weight: f32,
}

impl BlockHeader {
    pub const SIZE: usize = 16;

    pub fn write<W: Write>(&self, w: &mut W) -> io::Result<()> {
        w.write_u16::<LittleEndian>(self.count)?;
        w.write_u8(self.doc_id_bits)?;
        w.write_u8(self.ordinal_bits)?;
        w.write_u8(self.weight_quant as u8)?;
        w.write_u8(0)?;
        w.write_u16::<LittleEndian>(0)?;
        w.write_u32::<LittleEndian>(self.first_doc_id)?;
        w.write_f32::<LittleEndian>(self.max_weight)?;
        Ok(())
    }

    pub fn read<R: Read>(r: &mut R) -> io::Result<Self> {
        let count = r.read_u16::<LittleEndian>()?;
        let doc_id_bits = r.read_u8()?;
        let ordinal_bits = r.read_u8()?;
        let weight_quant_byte = r.read_u8()?;
        let _ = r.read_u8()?;
        let _ = r.read_u16::<LittleEndian>()?;
        let first_doc_id = r.read_u32::<LittleEndian>()?;
        let max_weight = r.read_f32::<LittleEndian>()?;

        let weight_quant = WeightQuantization::from_u8(weight_quant_byte)
            .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Invalid weight quant"))?;

        Ok(Self {
            count,
            doc_id_bits,
            ordinal_bits,
            weight_quant,
            first_doc_id,
            max_weight,
        })
    }
}

#[derive(Debug, Clone)]
pub struct SparseBlock {
    pub header: BlockHeader,
    pub doc_ids_data: Vec<u8>,
    pub ordinals_data: Vec<u8>,
    pub weights_data: Vec<u8>,
}

impl SparseBlock {
    pub fn from_postings(
        postings: &[(DocId, u16, f32)],
        weight_quant: WeightQuantization,
    ) -> io::Result<Self> {
        assert!(!postings.is_empty() && postings.len() <= BLOCK_SIZE);

        let count = postings.len();
        let first_doc_id = postings[0].0;

        let mut deltas = Vec::with_capacity(count);
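        // Doc ids are delta-encoded against the previous posting; the first
        // doc id is stored verbatim in the header, so its delta is forced to
        // zero below.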
        let mut prev = first_doc_id;
        for &(doc_id, _, _) in postings {
            deltas.push(doc_id.saturating_sub(prev));
            prev = doc_id;
        }
        deltas[0] = 0;

        let doc_id_bits = find_optimal_bit_width(&deltas[1..]);
        let ordinals: Vec<u16> = postings.iter().map(|(_, o, _)| *o).collect();
        let max_ordinal = ordinals.iter().copied().max().unwrap_or(0);
        let ordinal_bits = if max_ordinal == 0 {
            0
        } else {
            bits_needed_u16(max_ordinal)
        };

        let weights: Vec<f32> = postings.iter().map(|(_, _, w)| *w).collect();
        let max_weight = weights.iter().copied().fold(0.0f32, f32::max);

        let doc_ids_data = pack_bit_array(&deltas[1..], doc_id_bits);
        let ordinals_data = if ordinal_bits > 0 {
            pack_bit_array_u16(&ordinals, ordinal_bits)
        } else {
            Vec::new()
        };
        let weights_data = encode_weights(&weights, weight_quant)?;

        Ok(Self {
            header: BlockHeader {
                count: count as u16,
                doc_id_bits,
                ordinal_bits,
                weight_quant,
                first_doc_id,
                max_weight,
            },
            doc_ids_data,
            ordinals_data,
            weights_data,
        })
    }

    pub fn decode_doc_ids(&self) -> Vec<DocId> {
        let mut out = Vec::with_capacity(self.header.count as usize);
        self.decode_doc_ids_into(&mut out);
        out
    }

    pub fn decode_doc_ids_into(&self, out: &mut Vec<DocId>) {
        let count = self.header.count as usize;
        out.clear();
        out.push(self.header.first_doc_id);

        if count > 1 {
            let deltas = unpack_bit_array(&self.doc_ids_data, self.header.doc_id_bits, count - 1);
            let mut prev = self.header.first_doc_id;
            for delta in deltas {
                prev += delta;
                out.push(prev);
            }
        }
    }

    pub fn decode_ordinals(&self) -> Vec<u16> {
        let mut out = Vec::with_capacity(self.header.count as usize);
        self.decode_ordinals_into(&mut out);
        out
    }

    pub fn decode_ordinals_into(&self, out: &mut Vec<u16>) {
        let count = self.header.count as usize;
        out.clear();
        if self.header.ordinal_bits == 0 {
            out.resize(count, 0u16);
        } else {
            unpack_bit_array_u16_into(&self.ordinals_data, self.header.ordinal_bits, count, out);
        }
    }

    pub fn decode_weights(&self) -> Vec<f32> {
        let mut out = Vec::with_capacity(self.header.count as usize);
        self.decode_weights_into(&mut out);
        out
    }

    pub fn decode_weights_into(&self, out: &mut Vec<f32>) {
        out.clear();
        decode_weights_into(
            &self.weights_data,
            self.header.weight_quant,
            self.header.count as usize,
            out,
        );
    }

    pub fn write<W: Write>(&self, w: &mut W) -> io::Result<()> {
        self.header.write(w)?;
        w.write_u16::<LittleEndian>(self.doc_ids_data.len() as u16)?;
        w.write_u16::<LittleEndian>(self.ordinals_data.len() as u16)?;
        w.write_u16::<LittleEndian>(self.weights_data.len() as u16)?;
        w.write_u16::<LittleEndian>(0)?;
        w.write_all(&self.doc_ids_data)?;
        w.write_all(&self.ordinals_data)?;
        w.write_all(&self.weights_data)?;
        Ok(())
    }

    pub fn read<R: Read>(r: &mut R) -> io::Result<Self> {
        let header = BlockHeader::read(r)?;
        let doc_ids_len = r.read_u16::<LittleEndian>()? as usize;
        let ordinals_len = r.read_u16::<LittleEndian>()? as usize;
        let weights_len = r.read_u16::<LittleEndian>()? as usize;
        let _ = r.read_u16::<LittleEndian>()?;

        let mut doc_ids_data = vec![0u8; doc_ids_len];
        r.read_exact(&mut doc_ids_data)?;
        let mut ordinals_data = vec![0u8; ordinals_len];
        r.read_exact(&mut ordinals_data)?;
        let mut weights_data = vec![0u8; weights_len];
        r.read_exact(&mut weights_data)?;

        Ok(Self {
            header,
            doc_ids_data,
            ordinals_data,
            weights_data,
        })
    }

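    /// Parses a block from a contiguous byte buffer (header, section
    /// lengths, then the three packed sections), validating the section
    /// lengths against the buffer before slicing.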
    pub fn from_owned_bytes(data: crate::directories::OwnedBytes) -> crate::Result<Self> {
        let b = data.as_slice();
        if b.len() < BlockHeader::SIZE + 8 {
            return Err(crate::Error::Corruption(
                "sparse block too small".to_string(),
            ));
        }
        let mut cursor = Cursor::new(&b[..BlockHeader::SIZE]);
        let header =
            BlockHeader::read(&mut cursor).map_err(|e| crate::Error::Corruption(e.to_string()))?;

        let p = BlockHeader::SIZE;
        let doc_ids_len = u16::from_le_bytes([b[p], b[p + 1]]) as usize;
        let ordinals_len = u16::from_le_bytes([b[p + 2], b[p + 3]]) as usize;
        let weights_len = u16::from_le_bytes([b[p + 4], b[p + 5]]) as usize;
        let data_start = p + 8;
        let ord_start = data_start + doc_ids_len;
        let wt_start = ord_start + ordinals_len;
        // Guard against corrupt section lengths before slicing.
        if wt_start + weights_len > b.len() {
            return Err(crate::Error::Corruption(
                "sparse block section lengths exceed buffer".to_string(),
            ));
        }

        Ok(Self {
            header,
            doc_ids_data: b[data_start..ord_start].to_vec(),
            ordinals_data: b[ord_start..wt_start].to_vec(),
            weights_data: b[wt_start..wt_start + weights_len].to_vec(),
        })
    }

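    /// Returns a copy of this block with `doc_offset` added to the header's
    /// first doc id. The packed delta payload is unchanged, so every doc id
    /// in the block shifts by the same offset.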
    pub fn with_doc_offset(&self, doc_offset: u32) -> Self {
        Self {
            header: BlockHeader {
                first_doc_id: self.header.first_doc_id + doc_offset,
                ..self.header
            },
            doc_ids_data: self.doc_ids_data.clone(),
            ordinals_data: self.ordinals_data.clone(),
            weights_data: self.weights_data.clone(),
        }
    }
}

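/// A posting list stored as a sequence of independently decodable
/// compressed blocks. `doc_count` is the number of unique documents; with
/// multi-value fields a document may span several postings.
///
/// A minimal usage sketch (paths assume this module's types are in scope):
///
/// ```ignore
/// let postings = vec![(3u32, 0u16, 0.5f32), (17, 0, 1.25)];
/// let list = BlockSparsePostingList::from_postings(&postings, WeightQuantization::Float32)?;
/// let mut it = list.iterator();
/// while it.doc() != TERMINATED {
///     // score with it.doc(), it.weight(), it.ordinal()
///     it.advance();
/// }
/// ```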
#[derive(Debug, Clone)]
pub struct BlockSparsePostingList {
    pub doc_count: u32,
    pub blocks: Vec<SparseBlock>,
}

impl BlockSparsePostingList {
    pub fn from_postings_with_block_size(
        postings: &[(DocId, u16, f32)],
        weight_quant: WeightQuantization,
        block_size: usize,
    ) -> io::Result<Self> {
        if postings.is_empty() {
            return Ok(Self {
                doc_count: 0,
                blocks: Vec::new(),
            });
        }

        let block_size = block_size.max(16);
        let mut blocks = Vec::new();
        for chunk in postings.chunks(block_size) {
            blocks.push(SparseBlock::from_postings(chunk, weight_quant)?);
        }

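        // Postings are sorted by doc id; a document with multiple ordinals
        // appears in consecutive entries, so unique documents are counted by
        // comparing neighbors.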
        let mut unique_docs = 1u32;
        for i in 1..postings.len() {
            if postings[i].0 != postings[i - 1].0 {
                unique_docs += 1;
            }
        }

        Ok(Self {
            doc_count: unique_docs,
            blocks,
        })
    }

    pub fn from_postings(
        postings: &[(DocId, u16, f32)],
        weight_quant: WeightQuantization,
    ) -> io::Result<Self> {
        Self::from_postings_with_block_size(postings, weight_quant, BLOCK_SIZE)
    }

    pub fn doc_count(&self) -> u32 {
        self.doc_count
    }

    pub fn num_blocks(&self) -> usize {
        self.blocks.len()
    }

    pub fn global_max_weight(&self) -> f32 {
        self.blocks
            .iter()
            .map(|b| b.header.max_weight)
            .fold(0.0f32, f32::max)
    }

    pub fn block_max_weight(&self, block_idx: usize) -> Option<f32> {
        self.blocks.get(block_idx).map(|b| b.header.max_weight)
    }

    pub fn size_bytes(&self) -> usize {
        use std::mem::size_of;

        let header_size = size_of::<u32>() * 2;
        let blocks_size: usize = self
            .blocks
            .iter()
            .map(|b| {
                size_of::<BlockHeader>()
                    + b.doc_ids_data.len()
                    + b.ordinals_data.len()
                    + b.weights_data.len()
            })
            .sum();
        header_size + blocks_size
    }

    pub fn iterator(&self) -> BlockSparsePostingIterator<'_> {
        BlockSparsePostingIterator::new(self)
    }

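    /// Serializes the full posting list: doc_count (u32), global max
    /// weight (f32), block count (u32), one `SparseSkipEntry` per block,
    /// then the concatenated block bytes.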
    pub fn serialize<W: Write>(&self, w: &mut W) -> io::Result<()> {
        use super::SparseSkipEntry;

        w.write_u32::<LittleEndian>(self.doc_count)?;
        w.write_f32::<LittleEndian>(self.global_max_weight())?;
        w.write_u32::<LittleEndian>(self.blocks.len() as u32)?;

        let mut block_bytes: Vec<Vec<u8>> = Vec::with_capacity(self.blocks.len());
        for block in &self.blocks {
            let mut buf = Vec::new();
            block.write(&mut buf)?;
            block_bytes.push(buf);
        }

        let mut offset = 0u32;
        for (block, bytes) in self.blocks.iter().zip(block_bytes.iter()) {
            let first_doc = block.header.first_doc_id;
            let doc_ids = block.decode_doc_ids();
            let last_doc = doc_ids.last().copied().unwrap_or(first_doc);
            let length = bytes.len() as u32;

            let entry =
                SparseSkipEntry::new(first_doc, last_doc, offset, length, block.header.max_weight);
            entry.write(w)?;
            offset += length;
        }

        for bytes in block_bytes {
            w.write_all(&bytes)?;
        }

        Ok(())
    }

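    /// Serializes only the block payloads, returning the raw block bytes
    /// together with the skip entries so the caller can store the skip
    /// table separately from the block data.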
    pub fn serialize_v3(&self) -> io::Result<(Vec<u8>, Vec<super::SparseSkipEntry>)> {
        let mut block_data = Vec::new();
        let mut skip_entries = Vec::with_capacity(self.blocks.len());
        let mut offset = 0u32;

        for block in &self.blocks {
            let mut buf = Vec::new();
            block.write(&mut buf)?;
            let length = buf.len() as u32;

            let first_doc = block.header.first_doc_id;
            let doc_ids = block.decode_doc_ids();
            let last_doc = doc_ids.last().copied().unwrap_or(first_doc);

            skip_entries.push(super::SparseSkipEntry::new(
                first_doc,
                last_doc,
                offset,
                length,
                block.header.max_weight,
            ));

            block_data.extend_from_slice(&buf);
            offset += length;
        }

        Ok((block_data, skip_entries))
    }

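    /// Deserializes a posting list written by [`serialize`](Self::serialize).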
    pub fn deserialize<R: Read>(r: &mut R) -> io::Result<Self> {
        use super::SparseSkipEntry;

        let doc_count = r.read_u32::<LittleEndian>()?;
        let _global_max_weight = r.read_f32::<LittleEndian>()?;
        let num_blocks = r.read_u32::<LittleEndian>()? as usize;

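        // The skip table is only needed for random access; when decoding
        // everything sequentially it is read and discarded.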
        for _ in 0..num_blocks {
            let _ = SparseSkipEntry::read(r)?;
        }

        let mut blocks = Vec::with_capacity(num_blocks);
        for _ in 0..num_blocks {
            blocks.push(SparseBlock::read(r)?);
        }
        Ok(Self { doc_count, blocks })
    }

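    /// Reads only the list header and skip table, returning
    /// `(doc_count, global_max_weight, skip_entries, header_size_bytes)`
    /// so callers can fetch individual blocks lazily.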
    pub fn deserialize_header<R: Read>(
        r: &mut R,
    ) -> io::Result<(u32, f32, Vec<super::SparseSkipEntry>, usize)> {
        use super::SparseSkipEntry;

        let doc_count = r.read_u32::<LittleEndian>()?;
        let global_max_weight = r.read_f32::<LittleEndian>()?;
        let num_blocks = r.read_u32::<LittleEndian>()? as usize;

        let mut entries = Vec::with_capacity(num_blocks);
        for _ in 0..num_blocks {
            entries.push(SparseSkipEntry::read(r)?);
        }

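        // 12 fixed bytes (doc_count, global max weight, block count) plus
        // one skip entry per block.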
        let header_size = 4 + 4 + 4 + num_blocks * SparseSkipEntry::SIZE;

        Ok((doc_count, global_max_weight, entries, header_size))
    }

    pub fn decode_all(&self) -> Vec<(DocId, u16, f32)> {
        let total_postings: usize = self.blocks.iter().map(|b| b.header.count as usize).sum();
        let mut result = Vec::with_capacity(total_postings);
        for block in &self.blocks {
            let doc_ids = block.decode_doc_ids();
            let ordinals = block.decode_ordinals();
            let weights = block.decode_weights();
            for i in 0..block.header.count as usize {
                result.push((doc_ids[i], ordinals[i], weights[i]));
            }
        }
        result
    }

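    /// Merges posting lists from multiple segments by rebasing each list's
    /// blocks with its segment's doc-id offset. Blocks are reused as-is
    /// (only the header's first doc id changes), so no postings are
    /// re-encoded. Callers are expected to supply `(list, offset)` pairs in
    /// increasing doc-id order with non-overlapping ranges so the result
    /// stays sorted.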
    pub fn merge_with_offsets(lists: &[(&BlockSparsePostingList, u32)]) -> Self {
        if lists.is_empty() {
            return Self {
                doc_count: 0,
                blocks: Vec::new(),
            };
        }

        let total_blocks: usize = lists.iter().map(|(pl, _)| pl.blocks.len()).sum();
        let total_docs: u32 = lists.iter().map(|(pl, _)| pl.doc_count).sum();

        let mut merged_blocks = Vec::with_capacity(total_blocks);

        for (posting_list, doc_offset) in lists {
            for block in &posting_list.blocks {
                merged_blocks.push(block.with_doc_offset(*doc_offset));
            }
        }

        Self {
            doc_count: total_docs,
            blocks: merged_blocks,
        }
    }

    fn find_block(&self, target: DocId) -> Option<usize> {
        if self.blocks.is_empty() {
            return None;
        }
        let idx = self
            .blocks
            .partition_point(|b| b.header.first_doc_id <= target);
        if idx == 0 {
            Some(0)
        } else {
            Some(idx - 1)
        }
    }
}

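/// Cursor over a [`BlockSparsePostingList`] that decodes one block at a
/// time and supports `advance`/`seek` in the usual docset style.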
pub struct BlockSparsePostingIterator<'a> {
    posting_list: &'a BlockSparsePostingList,
    block_idx: usize,
    in_block_idx: usize,
    current_doc_ids: Vec<DocId>,
    current_ordinals: Vec<u16>,
    current_weights: Vec<f32>,
    exhausted: bool,
}

impl<'a> BlockSparsePostingIterator<'a> {
    fn new(posting_list: &'a BlockSparsePostingList) -> Self {
        let mut iter = Self {
            posting_list,
            block_idx: 0,
            in_block_idx: 0,
            current_doc_ids: Vec::new(),
            current_ordinals: Vec::new(),
            current_weights: Vec::new(),
            exhausted: posting_list.blocks.is_empty(),
        };
        if !iter.exhausted {
            iter.load_block(0);
        }
        iter
    }

    fn load_block(&mut self, block_idx: usize) {
        if let Some(block) = self.posting_list.blocks.get(block_idx) {
            block.decode_doc_ids_into(&mut self.current_doc_ids);
            block.decode_ordinals_into(&mut self.current_ordinals);
            block.decode_weights_into(&mut self.current_weights);
            self.block_idx = block_idx;
            self.in_block_idx = 0;
        }
    }

    pub fn doc(&self) -> DocId {
        if self.exhausted {
            TERMINATED
        } else {
            self.current_doc_ids
                .get(self.in_block_idx)
                .copied()
                .unwrap_or(TERMINATED)
        }
    }

    pub fn weight(&self) -> f32 {
        self.current_weights
            .get(self.in_block_idx)
            .copied()
            .unwrap_or(0.0)
    }

    pub fn ordinal(&self) -> u16 {
        self.current_ordinals
            .get(self.in_block_idx)
            .copied()
            .unwrap_or(0)
    }

    pub fn advance(&mut self) -> DocId {
        if self.exhausted {
            return TERMINATED;
        }
        self.in_block_idx += 1;
        if self.in_block_idx >= self.current_doc_ids.len() {
            self.block_idx += 1;
            if self.block_idx >= self.posting_list.blocks.len() {
                self.exhausted = true;
            } else {
                self.load_block(self.block_idx);
            }
        }
        self.doc()
    }

    pub fn seek(&mut self, target: DocId) -> DocId {
        if self.exhausted {
            return TERMINATED;
        }
        if self.doc() >= target {
            return self.doc();
        }

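        // Fast path: the target falls inside the block that is already
        // decoded, so scan forward within it.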
        if let Some(&last_doc) = self.current_doc_ids.last()
            && last_doc >= target
        {
            let remaining = &self.current_doc_ids[self.in_block_idx..];
            let pos = crate::structures::simd::find_first_ge_u32(remaining, target);
            self.in_block_idx += pos;
            if self.in_block_idx >= self.current_doc_ids.len() {
                self.block_idx += 1;
                if self.block_idx >= self.posting_list.blocks.len() {
                    self.exhausted = true;
                } else {
                    self.load_block(self.block_idx);
                }
            }
            return self.doc();
        }

        if let Some(block_idx) = self.posting_list.find_block(target) {
            self.load_block(block_idx);
            let pos = crate::structures::simd::find_first_ge_u32(&self.current_doc_ids, target);
            self.in_block_idx = pos;
            if self.in_block_idx >= self.current_doc_ids.len() {
                self.block_idx += 1;
                if self.block_idx >= self.posting_list.blocks.len() {
                    self.exhausted = true;
                } else {
                    self.load_block(self.block_idx);
                }
            }
        } else {
            self.exhausted = true;
        }
        self.doc()
    }

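    /// Jumps directly to the first posting of the next block, e.g. when
    /// block-max scoring shows the rest of the current block cannot
    /// compete.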
    pub fn skip_to_next_block(&mut self) -> DocId {
        if self.exhausted {
            return TERMINATED;
        }
        let next = self.block_idx + 1;
        if next >= self.posting_list.blocks.len() {
            self.exhausted = true;
            return TERMINATED;
        }
        self.load_block(next);
        self.doc()
    }

    pub fn is_exhausted(&self) -> bool {
        self.exhausted
    }

    pub fn current_block_max_weight(&self) -> f32 {
        self.posting_list
            .blocks
            .get(self.block_idx)
            .map(|b| b.header.max_weight)
            .unwrap_or(0.0)
    }

    pub fn current_block_max_contribution(&self, query_weight: f32) -> f32 {
        query_weight * self.current_block_max_weight()
    }
}

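/// Returns the smallest bit width that can represent every value in
/// `values` (0 when the slice is empty or all values are zero).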
fn find_optimal_bit_width(values: &[u32]) -> u8 {
    if values.is_empty() {
        return 0;
    }
    let max_val = values.iter().copied().max().unwrap_or(0);
    simd::bits_needed(max_val)
}

fn bits_needed_u16(val: u16) -> u8 {
    if val == 0 {
        0
    } else {
        16 - val.leading_zeros() as u8
    }
}

fn pack_bit_array(values: &[u32], bits: u8) -> Vec<u8> {
    if bits == 0 || values.is_empty() {
        return Vec::new();
    }
    let total_bytes = (values.len() * bits as usize).div_ceil(8);
    let mut result = vec![0u8; total_bytes];
    // Widen to u64 so the mask does not overflow when bits == 32.
    let mask = ((1u64 << bits) - 1) as u32;
    let mut bit_pos = 0usize;
    for &val in values {
        pack_value(&mut result, bit_pos, val & mask, bits);
        bit_pos += bits as usize;
    }
    result
}

fn pack_bit_array_u16(values: &[u16], bits: u8) -> Vec<u8> {
    if bits == 0 || values.is_empty() {
        return Vec::new();
    }
    let total_bytes = (values.len() * bits as usize).div_ceil(8);
    let mut result = vec![0u8; total_bytes];
    let mut bit_pos = 0usize;
    for &val in values {
        pack_value(
            &mut result,
            bit_pos,
            (val as u32) & ((1u32 << bits) - 1),
            bits,
        );
        bit_pos += bits as usize;
    }
    result
}

#[inline]
fn pack_value(data: &mut [u8], bit_pos: usize, val: u32, bits: u8) {
    let mut remaining = bits as usize;
    let mut val = val;
    let mut byte = bit_pos / 8;
    let mut offset = bit_pos % 8;
    while remaining > 0 {
        let space = 8 - offset;
        let to_write = remaining.min(space);
        let mask = (1u32 << to_write) - 1;
        data[byte] |= ((val & mask) as u8) << offset;
        val >>= to_write;
        remaining -= to_write;
        byte += 1;
        offset = 0;
    }
}

fn unpack_bit_array(data: &[u8], bits: u8, count: usize) -> Vec<u32> {
    if bits == 0 || count == 0 {
        return vec![0; count];
    }
    let mut result = Vec::with_capacity(count);
    let mut bit_pos = 0usize;
    for _ in 0..count {
        result.push(unpack_value(data, bit_pos, bits));
        bit_pos += bits as usize;
    }
    result
}

fn unpack_bit_array_u16_into(data: &[u8], bits: u8, count: usize, out: &mut Vec<u16>) {
    if bits == 0 || count == 0 {
        out.resize(count, 0u16);
        return;
    }
    let mut bit_pos = 0usize;
    for _ in 0..count {
        out.push(unpack_value(data, bit_pos, bits) as u16);
        bit_pos += bits as usize;
    }
}

#[inline]
fn unpack_value(data: &[u8], bit_pos: usize, bits: u8) -> u32 {
    let mut val = 0u32;
    let mut remaining = bits as usize;
    let mut byte = bit_pos / 8;
    let mut offset = bit_pos % 8;
    let mut shift = 0;
    while remaining > 0 {
        let space = 8 - offset;
        let to_read = remaining.min(space);
        // Widen to u16 so the mask does not overflow when to_read == 8.
        let mask = ((1u16 << to_read) - 1) as u8;
        val |= (((data.get(byte).copied().unwrap_or(0) >> offset) & mask) as u32) << shift;
        remaining -= to_read;
        shift += to_read;
        byte += 1;
        offset = 0;
    }
    val
}

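/// Encodes weights per the chosen quantization. The scalar schemes
/// (UInt8/UInt4) store `scale` and `min` as two f32s ahead of the
/// quantized values; UInt4 packs two values per byte, low nibble first.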
fn encode_weights(weights: &[f32], quant: WeightQuantization) -> io::Result<Vec<u8>> {
    let mut data = Vec::new();
    match quant {
        WeightQuantization::Float32 => {
            for &w in weights {
                data.write_f32::<LittleEndian>(w)?;
            }
        }
        WeightQuantization::Float16 => {
            use half::f16;
            for &w in weights {
                data.write_u16::<LittleEndian>(f16::from_f32(w).to_bits())?;
            }
        }
        WeightQuantization::UInt8 => {
            let min = weights.iter().copied().fold(f32::INFINITY, f32::min);
            let max = weights.iter().copied().fold(f32::NEG_INFINITY, f32::max);
            let range = max - min;
            let scale = if range < f32::EPSILON {
                1.0
            } else {
                range / 255.0
            };
            data.write_f32::<LittleEndian>(scale)?;
            data.write_f32::<LittleEndian>(min)?;
            for &w in weights {
                data.write_u8(((w - min) / scale).round() as u8)?;
            }
        }
        WeightQuantization::UInt4 => {
            let min = weights.iter().copied().fold(f32::INFINITY, f32::min);
            let max = weights.iter().copied().fold(f32::NEG_INFINITY, f32::max);
            let range = max - min;
            let scale = if range < f32::EPSILON {
                1.0
            } else {
                range / 15.0
            };
            data.write_f32::<LittleEndian>(scale)?;
            data.write_f32::<LittleEndian>(min)?;
            let mut i = 0;
            while i < weights.len() {
                let q1 = ((weights[i] - min) / scale).round() as u8 & 0x0F;
                let q2 = if i + 1 < weights.len() {
                    ((weights[i + 1] - min) / scale).round() as u8 & 0x0F
                } else {
                    0
                };
                data.write_u8((q2 << 4) | q1)?;
                i += 2;
            }
        }
    }
    Ok(data)
}

fn decode_weights_into(data: &[u8], quant: WeightQuantization, count: usize, out: &mut Vec<f32>) {
    let mut cursor = Cursor::new(data);
    match quant {
        WeightQuantization::Float32 => {
            for _ in 0..count {
                out.push(cursor.read_f32::<LittleEndian>().unwrap_or(0.0));
            }
        }
        WeightQuantization::Float16 => {
            use half::f16;
            for _ in 0..count {
                let bits = cursor.read_u16::<LittleEndian>().unwrap_or(0);
                out.push(f16::from_bits(bits).to_f32());
            }
        }
        WeightQuantization::UInt8 => {
            let scale = cursor.read_f32::<LittleEndian>().unwrap_or(1.0);
            let min = cursor.read_f32::<LittleEndian>().unwrap_or(0.0);
            for _ in 0..count {
                let q = cursor.read_u8().unwrap_or(0);
                out.push(q as f32 * scale + min);
            }
        }
        WeightQuantization::UInt4 => {
            let scale = cursor.read_f32::<LittleEndian>().unwrap_or(1.0);
            let min = cursor.read_f32::<LittleEndian>().unwrap_or(0.0);
            let mut i = 0;
            while i < count {
                let byte = cursor.read_u8().unwrap_or(0);
                out.push((byte & 0x0F) as f32 * scale + min);
                i += 1;
                if i < count {
                    out.push((byte >> 4) as f32 * scale + min);
                    i += 1;
                }
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_block_roundtrip() {
        let postings = vec![
            (10u32, 0u16, 1.5f32),
            (15, 0, 2.0),
            (20, 1, 0.5),
            (100, 0, 3.0),
        ];
        let block = SparseBlock::from_postings(&postings, WeightQuantization::Float32).unwrap();

        assert_eq!(block.decode_doc_ids(), vec![10, 15, 20, 100]);
        assert_eq!(block.decode_ordinals(), vec![0, 0, 1, 0]);
        let weights = block.decode_weights();
        assert!((weights[0] - 1.5).abs() < 0.01);
    }

    #[test]
    fn test_posting_list() {
        let postings: Vec<(DocId, u16, f32)> =
            (0..300).map(|i| (i * 2, 0, i as f32 * 0.1)).collect();
        let list =
            BlockSparsePostingList::from_postings(&postings, WeightQuantization::Float32).unwrap();

        assert_eq!(list.doc_count(), 300);
        assert_eq!(list.num_blocks(), 3);

        let mut iter = list.iterator();
        assert_eq!(iter.doc(), 0);
        iter.advance();
        assert_eq!(iter.doc(), 2);
    }

    #[test]
    fn test_serialization() {
        let postings = vec![(1u32, 0u16, 0.5f32), (10, 1, 1.5), (100, 0, 2.5)];
        let list =
            BlockSparsePostingList::from_postings(&postings, WeightQuantization::UInt8).unwrap();

        let mut buf = Vec::new();
        list.serialize(&mut buf).unwrap();
        let list2 = BlockSparsePostingList::deserialize(&mut Cursor::new(&buf)).unwrap();

        assert_eq!(list.doc_count(), list2.doc_count());
    }

    #[test]
    fn test_seek() {
        let postings: Vec<(DocId, u16, f32)> = (0..500).map(|i| (i * 3, 0, i as f32)).collect();
        let list =
            BlockSparsePostingList::from_postings(&postings, WeightQuantization::Float32).unwrap();

        let mut iter = list.iterator();
        assert_eq!(iter.seek(300), 300);
        assert_eq!(iter.seek(301), 303);
        assert_eq!(iter.seek(2000), TERMINATED);
    }

    #[test]
    fn test_merge_with_offsets() {
        let postings1: Vec<(DocId, u16, f32)> = vec![(0, 0, 1.0), (5, 0, 2.0), (10, 1, 3.0)];
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();

        let postings2: Vec<(DocId, u16, f32)> = vec![(0, 0, 4.0), (3, 1, 5.0), (7, 0, 6.0)];
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 100)]);

        assert_eq!(merged.doc_count(), 6);

        let decoded = merged.decode_all();
        assert_eq!(decoded.len(), 6);

        // Segment 1 doc ids are unchanged.
        assert_eq!(decoded[0].0, 0);
        assert_eq!(decoded[1].0, 5);
        assert_eq!(decoded[2].0, 10);

        // Segment 2 doc ids are shifted by the offset of 100.
        assert_eq!(decoded[3].0, 100);
        assert_eq!(decoded[4].0, 103);
        assert_eq!(decoded[5].0, 107);

        // Weights and ordinals survive the merge.
        assert!((decoded[0].2 - 1.0).abs() < 0.01);
        assert!((decoded[3].2 - 4.0).abs() < 0.01);

        assert_eq!(decoded[2].1, 1);
        assert_eq!(decoded[4].1, 1);
    }

    #[test]
    fn test_merge_with_offsets_multi_block() {
        let postings1: Vec<(DocId, u16, f32)> = (0..200).map(|i| (i * 2, 0, i as f32)).collect();
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();
        assert!(list1.num_blocks() > 1, "Should have multiple blocks");

        let postings2: Vec<(DocId, u16, f32)> = (0..150).map(|i| (i * 3, 1, i as f32)).collect();
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 1000)]);

        assert_eq!(merged.doc_count(), 350);
        assert_eq!(merged.num_blocks(), list1.num_blocks() + list2.num_blocks());

        let mut iter = merged.iterator();

        assert_eq!(iter.doc(), 0);

        // Seek across the segment boundary into the rebased blocks.
        let doc = iter.seek(1000);
        assert_eq!(doc, 1000);
        iter.advance();
        assert_eq!(iter.doc(), 1003);
    }

    #[test]
    fn test_merge_with_offsets_serialize_roundtrip() {
        let postings1: Vec<(DocId, u16, f32)> = vec![(0, 0, 1.0), (5, 0, 2.0), (10, 1, 3.0)];
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();

        let postings2: Vec<(DocId, u16, f32)> = vec![(0, 0, 4.0), (3, 1, 5.0), (7, 0, 6.0)];
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 100)]);

        let mut bytes = Vec::new();
        merged.serialize(&mut bytes).unwrap();

        let mut cursor = std::io::Cursor::new(&bytes);
        let loaded = BlockSparsePostingList::deserialize(&mut cursor).unwrap();

        let decoded = loaded.decode_all();
        assert_eq!(decoded.len(), 6);

        assert_eq!(decoded[0].0, 0);
        assert_eq!(decoded[1].0, 5);
        assert_eq!(decoded[2].0, 10);

        assert_eq!(decoded[3].0, 100, "First doc of seg2 should be 0+100=100");
        assert_eq!(decoded[4].0, 103, "Second doc of seg2 should be 3+100=103");
        assert_eq!(decoded[5].0, 107, "Third doc of seg2 should be 7+100=107");

        let mut iter = loaded.iterator();
        assert_eq!(iter.doc(), 0);
        iter.advance();
        assert_eq!(iter.doc(), 5);
        iter.advance();
        assert_eq!(iter.doc(), 10);
        iter.advance();
        assert_eq!(iter.doc(), 100);
        iter.advance();
        assert_eq!(iter.doc(), 103);
        iter.advance();
        assert_eq!(iter.doc(), 107);
    }

    #[test]
    fn test_merge_seek_after_roundtrip() {
        let postings1: Vec<(DocId, u16, f32)> = (0..200).map(|i| (i * 2, 0, 1.0)).collect();
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();

        let postings2: Vec<(DocId, u16, f32)> = (0..150).map(|i| (i * 3, 0, 2.0)).collect();
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 1000)]);

        let mut bytes = Vec::new();
        merged.serialize(&mut bytes).unwrap();
        let loaded =
            BlockSparsePostingList::deserialize(&mut std::io::Cursor::new(&bytes)).unwrap();

        let mut iter = loaded.iterator();

        let doc = iter.seek(100);
        assert_eq!(doc, 100, "Seek to 100 in segment 1");

        let doc = iter.seek(1000);
        assert_eq!(doc, 1000, "Seek to 1000 (first doc of segment 2)");

        let doc = iter.seek(1050);
        assert!(
            doc >= 1050,
            "Seek to 1050 should find doc >= 1050, got {}",
            doc
        );

        let doc = iter.seek(500);
        assert!(
            doc >= 1050,
            "Seek backwards should not go back, got {}",
            doc
        );

        let mut iter2 = loaded.iterator();

        let mut count = 0;
        let mut prev_doc = 0;
        while iter2.doc() != super::TERMINATED {
            let current = iter2.doc();
            if count > 0 {
                assert!(
                    current > prev_doc,
                    "Docs should be monotonically increasing: {} vs {}",
                    prev_doc,
                    current
                );
            }
            prev_doc = current;
            iter2.advance();
            count += 1;
        }
        assert_eq!(count, 350, "Should have 350 total docs");
    }

    #[test]
    fn test_doc_count_multi_value() {
        // Multi-value field: doc 0 has three ordinals and doc 5 has two.
        let postings: Vec<(DocId, u16, f32)> = vec![
            (0, 0, 1.0),
            (0, 1, 1.5),
            (0, 2, 2.0),
            (5, 0, 3.0),
            (5, 1, 3.5),
            (10, 0, 4.0),
        ];
        let list =
            BlockSparsePostingList::from_postings(&postings, WeightQuantization::Float32).unwrap();

        // Three unique documents, six postings.
        assert_eq!(list.doc_count(), 3);

        let decoded = list.decode_all();
        assert_eq!(decoded.len(), 6);
    }

    #[test]
    fn test_zero_copy_merge_patches_first_doc_id() {
        use crate::structures::SparseSkipEntry;

        let postings1: Vec<(DocId, u16, f32)> = (0..200).map(|i| (i * 2, 0, i as f32)).collect();
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();
        assert!(list1.num_blocks() > 1);

        let postings2: Vec<(DocId, u16, f32)> = (0..150).map(|i| (i * 3, 1, i as f32)).collect();
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        let mut bytes1 = Vec::new();
        list1.serialize(&mut bytes1).unwrap();
        let mut bytes2 = Vec::new();
        list2.serialize(&mut bytes2).unwrap();

        // Parse the serialized layout: doc_count, global max, skip table,
        // then the raw block bytes.
        fn parse_raw(data: &[u8]) -> (u32, f32, Vec<SparseSkipEntry>, &[u8]) {
            let doc_count = u32::from_le_bytes(data[0..4].try_into().unwrap());
            let global_max = f32::from_le_bytes(data[4..8].try_into().unwrap());
            let num_blocks = u32::from_le_bytes(data[8..12].try_into().unwrap()) as usize;
            let mut pos = 12;
            let mut skip = Vec::new();
            for _ in 0..num_blocks {
                let first_doc = u32::from_le_bytes(data[pos..pos + 4].try_into().unwrap());
                let last_doc = u32::from_le_bytes(data[pos + 4..pos + 8].try_into().unwrap());
                let offset = u32::from_le_bytes(data[pos + 8..pos + 12].try_into().unwrap());
                let length = u32::from_le_bytes(data[pos + 12..pos + 16].try_into().unwrap());
                let max_w = f32::from_le_bytes(data[pos + 16..pos + 20].try_into().unwrap());
                skip.push(SparseSkipEntry::new(
                    first_doc, last_doc, offset, length, max_w,
                ));
                pos += 20;
            }
            (doc_count, global_max, skip, &data[pos..])
        }

        let (dc1, gm1, skip1, raw1) = parse_raw(&bytes1);
        let (dc2, gm2, skip2, raw2) = parse_raw(&bytes2);

        let doc_offset: u32 = 1000;
        let total_docs = dc1 + dc2;
        let global_max = gm1.max(gm2);
        let total_blocks = (skip1.len() + skip2.len()) as u32;

        let mut output = Vec::new();
        output.extend_from_slice(&total_docs.to_le_bytes());
        output.extend_from_slice(&global_max.to_le_bytes());
        output.extend_from_slice(&total_blocks.to_le_bytes());

        // Rewrite the skip tables: segment 1 entries keep their doc ids,
        // segment 2 entries get the doc offset; block offsets are rebased
        // into the concatenated data section.
        let mut block_data_offset = 0u32;
        for entry in &skip1 {
            let adjusted = SparseSkipEntry::new(
                entry.first_doc,
                entry.last_doc,
                block_data_offset + entry.offset,
                entry.length,
                entry.max_weight,
            );
            adjusted.write(&mut output).unwrap();
        }
        if let Some(last) = skip1.last() {
            block_data_offset += last.offset + last.length;
        }
        for entry in &skip2 {
            let adjusted = SparseSkipEntry::new(
                entry.first_doc + doc_offset,
                entry.last_doc + doc_offset,
                block_data_offset + entry.offset,
                entry.length,
                entry.max_weight,
            );
            adjusted.write(&mut output).unwrap();
        }

        output.extend_from_slice(raw1);

        // Patch each segment-2 block header in place: first_doc_id sits
        // 8 bytes into the header.
        const FIRST_DOC_ID_OFFSET: usize = 8;
        let mut buf2 = raw2.to_vec();
        for entry in &skip2 {
            let off = entry.offset as usize + FIRST_DOC_ID_OFFSET;
            if off + 4 <= buf2.len() {
                let old = u32::from_le_bytes(buf2[off..off + 4].try_into().unwrap());
                let patched = (old + doc_offset).to_le_bytes();
                buf2[off..off + 4].copy_from_slice(&patched);
            }
        }
        output.extend_from_slice(&buf2);

        let loaded = BlockSparsePostingList::deserialize(&mut Cursor::new(&output)).unwrap();
        assert_eq!(loaded.doc_count(), 350);

        let mut iter = loaded.iterator();

        assert_eq!(iter.doc(), 0);
        let doc = iter.seek(100);
        assert_eq!(doc, 100);
        let doc = iter.seek(398);
        assert_eq!(doc, 398);

        let doc = iter.seek(1000);
        assert_eq!(doc, 1000, "First doc of segment 2 should be 1000");
        iter.advance();
        assert_eq!(iter.doc(), 1003, "Second doc of segment 2 should be 1003");
        let doc = iter.seek(1447);
        assert_eq!(doc, 1447, "Last doc of segment 2 should be 1447");

        iter.advance();
        assert_eq!(iter.doc(), super::TERMINATED);

        // The byte-patched merge must match the block-level reference merge.
        let reference =
            BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, doc_offset)]);
        let mut ref_iter = reference.iterator();
        let mut zc_iter = loaded.iterator();
        while ref_iter.doc() != super::TERMINATED {
            assert_eq!(
                ref_iter.doc(),
                zc_iter.doc(),
                "Zero-copy and reference merge should produce identical doc_ids"
            );
            assert!(
                (ref_iter.weight() - zc_iter.weight()).abs() < 0.01,
                "Weights should match: {} vs {}",
                ref_iter.weight(),
                zc_iter.weight()
            );
            ref_iter.advance();
            zc_iter.advance();
        }
        assert_eq!(zc_iter.doc(), super::TERMINATED);
    }

    #[test]
    fn test_doc_count_single_value() {
        let postings: Vec<(DocId, u16, f32)> =
            vec![(0, 0, 1.0), (5, 0, 2.0), (10, 0, 3.0), (15, 0, 4.0)];
        let list =
            BlockSparsePostingList::from_postings(&postings, WeightQuantization::Float32).unwrap();

        assert_eq!(list.doc_count(), 4);
    }

    #[test]
    fn test_doc_count_multi_value_serialization_roundtrip() {
        let postings: Vec<(DocId, u16, f32)> =
            vec![(0, 0, 1.0), (0, 1, 1.5), (5, 0, 2.0), (5, 1, 2.5)];
        let list =
            BlockSparsePostingList::from_postings(&postings, WeightQuantization::Float32).unwrap();
        assert_eq!(list.doc_count(), 2);

        let mut buf = Vec::new();
        list.serialize(&mut buf).unwrap();
        let loaded = BlockSparsePostingList::deserialize(&mut Cursor::new(&buf)).unwrap();
        assert_eq!(loaded.doc_count(), 2);
    }

    #[test]
    fn test_merge_preserves_weights_and_ordinals() {
        let postings1: Vec<(DocId, u16, f32)> = vec![(0, 0, 1.5), (5, 1, 2.5), (10, 2, 3.5)];
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();

        let postings2: Vec<(DocId, u16, f32)> = vec![(0, 0, 4.5), (3, 1, 5.5), (7, 3, 6.5)];
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 100)]);

        let mut bytes = Vec::new();
        merged.serialize(&mut bytes).unwrap();
        let loaded =
            BlockSparsePostingList::deserialize(&mut std::io::Cursor::new(&bytes)).unwrap();

        let mut iter = loaded.iterator();

        assert_eq!(iter.doc(), 0);
        assert!(
            (iter.weight() - 1.5).abs() < 0.01,
            "Weight should be 1.5, got {}",
            iter.weight()
        );
        assert_eq!(iter.ordinal(), 0);

        iter.advance();
        assert_eq!(iter.doc(), 5);
        assert!(
            (iter.weight() - 2.5).abs() < 0.01,
            "Weight should be 2.5, got {}",
            iter.weight()
        );
        assert_eq!(iter.ordinal(), 1);

        iter.advance();
        assert_eq!(iter.doc(), 10);
        assert!(
            (iter.weight() - 3.5).abs() < 0.01,
            "Weight should be 3.5, got {}",
            iter.weight()
        );
        assert_eq!(iter.ordinal(), 2);

        iter.advance();
        assert_eq!(iter.doc(), 100);
        assert!(
            (iter.weight() - 4.5).abs() < 0.01,
            "Weight should be 4.5, got {}",
            iter.weight()
        );
        assert_eq!(iter.ordinal(), 0);

        iter.advance();
        assert_eq!(iter.doc(), 103);
        assert!(
            (iter.weight() - 5.5).abs() < 0.01,
            "Weight should be 5.5, got {}",
            iter.weight()
        );
        assert_eq!(iter.ordinal(), 1);

        iter.advance();
        assert_eq!(iter.doc(), 107);
        assert!(
            (iter.weight() - 6.5).abs() < 0.01,
            "Weight should be 6.5, got {}",
            iter.weight()
        );
        assert_eq!(iter.ordinal(), 3);

        iter.advance();
        assert_eq!(iter.doc(), super::TERMINATED);
    }

    #[test]
    fn test_merge_global_max_weight() {
        let postings1: Vec<(DocId, u16, f32)> = vec![
            (0, 0, 3.0),
            (1, 0, 7.0), // list1 max
            (2, 0, 2.0),
        ];
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();

        let postings2: Vec<(DocId, u16, f32)> = vec![
            (0, 0, 5.0),
            (1, 0, 4.0),
            (2, 0, 6.0), // list2 max
        ];
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        assert!((list1.global_max_weight() - 7.0).abs() < 0.01);
        assert!((list2.global_max_weight() - 6.0).abs() < 0.01);

        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 100)]);

        assert!(
            (merged.global_max_weight() - 7.0).abs() < 0.01,
            "Global max should be 7.0, got {}",
            merged.global_max_weight()
        );

        let mut bytes = Vec::new();
        merged.serialize(&mut bytes).unwrap();
        let loaded =
            BlockSparsePostingList::deserialize(&mut std::io::Cursor::new(&bytes)).unwrap();

        assert!(
            (loaded.global_max_weight() - 7.0).abs() < 0.01,
            "After roundtrip, global max should still be 7.0, got {}",
            loaded.global_max_weight()
        );
    }

    #[test]
    fn test_scoring_simulation_after_merge() {
        let postings1: Vec<(DocId, u16, f32)> = vec![(0, 0, 0.5), (5, 0, 0.8)];
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();

        let postings2: Vec<(DocId, u16, f32)> = vec![(0, 0, 0.6), (3, 0, 0.9)];
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 100)]);

        let mut bytes = Vec::new();
        merged.serialize(&mut bytes).unwrap();
        let loaded =
            BlockSparsePostingList::deserialize(&mut std::io::Cursor::new(&bytes)).unwrap();

        // Simulate dot-product scoring with a single query term.
        let query_weight = 2.0f32;
        let mut iter = loaded.iterator();

        assert_eq!(iter.doc(), 0);
        let score = query_weight * iter.weight();
        assert!(
            (score - 1.0).abs() < 0.01,
            "Doc 0 score should be 1.0, got {}",
            score
        );

        iter.advance();
        assert_eq!(iter.doc(), 5);
        let score = query_weight * iter.weight();
        assert!(
            (score - 1.6).abs() < 0.01,
            "Doc 5 score should be 1.6, got {}",
            score
        );

        iter.advance();
        assert_eq!(iter.doc(), 100);
        let score = query_weight * iter.weight();
        assert!(
            (score - 1.2).abs() < 0.01,
            "Doc 100 score should be 1.2, got {}",
            score
        );

        iter.advance();
        assert_eq!(iter.doc(), 103);
        let score = query_weight * iter.weight();
        assert!(
            (score - 1.8).abs() < 0.01,
            "Doc 103 score should be 1.8, got {}",
            score
        );
    }
}