use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use std::io::{self, Cursor, Read, Write};

use super::config::WeightQuantization;
use crate::DocId;
use crate::directories::OwnedBytes;
use crate::structures::postings::TERMINATED;
use crate::structures::simd;

pub const BLOCK_SIZE: usize = 128;
pub const MAX_BLOCK_SIZE: usize = 256;

#[derive(Debug, Clone, Copy)]
pub struct BlockHeader {
    pub count: u16,
    pub doc_id_bits: u8,
    pub ordinal_bits: u8,
    pub weight_quant: WeightQuantization,
    pub first_doc_id: DocId,
    pub max_weight: f32,
}

impl BlockHeader {
    pub const SIZE: usize = 16;

    pub fn write<W: Write>(&self, w: &mut W) -> io::Result<()> {
        w.write_u16::<LittleEndian>(self.count)?;
        w.write_u8(self.doc_id_bits)?;
        w.write_u8(self.ordinal_bits)?;
        w.write_u8(self.weight_quant as u8)?;
        // Reserved padding keeps the header at a fixed 16 bytes.
        w.write_u8(0)?;
        w.write_u16::<LittleEndian>(0)?;
        w.write_u32::<LittleEndian>(self.first_doc_id)?;
        w.write_f32::<LittleEndian>(self.max_weight)?;
        Ok(())
    }

    pub fn read<R: Read>(r: &mut R) -> io::Result<Self> {
        let count = r.read_u16::<LittleEndian>()?;
        let doc_id_bits = r.read_u8()?;
        let ordinal_bits = r.read_u8()?;
        let weight_quant_byte = r.read_u8()?;
        // Skip the reserved padding bytes.
        let _ = r.read_u8()?;
        let _ = r.read_u16::<LittleEndian>()?;
        let first_doc_id = r.read_u32::<LittleEndian>()?;
        let max_weight = r.read_f32::<LittleEndian>()?;

        let weight_quant = WeightQuantization::from_u8(weight_quant_byte)
            .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Invalid weight quant"))?;

        Ok(Self {
            count,
            doc_id_bits,
            ordinal_bits,
            weight_quant,
            first_doc_id,
            max_weight,
        })
    }
}

#[derive(Debug, Clone)]
pub struct SparseBlock {
    pub header: BlockHeader,
    /// Bit-packed doc-id deltas; the first doc id lives in the header.
    pub doc_ids_data: OwnedBytes,
    /// Bit-packed ordinals; empty when every ordinal is zero.
    pub ordinals_data: OwnedBytes,
    /// Weights encoded according to `header.weight_quant`.
    pub weights_data: OwnedBytes,
}

impl SparseBlock {
    pub fn from_postings(
        postings: &[(DocId, u16, f32)],
        weight_quant: WeightQuantization,
    ) -> io::Result<Self> {
        assert!(!postings.is_empty() && postings.len() <= MAX_BLOCK_SIZE);

        let count = postings.len();
        let first_doc_id = postings[0].0;

        // Delta-encode doc ids; the first delta is forced to zero since the
        // first doc id is stored in the header.
        let mut deltas = Vec::with_capacity(count);
        let mut prev = first_doc_id;
        for &(doc_id, _, _) in postings {
            deltas.push(doc_id.saturating_sub(prev));
            prev = doc_id;
        }
        deltas[0] = 0;

        let doc_id_bits = simd::round_bit_width(find_optimal_bit_width(&deltas[1..]));
        let ordinals: Vec<u16> = postings.iter().map(|(_, o, _)| *o).collect();
        let max_ordinal = ordinals.iter().copied().max().unwrap_or(0);
        let ordinal_bits = if max_ordinal == 0 {
            0
        } else {
            simd::round_bit_width(bits_needed_u16(max_ordinal))
        };

        let weights: Vec<f32> = postings.iter().map(|(_, _, w)| *w).collect();
        let max_weight = weights
            .iter()
            .copied()
            .fold(0.0f32, |acc, w| acc.max(w.abs()));

        let doc_ids_data = OwnedBytes::new({
            let rounded = simd::RoundedBitWidth::from_u8(doc_id_bits);
            let num_deltas = count - 1;
            let byte_count = num_deltas * rounded.bytes_per_value();
            let mut data = vec![0u8; byte_count];
            simd::pack_rounded(&deltas[1..], rounded, &mut data);
            data
        });
        let ordinals_data = OwnedBytes::new(if ordinal_bits > 0 {
            let rounded = simd::RoundedBitWidth::from_u8(ordinal_bits);
            let byte_count = count * rounded.bytes_per_value();
            let mut data = vec![0u8; byte_count];
            let ord_u32: Vec<u32> = ordinals.iter().map(|&o| o as u32).collect();
            simd::pack_rounded(&ord_u32, rounded, &mut data);
            data
        } else {
            Vec::new()
        });
        let weights_data = OwnedBytes::new(encode_weights(&weights, weight_quant)?);

        Ok(Self {
            header: BlockHeader {
                count: count as u16,
                doc_id_bits,
                ordinal_bits,
                weight_quant,
                first_doc_id,
                max_weight,
            },
            doc_ids_data,
            ordinals_data,
            weights_data,
        })
    }

    pub fn decode_doc_ids(&self) -> Vec<DocId> {
        let mut out = Vec::with_capacity(self.header.count as usize);
        self.decode_doc_ids_into(&mut out);
        out
    }

    /// Decodes absolute doc ids into `out`, reusing its allocation.
    pub fn decode_doc_ids_into(&self, out: &mut Vec<DocId>) {
        let count = self.header.count as usize;
        out.clear();
        out.resize(count, 0);
        out[0] = self.header.first_doc_id;

        if count > 1 {
            let bits = self.header.doc_id_bits;
            if bits == 0 {
                // Bit width 0 means every delta is zero: all entries share
                // the first doc id (multi-value postings for one document).
                out[1..].fill(self.header.first_doc_id);
            } else {
                // Unpack the deltas for entries 1..count, then prefix-sum
                // them into absolute doc ids.
                simd::unpack_rounded(
                    &self.doc_ids_data,
                    simd::RoundedBitWidth::from_u8(bits),
                    &mut out[1..],
                    count - 1,
                );
                for i in 1..count {
                    out[i] += out[i - 1];
                }
            }
        }
    }

    pub fn decode_ordinals(&self) -> Vec<u16> {
        let mut out = Vec::with_capacity(self.header.count as usize);
        self.decode_ordinals_into(&mut out);
        out
    }

    /// Decodes per-posting ordinals into `out`; all zeros when the block
    /// was built without ordinals (`ordinal_bits == 0`).
    pub fn decode_ordinals_into(&self, out: &mut Vec<u16>) {
        let count = self.header.count as usize;
        out.clear();
        if self.header.ordinal_bits == 0 {
            out.resize(count, 0u16);
        } else {
            // Blocks may hold up to MAX_BLOCK_SIZE postings, so the scratch
            // buffer must be sized for that, not just BLOCK_SIZE.
            let mut temp = [0u32; MAX_BLOCK_SIZE];
            simd::unpack_rounded(
                &self.ordinals_data,
                simd::RoundedBitWidth::from_u8(self.header.ordinal_bits),
                &mut temp[..count],
                count,
            );
            out.reserve(count);
            for &v in &temp[..count] {
                out.push(v as u16);
            }
        }
    }

    pub fn decode_weights(&self) -> Vec<f32> {
        let mut out = Vec::with_capacity(self.header.count as usize);
        self.decode_weights_into(&mut out);
        out
    }

    /// Decodes dequantized weights into `out`, reusing its allocation.
    pub fn decode_weights_into(&self, out: &mut Vec<f32>) {
        out.clear();
        decode_weights_into(
            &self.weights_data,
            self.header.weight_quant,
            self.header.count as usize,
            out,
        );
    }

    /// Decodes weights with `query_weight` already multiplied in.
    ///
    /// For UInt8 blocks the query weight is folded into the dequantization
    /// scale and bias, so scoring needs no separate multiply pass.
    pub fn decode_scored_weights_into(&self, query_weight: f32, out: &mut Vec<f32>) {
        out.clear();
        let count = self.header.count as usize;
        match self.header.weight_quant {
            WeightQuantization::UInt8 if self.weights_data.len() >= 8 => {
                // The UInt8 payload starts with an f32 scale and f32 minimum.
                let scale = f32::from_le_bytes([
                    self.weights_data[0],
                    self.weights_data[1],
                    self.weights_data[2],
                    self.weights_data[3],
                ]);
                let min_val = f32::from_le_bytes([
                    self.weights_data[4],
                    self.weights_data[5],
                    self.weights_data[6],
                    self.weights_data[7],
                ]);
                let eff_scale = query_weight * scale;
                let eff_bias = query_weight * min_val;
                out.resize(count, 0.0);
                simd::dequantize_uint8(&self.weights_data[8..], out, eff_scale, eff_bias, count);
            }
            _ => {
                decode_weights_into(&self.weights_data, self.header.weight_quant, count, out);
                for w in out.iter_mut() {
                    *w *= query_weight;
                }
            }
        }
    }

    /// Scores this block directly into a flat accumulator.
    ///
    /// `flat_scores` is indexed by `doc_id - base_doc`; documents touched
    /// for the first time are recorded in `dirty`. Returns the number of
    /// postings scored.
    #[inline]
    pub fn accumulate_scored_weights(
        &self,
        query_weight: f32,
        doc_ids: &[u32],
        flat_scores: &mut [f32],
        base_doc: u32,
        dirty: &mut Vec<u32>,
    ) -> usize {
        let count = self.header.count as usize;
        match self.header.weight_quant {
            WeightQuantization::UInt8 if self.weights_data.len() >= 8 => {
                let scale = f32::from_le_bytes([
                    self.weights_data[0],
                    self.weights_data[1],
                    self.weights_data[2],
                    self.weights_data[3],
                ]);
                let min_val = f32::from_le_bytes([
                    self.weights_data[4],
                    self.weights_data[5],
                    self.weights_data[6],
                    self.weights_data[7],
                ]);
                let eff_scale = query_weight * scale;
                let eff_bias = query_weight * min_val;
                let quant_data = &self.weights_data[8..];

                for i in 0..count.min(quant_data.len()) {
                    let w = quant_data[i] as f32 * eff_scale + eff_bias;
                    let off = (doc_ids[i] - base_doc) as usize;
                    if flat_scores[off] == 0.0 {
                        dirty.push(doc_ids[i]);
                    }
                    flat_scores[off] += w;
                }
                count
            }
            _ => {
                let mut weights_buf = Vec::with_capacity(count);
                decode_weights_into(
                    &self.weights_data,
                    self.header.weight_quant,
                    count,
                    &mut weights_buf,
                );
                for i in 0..count {
                    let w = weights_buf[i] * query_weight;
                    let off = (doc_ids[i] - base_doc) as usize;
                    if flat_scores[off] == 0.0 {
                        dirty.push(doc_ids[i]);
                    }
                    flat_scores[off] += w;
                }
                count
            }
        }
    }

    pub fn write<W: Write>(&self, w: &mut W) -> io::Result<()> {
        self.header.write(w)?;
        // A table of three u16 section lengths plus one u16 of padding
        // precedes the payload.
        w.write_u16::<LittleEndian>(self.doc_ids_data.len() as u16)?;
        w.write_u16::<LittleEndian>(self.ordinals_data.len() as u16)?;
        w.write_u16::<LittleEndian>(self.weights_data.len() as u16)?;
        w.write_u16::<LittleEndian>(0)?;
        w.write_all(&self.doc_ids_data)?;
        w.write_all(&self.ordinals_data)?;
        w.write_all(&self.weights_data)?;
        Ok(())
    }

    pub fn read<R: Read>(r: &mut R) -> io::Result<Self> {
        let header = BlockHeader::read(r)?;
        let doc_ids_len = r.read_u16::<LittleEndian>()? as usize;
        let ordinals_len = r.read_u16::<LittleEndian>()? as usize;
        let weights_len = r.read_u16::<LittleEndian>()? as usize;
        let _ = r.read_u16::<LittleEndian>()?;

        let mut doc_ids_vec = vec![0u8; doc_ids_len];
        r.read_exact(&mut doc_ids_vec)?;
        let mut ordinals_vec = vec![0u8; ordinals_len];
        r.read_exact(&mut ordinals_vec)?;
        let mut weights_vec = vec![0u8; weights_len];
        r.read_exact(&mut weights_vec)?;

        Ok(Self {
            header,
            doc_ids_data: OwnedBytes::new(doc_ids_vec),
            ordinals_data: OwnedBytes::new(ordinals_vec),
            weights_data: OwnedBytes::new(weights_vec),
        })
    }

    /// Parses a serialized block from owned bytes without copying: the
    /// three payload sections become slices into the original allocation.
    pub fn from_owned_bytes(data: crate::directories::OwnedBytes) -> crate::Result<Self> {
        let b = data.as_slice();
        if b.len() < BlockHeader::SIZE + 8 {
            return Err(crate::Error::Corruption(
                "sparse block too small".to_string(),
            ));
        }
        let mut cursor = Cursor::new(&b[..BlockHeader::SIZE]);
        let header =
            BlockHeader::read(&mut cursor).map_err(|e| crate::Error::Corruption(e.to_string()))?;

        if header.count == 0 {
            let hex: String = b
                .iter()
                .take(32)
                .map(|x| format!("{x:02x}"))
                .collect::<Vec<_>>()
                .join(" ");
            return Err(crate::Error::Corruption(format!(
                "sparse block has count=0 (data_len={}, first_32_bytes=[{}])",
                b.len(),
                hex
            )));
        }

        // The section length table (three u16 lengths plus u16 padding)
        // sits right after the header.
        let p = BlockHeader::SIZE;
        let doc_ids_len = u16::from_le_bytes([b[p], b[p + 1]]) as usize;
        let ordinals_len = u16::from_le_bytes([b[p + 2], b[p + 3]]) as usize;
        let weights_len = u16::from_le_bytes([b[p + 4], b[p + 5]]) as usize;
        let data_start = p + 8;
        let ord_start = data_start + doc_ids_len;
        let wt_start = ord_start + ordinals_len;
        let expected_end = wt_start + weights_len;

        if expected_end > b.len() {
            let hex: String = b
                .iter()
                .take(32)
                .map(|x| format!("{x:02x}"))
                .collect::<Vec<_>>()
                .join(" ");
            return Err(crate::Error::Corruption(format!(
                "sparse block sub-block overflow: count={} doc_ids={}B ords={}B wts={}B need={}B have={}B (first_32=[{}])",
                header.count,
                doc_ids_len,
                ordinals_len,
                weights_len,
                expected_end,
                b.len(),
                hex
            )));
        }

        Ok(Self {
            header,
            doc_ids_data: data.slice(data_start..ord_start),
            ordinals_data: data.slice(ord_start..wt_start),
            weights_data: data.slice(wt_start..wt_start + weights_len),
        })
    }

    /// Returns a copy of this block with `doc_offset` added to its first
    /// doc id. Deltas are offset-invariant, so the packed payload can be
    /// shared unchanged.
    pub fn with_doc_offset(&self, doc_offset: u32) -> Self {
        Self {
            header: BlockHeader {
                first_doc_id: self.header.first_doc_id + doc_offset,
                ..self.header
            },
            doc_ids_data: self.doc_ids_data.clone(),
            ordinals_data: self.ordinals_data.clone(),
            weights_data: self.weights_data.clone(),
        }
    }
}

/// A posting list stored as a sequence of independently decodable
/// sparse blocks.
#[derive(Debug, Clone)]
pub struct BlockSparsePostingList {
    pub doc_count: u32,
    pub blocks: Vec<SparseBlock>,
}

impl BlockSparsePostingList {
    /// Builds a posting list, splitting the postings into blocks of
    /// `block_size` entries (clamped to at least 16).
    pub fn from_postings_with_block_size(
        postings: &[(DocId, u16, f32)],
        weight_quant: WeightQuantization,
        block_size: usize,
    ) -> io::Result<Self> {
        if postings.is_empty() {
            return Ok(Self {
                doc_count: 0,
                blocks: Vec::new(),
            });
        }

        let block_size = block_size.max(16);
        let mut blocks = Vec::new();
        for chunk in postings.chunks(block_size) {
            blocks.push(SparseBlock::from_postings(chunk, weight_quant)?);
        }

        // Count unique documents: multi-value postings repeat a doc id in
        // adjacent entries, but it should only be counted once.
        let mut unique_docs = 1u32;
        for i in 1..postings.len() {
            if postings[i].0 != postings[i - 1].0 {
                unique_docs += 1;
            }
        }

        Ok(Self {
            doc_count: unique_docs,
            blocks,
        })
    }

    /// Builds a posting list with the default `BLOCK_SIZE`.
    pub fn from_postings(
        postings: &[(DocId, u16, f32)],
        weight_quant: WeightQuantization,
    ) -> io::Result<Self> {
        Self::from_postings_with_block_size(postings, weight_quant, BLOCK_SIZE)
    }

    /// Builds a posting list whose block boundaries follow `partition`,
    /// a list of per-block posting counts, instead of a fixed block size.
    pub fn from_postings_with_partition(
        postings: &[(DocId, u16, f32)],
        weight_quant: WeightQuantization,
        partition: &[usize],
    ) -> io::Result<Self> {
        if postings.is_empty() {
            return Ok(Self {
                doc_count: 0,
                blocks: Vec::new(),
            });
        }

        let mut blocks = Vec::with_capacity(partition.len());
        let mut offset = 0;
        for &block_size in partition {
            let end = (offset + block_size).min(postings.len());
            blocks.push(SparseBlock::from_postings(
                &postings[offset..end],
                weight_quant,
            )?);
            offset = end;
        }

        let mut unique_docs = 1u32;
        for i in 1..postings.len() {
            if postings[i].0 != postings[i - 1].0 {
                unique_docs += 1;
            }
        }

        Ok(Self {
            doc_count: unique_docs,
            blocks,
        })
    }

    pub fn doc_count(&self) -> u32 {
        self.doc_count
    }

    pub fn num_blocks(&self) -> usize {
        self.blocks.len()
    }

    pub fn global_max_weight(&self) -> f32 {
        self.blocks
            .iter()
            .map(|b| b.header.max_weight)
            .fold(0.0f32, f32::max)
    }

    pub fn block_max_weight(&self, block_idx: usize) -> Option<f32> {
        self.blocks.get(block_idx).map(|b| b.header.max_weight)
    }

    /// Approximate in-memory footprint of this posting list in bytes.
    pub fn size_bytes(&self) -> usize {
        use std::mem::size_of;

        let header_size = size_of::<u32>() * 2;
        let blocks_size: usize = self
            .blocks
            .iter()
            .map(|b| {
                size_of::<BlockHeader>()
                    + b.doc_ids_data.len()
                    + b.ordinals_data.len()
                    + b.weights_data.len()
            })
            .sum();
        header_size + blocks_size
    }

    pub fn iterator(&self) -> BlockSparsePostingIterator<'_> {
        BlockSparsePostingIterator::new(self)
    }

    /// Serializes all blocks back-to-back, returning the raw bytes plus one
    /// skip entry per block (doc range, byte offset, length, max weight).
    pub fn serialize(&self) -> io::Result<(Vec<u8>, Vec<super::SparseSkipEntry>)> {
        let mut block_data = Vec::new();
        let mut skip_entries = Vec::with_capacity(self.blocks.len());
        let mut offset = 0u64;

        for block in &self.blocks {
            let mut buf = Vec::new();
            block.write(&mut buf)?;
            let length = buf.len() as u32;

            let first_doc = block.header.first_doc_id;
            let doc_ids = block.decode_doc_ids();
            let last_doc = doc_ids.last().copied().unwrap_or(first_doc);

            skip_entries.push(super::SparseSkipEntry::new(
                first_doc,
                last_doc,
                offset,
                length,
                block.header.max_weight,
            ));

            block_data.extend_from_slice(&buf);
            offset += length as u64;
        }

        Ok((block_data, skip_entries))
    }

    /// Rebuilds a posting list from serialized block data and skip entries.
    #[cfg(test)]
    pub fn from_parts(
        doc_count: u32,
        block_data: &[u8],
        skip_entries: &[super::SparseSkipEntry],
    ) -> io::Result<Self> {
        let mut blocks = Vec::with_capacity(skip_entries.len());
        for entry in skip_entries {
            let start = entry.offset as usize;
            let end = start + entry.length as usize;
            blocks.push(SparseBlock::read(&mut std::io::Cursor::new(
                &block_data[start..end],
            ))?);
        }
        Ok(Self { doc_count, blocks })
    }

    pub fn decode_all(&self) -> Vec<(DocId, u16, f32)> {
        let total_postings: usize = self.blocks.iter().map(|b| b.header.count as usize).sum();
        let mut result = Vec::with_capacity(total_postings);
        for block in &self.blocks {
            let doc_ids = block.decode_doc_ids();
            let ordinals = block.decode_ordinals();
            let weights = block.decode_weights();
            for i in 0..block.header.count as usize {
                result.push((doc_ids[i], ordinals[i], weights[i]));
            }
        }
        result
    }

    /// Concatenates posting lists from multiple segments, shifting each
    /// list's doc ids by its offset. Blocks are reused (only the header's
    /// first doc id changes), not re-encoded, so the inputs are assumed to
    /// be ordered by doc offset.
    pub fn merge_with_offsets(lists: &[(&BlockSparsePostingList, u32)]) -> Self {
        if lists.is_empty() {
            return Self {
                doc_count: 0,
                blocks: Vec::new(),
            };
        }

        let total_blocks: usize = lists.iter().map(|(pl, _)| pl.blocks.len()).sum();
        let total_docs: u32 = lists.iter().map(|(pl, _)| pl.doc_count).sum();

        let mut merged_blocks = Vec::with_capacity(total_blocks);

        for (posting_list, doc_offset) in lists {
            for block in &posting_list.blocks {
                merged_blocks.push(block.with_doc_offset(*doc_offset));
            }
        }

        Self {
            doc_count: total_docs,
            blocks: merged_blocks,
        }
    }

    /// Returns the index of the last block whose first doc id is <= target,
    /// i.e. the only block that could contain `target`.
    fn find_block(&self, target: DocId) -> Option<usize> {
        if self.blocks.is_empty() {
            return None;
        }
        let idx = self
            .blocks
            .partition_point(|b| b.header.first_doc_id <= target);
        if idx == 0 {
            // Target precedes the first block; start the scan there.
            Some(0)
        } else {
            Some(idx - 1)
        }
    }
}

/// Cursor over a `BlockSparsePostingList` that decodes one block at a time.
pub struct BlockSparsePostingIterator<'a> {
    posting_list: &'a BlockSparsePostingList,
    block_idx: usize,
    in_block_idx: usize,
    current_doc_ids: Vec<DocId>,
    current_ordinals: Vec<u16>,
    current_weights: Vec<f32>,
    /// Ordinals are decoded lazily, only when `ordinal()` is first called.
    ordinals_decoded: bool,
    exhausted: bool,
}

impl<'a> BlockSparsePostingIterator<'a> {
    fn new(posting_list: &'a BlockSparsePostingList) -> Self {
        let mut iter = Self {
            posting_list,
            block_idx: 0,
            in_block_idx: 0,
            current_doc_ids: Vec::with_capacity(128),
            current_ordinals: Vec::with_capacity(128),
            current_weights: Vec::with_capacity(128),
            ordinals_decoded: false,
            exhausted: posting_list.blocks.is_empty(),
        };
        if !iter.exhausted {
            iter.load_block(0);
        }
        iter
    }

    fn load_block(&mut self, block_idx: usize) {
        if let Some(block) = self.posting_list.blocks.get(block_idx) {
            block.decode_doc_ids_into(&mut self.current_doc_ids);
            block.decode_weights_into(&mut self.current_weights);
            // Ordinals are decoded on demand by `ensure_ordinals_decoded`.
            self.ordinals_decoded = false;
            self.block_idx = block_idx;
            self.in_block_idx = 0;
        }
    }

    #[inline]
    fn ensure_ordinals_decoded(&mut self) {
        if !self.ordinals_decoded {
            if let Some(block) = self.posting_list.blocks.get(self.block_idx) {
                block.decode_ordinals_into(&mut self.current_ordinals);
            }
            self.ordinals_decoded = true;
        }
    }

    #[inline]
    pub fn doc(&self) -> DocId {
        if self.exhausted {
            TERMINATED
        } else {
            self.current_doc_ids[self.in_block_idx]
        }
    }

    #[inline]
    pub fn weight(&self) -> f32 {
        if self.exhausted {
            return 0.0;
        }
        self.current_weights[self.in_block_idx]
    }

    #[inline]
    pub fn ordinal(&mut self) -> u16 {
        if self.exhausted {
            return 0;
        }
        self.ensure_ordinals_decoded();
        self.current_ordinals[self.in_block_idx]
    }

    pub fn advance(&mut self) -> DocId {
        if self.exhausted {
            return TERMINATED;
        }
        self.in_block_idx += 1;
        if self.in_block_idx >= self.current_doc_ids.len() {
            self.block_idx += 1;
            if self.block_idx >= self.posting_list.blocks.len() {
                self.exhausted = true;
            } else {
                self.load_block(self.block_idx);
            }
        }
        self.doc()
    }

    pub fn seek(&mut self, target: DocId) -> DocId {
        if self.exhausted {
            return TERMINATED;
        }
        if self.doc() >= target {
            return self.doc();
        }

        // Fast path: the target lies within the currently loaded block.
        if let Some(&last_doc) = self.current_doc_ids.last()
            && last_doc >= target
        {
            let remaining = &self.current_doc_ids[self.in_block_idx..];
            let pos = crate::structures::simd::find_first_ge_u32(remaining, target);
            self.in_block_idx += pos;
            if self.in_block_idx >= self.current_doc_ids.len() {
                self.block_idx += 1;
                if self.block_idx >= self.posting_list.blocks.len() {
                    self.exhausted = true;
                } else {
                    self.load_block(self.block_idx);
                }
            }
            return self.doc();
        }

        // Slow path: binary-search for the block that could contain the
        // target, then scan within it.
        if let Some(block_idx) = self.posting_list.find_block(target) {
            self.load_block(block_idx);
            let pos = crate::structures::simd::find_first_ge_u32(&self.current_doc_ids, target);
            self.in_block_idx = pos;
            if self.in_block_idx >= self.current_doc_ids.len() {
                self.block_idx += 1;
                if self.block_idx >= self.posting_list.blocks.len() {
                    self.exhausted = true;
                } else {
                    self.load_block(self.block_idx);
                }
            }
        } else {
            self.exhausted = true;
        }
        self.doc()
    }

    /// Jumps to the first document of the next block; block-max pruning
    /// uses this to skip blocks whose maximum contribution cannot compete.
    pub fn skip_to_next_block(&mut self) -> DocId {
        if self.exhausted {
            return TERMINATED;
        }
        let next = self.block_idx + 1;
        if next >= self.posting_list.blocks.len() {
            self.exhausted = true;
            return TERMINATED;
        }
        self.load_block(next);
        self.doc()
    }

    pub fn is_exhausted(&self) -> bool {
        self.exhausted
    }

    pub fn current_block_max_weight(&self) -> f32 {
        self.posting_list
            .blocks
            .get(self.block_idx)
            .map(|b| b.header.max_weight)
            .unwrap_or(0.0)
    }

    pub fn current_block_max_contribution(&self, query_weight: f32) -> f32 {
        query_weight * self.current_block_max_weight()
    }
}

/// Returns the minimal bit width needed to represent the largest value.
fn find_optimal_bit_width(values: &[u32]) -> u8 {
    if values.is_empty() {
        return 0;
    }
    let max_val = values.iter().copied().max().unwrap_or(0);
    simd::bits_needed(max_val)
}

fn bits_needed_u16(val: u16) -> u8 {
    if val == 0 {
        0
    } else {
        16 - val.leading_zeros() as u8
    }
}

/// Encodes weights according to `quant`. The UInt8 and UInt4 encodings
/// store an f32 scale and f32 minimum before the quantized values.
fn encode_weights(weights: &[f32], quant: WeightQuantization) -> io::Result<Vec<u8>> {
    let mut data = Vec::new();
    match quant {
        WeightQuantization::Float32 => {
            for &w in weights {
                data.write_f32::<LittleEndian>(w)?;
            }
        }
        WeightQuantization::Float16 => {
            use half::f16;
            for &w in weights {
                data.write_u16::<LittleEndian>(f16::from_f32(w).to_bits())?;
            }
        }
        WeightQuantization::UInt8 => {
            let min = weights.iter().copied().fold(f32::INFINITY, f32::min);
            let max = weights.iter().copied().fold(f32::NEG_INFINITY, f32::max);
            let range = max - min;
            let scale = if range < f32::EPSILON {
                1.0
            } else {
                range / 255.0
            };
            data.write_f32::<LittleEndian>(scale)?;
            data.write_f32::<LittleEndian>(min)?;
            for &w in weights {
                data.write_u8(((w - min) / scale).round() as u8)?;
            }
        }
        WeightQuantization::UInt4 => {
            let min = weights.iter().copied().fold(f32::INFINITY, f32::min);
            let max = weights.iter().copied().fold(f32::NEG_INFINITY, f32::max);
            let range = max - min;
            let scale = if range < f32::EPSILON {
                1.0
            } else {
                range / 15.0
            };
            data.write_f32::<LittleEndian>(scale)?;
            data.write_f32::<LittleEndian>(min)?;
            // Pack two 4-bit values per byte, low nibble first.
            let mut i = 0;
            while i < weights.len() {
                let q1 = ((weights[i] - min) / scale).round() as u8 & 0x0F;
                let q2 = if i + 1 < weights.len() {
                    ((weights[i + 1] - min) / scale).round() as u8 & 0x0F
                } else {
                    0
                };
                data.write_u8((q2 << 4) | q1)?;
                i += 2;
            }
        }
    }
    Ok(data)
}

/// Decodes `count` weights from `data` into `out`, dequantizing according
/// to `quant`.
fn decode_weights_into(data: &[u8], quant: WeightQuantization, count: usize, out: &mut Vec<f32>) {
    let mut cursor = Cursor::new(data);
    match quant {
        WeightQuantization::Float32 => {
            for _ in 0..count {
                out.push(cursor.read_f32::<LittleEndian>().unwrap_or(0.0));
            }
        }
        WeightQuantization::Float16 => {
            use half::f16;
            for _ in 0..count {
                let bits = cursor.read_u16::<LittleEndian>().unwrap_or(0);
                out.push(f16::from_bits(bits).to_f32());
            }
        }
        WeightQuantization::UInt8 => {
            let scale = cursor.read_f32::<LittleEndian>().unwrap_or(1.0);
            let min_val = cursor.read_f32::<LittleEndian>().unwrap_or(0.0);
            let offset = cursor.position() as usize;
            out.resize(count, 0.0);
            simd::dequantize_uint8(&data[offset..], out, scale, min_val, count);
        }
        WeightQuantization::UInt4 => {
            let scale = cursor.read_f32::<LittleEndian>().unwrap_or(1.0);
            let min = cursor.read_f32::<LittleEndian>().unwrap_or(0.0);
            let mut i = 0;
            while i < count {
                let byte = cursor.read_u8().unwrap_or(0);
                out.push((byte & 0x0F) as f32 * scale + min);
                i += 1;
                if i < count {
                    out.push((byte >> 4) as f32 * scale + min);
                    i += 1;
                }
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_block_roundtrip() {
        let postings = vec![
            (10u32, 0u16, 1.5f32),
            (15, 0, 2.0),
            (20, 1, 0.5),
            (100, 0, 3.0),
        ];
        let block = SparseBlock::from_postings(&postings, WeightQuantization::Float32).unwrap();

        assert_eq!(block.decode_doc_ids(), vec![10, 15, 20, 100]);
        assert_eq!(block.decode_ordinals(), vec![0, 0, 1, 0]);
        let weights = block.decode_weights();
        assert!((weights[0] - 1.5).abs() < 0.01);
    }
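
    // Illustrative sketch (not part of the original suite): the UInt8
    // encoding stores an f32 scale and f32 minimum followed by one byte per
    // weight, so a round-trip should be accurate to roughly half a
    // quantization step (scale / 2).
    #[test]
    fn test_uint8_quantization_roundtrip_sketch() {
        let weights = [0.0f32, 1.0, 2.5, 10.0];
        let postings: Vec<(DocId, u16, f32)> = weights
            .iter()
            .enumerate()
            .map(|(i, &w)| (i as DocId, 0u16, w))
            .collect();
        let block = SparseBlock::from_postings(&postings, WeightQuantization::UInt8).unwrap();

        // scale = (max - min) / 255 for this data.
        let scale = 10.0f32 / 255.0;
        for (orig, dec) in weights.iter().zip(block.decode_weights().iter()) {
            assert!(
                (orig - dec).abs() <= scale / 2.0 + 1e-6,
                "UInt8 roundtrip drifted too far: {} vs {}",
                orig,
                dec
            );
        }
    }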

    #[test]
    fn test_posting_list() {
        let postings: Vec<(DocId, u16, f32)> =
            (0..300).map(|i| (i * 2, 0, i as f32 * 0.1)).collect();
        let list =
            BlockSparsePostingList::from_postings(&postings, WeightQuantization::Float32).unwrap();

        assert_eq!(list.doc_count(), 300);
        assert_eq!(list.num_blocks(), 3);

        let mut iter = list.iterator();
        assert_eq!(iter.doc(), 0);
        iter.advance();
        assert_eq!(iter.doc(), 2);
    }
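
    // Illustrative sketch (not part of the original suite):
    // `skip_to_next_block` jumps straight to the first document of the
    // following block, which is what a block-max pruning loop uses to skip
    // blocks whose max contribution cannot beat the current threshold.
    #[test]
    fn test_skip_to_next_block_sketch() {
        // 300 postings with BLOCK_SIZE = 128 produce blocks starting at
        // postings[0], postings[128] and postings[256].
        let postings: Vec<(DocId, u16, f32)> = (0..300).map(|i| (i, 0, 1.0)).collect();
        let list =
            BlockSparsePostingList::from_postings(&postings, WeightQuantization::Float32).unwrap();
        assert_eq!(list.num_blocks(), 3);

        let mut iter = list.iterator();
        assert_eq!(iter.doc(), 0);
        assert_eq!(iter.skip_to_next_block(), 128);
        assert_eq!(iter.skip_to_next_block(), 256);
        assert_eq!(iter.skip_to_next_block(), TERMINATED);
        assert!(iter.is_exhausted());
    }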

    #[test]
    fn test_serialization() {
        let postings = vec![(1u32, 0u16, 0.5f32), (10, 1, 1.5), (100, 0, 2.5)];
        let list =
            BlockSparsePostingList::from_postings(&postings, WeightQuantization::UInt8).unwrap();

        let (block_data, skip_entries) = list.serialize().unwrap();
        let list2 =
            BlockSparsePostingList::from_parts(list.doc_count(), &block_data, &skip_entries)
                .unwrap();

        assert_eq!(list.doc_count(), list2.doc_count());
    }
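
    // Illustrative sketch (not part of the original suite): scoring a block
    // into a flat accumulator indexed by `doc_id - base_doc`, with
    // first-touched documents recorded in `dirty`.
    #[test]
    fn test_accumulate_scored_weights_sketch() {
        let postings = vec![(10u32, 0u16, 1.0f32), (12, 0, 2.0)];
        let block = SparseBlock::from_postings(&postings, WeightQuantization::Float32).unwrap();

        let doc_ids = block.decode_doc_ids();
        let mut flat_scores = vec![0.0f32; 16];
        let mut dirty = Vec::new();
        let scored =
            block.accumulate_scored_weights(2.0, &doc_ids, &mut flat_scores, 0, &mut dirty);

        assert_eq!(scored, 2);
        assert_eq!(dirty, vec![10, 12]);
        assert!((flat_scores[10] - 2.0).abs() < 0.01);
        assert!((flat_scores[12] - 4.0).abs() < 0.01);
    }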

    #[test]
    fn test_seek() {
        let postings: Vec<(DocId, u16, f32)> = (0..500).map(|i| (i * 3, 0, i as f32)).collect();
        let list =
            BlockSparsePostingList::from_postings(&postings, WeightQuantization::Float32).unwrap();

        let mut iter = list.iterator();
        assert_eq!(iter.seek(300), 300);
        assert_eq!(iter.seek(301), 303);
        assert_eq!(iter.seek(2000), TERMINATED);
    }
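
    // Illustrative sketch (not part of the original suite): for UInt8
    // blocks, `decode_scored_weights_into` folds the query weight into the
    // dequantization scale and bias, so it should agree with decoding the
    // raw weights and multiplying afterwards.
    #[test]
    fn test_decode_scored_weights_sketch() {
        let postings = vec![(1u32, 0u16, 0.5f32), (4, 0, 1.5), (9, 0, 3.0)];
        let block = SparseBlock::from_postings(&postings, WeightQuantization::UInt8).unwrap();

        let query_weight = 2.0f32;
        let mut scored = Vec::new();
        block.decode_scored_weights_into(query_weight, &mut scored);

        let plain = block.decode_weights();
        for (s, p) in scored.iter().zip(plain.iter()) {
            assert!(
                (s - p * query_weight).abs() < 0.01,
                "scored {} vs scaled {}",
                s,
                p * query_weight
            );
        }
    }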

    #[test]
    fn test_merge_with_offsets() {
        let postings1: Vec<(DocId, u16, f32)> = vec![(0, 0, 1.0), (5, 0, 2.0), (10, 1, 3.0)];
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();

        let postings2: Vec<(DocId, u16, f32)> = vec![(0, 0, 4.0), (3, 1, 5.0), (7, 0, 6.0)];
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 100)]);

        assert_eq!(merged.doc_count(), 6);

        let decoded = merged.decode_all();
        assert_eq!(decoded.len(), 6);

        // Segment 1 doc ids are unchanged.
        assert_eq!(decoded[0].0, 0);
        assert_eq!(decoded[1].0, 5);
        assert_eq!(decoded[2].0, 10);

        // Segment 2 doc ids are shifted by the offset of 100.
        assert_eq!(decoded[3].0, 100);
        assert_eq!(decoded[4].0, 103);
        assert_eq!(decoded[5].0, 107);

        // Weights and ordinals survive the merge untouched.
        assert!((decoded[0].2 - 1.0).abs() < 0.01);
        assert!((decoded[3].2 - 4.0).abs() < 0.01);

        assert_eq!(decoded[2].1, 1);
        assert_eq!(decoded[4].1, 1);
    }

    #[test]
    fn test_merge_with_offsets_multi_block() {
        let postings1: Vec<(DocId, u16, f32)> = (0..200).map(|i| (i * 2, 0, i as f32)).collect();
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();
        assert!(list1.num_blocks() > 1, "Should have multiple blocks");

        let postings2: Vec<(DocId, u16, f32)> = (0..150).map(|i| (i * 3, 1, i as f32)).collect();
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 1000)]);

        assert_eq!(merged.doc_count(), 350);
        assert_eq!(merged.num_blocks(), list1.num_blocks() + list2.num_blocks());

        let mut iter = merged.iterator();

        assert_eq!(iter.doc(), 0);

        // Seeking across the segment boundary must land on segment 2.
        let doc = iter.seek(1000);
        assert_eq!(doc, 1000);
        iter.advance();
        assert_eq!(iter.doc(), 1003);
    }
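
    // Illustrative sketch (not part of the original suite): a custom
    // partition controls block boundaries directly, and decoding must be
    // lossless regardless of how the postings are split.
    #[test]
    fn test_custom_partition_sketch() {
        let postings: Vec<(DocId, u16, f32)> = (0..10).map(|i| (i * 5, 0, i as f32)).collect();
        let list = BlockSparsePostingList::from_postings_with_partition(
            &postings,
            WeightQuantization::Float32,
            &[4, 6],
        )
        .unwrap();

        assert_eq!(list.num_blocks(), 2);
        assert_eq!(list.doc_count(), 10);
        assert_eq!(list.decode_all(), postings);
    }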

    #[test]
    fn test_merge_with_offsets_serialize_roundtrip() {
        let postings1: Vec<(DocId, u16, f32)> = vec![(0, 0, 1.0), (5, 0, 2.0), (10, 1, 3.0)];
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();

        let postings2: Vec<(DocId, u16, f32)> = vec![(0, 0, 4.0), (3, 1, 5.0), (7, 0, 6.0)];
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 100)]);

        let (block_data, skip_entries) = merged.serialize().unwrap();
        let loaded =
            BlockSparsePostingList::from_parts(merged.doc_count(), &block_data, &skip_entries)
                .unwrap();

        let decoded = loaded.decode_all();
        assert_eq!(decoded.len(), 6);

        assert_eq!(decoded[0].0, 0);
        assert_eq!(decoded[1].0, 5);
        assert_eq!(decoded[2].0, 10);

        assert_eq!(decoded[3].0, 100, "First doc of seg2 should be 0+100=100");
        assert_eq!(decoded[4].0, 103, "Second doc of seg2 should be 3+100=103");
        assert_eq!(decoded[5].0, 107, "Third doc of seg2 should be 7+100=107");

        let mut iter = loaded.iterator();
        assert_eq!(iter.doc(), 0);
        iter.advance();
        assert_eq!(iter.doc(), 5);
        iter.advance();
        assert_eq!(iter.doc(), 10);
        iter.advance();
        assert_eq!(iter.doc(), 100);
        iter.advance();
        assert_eq!(iter.doc(), 103);
        iter.advance();
        assert_eq!(iter.doc(), 107);
    }

    #[test]
    fn test_merge_seek_after_roundtrip() {
        let postings1: Vec<(DocId, u16, f32)> = (0..200).map(|i| (i * 2, 0, 1.0)).collect();
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();

        let postings2: Vec<(DocId, u16, f32)> = (0..150).map(|i| (i * 3, 0, 2.0)).collect();
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 1000)]);

        let (block_data, skip_entries) = merged.serialize().unwrap();
        let loaded =
            BlockSparsePostingList::from_parts(merged.doc_count(), &block_data, &skip_entries)
                .unwrap();

        let mut iter = loaded.iterator();

        let doc = iter.seek(100);
        assert_eq!(doc, 100, "Seek to 100 in segment 1");

        let doc = iter.seek(1000);
        assert_eq!(doc, 1000, "Seek to 1000 (first doc of segment 2)");

        let doc = iter.seek(1050);
        assert!(
            doc >= 1050,
            "Seek to 1050 should find doc >= 1050, got {}",
            doc
        );

        let doc = iter.seek(500);
        assert!(
            doc >= 1050,
            "Seek backwards should not go back, got {}",
            doc
        );

        let mut iter2 = loaded.iterator();

        let mut count = 0;
        let mut prev_doc = 0;
        while iter2.doc() != super::TERMINATED {
            let current = iter2.doc();
            if count > 0 {
                assert!(
                    current > prev_doc,
                    "Docs should be monotonically increasing: {} vs {}",
                    prev_doc,
                    current
                );
            }
            prev_doc = current;
            iter2.advance();
            count += 1;
        }
        assert_eq!(count, 350, "Should have 350 total docs");
    }
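
    // Illustrative sketch (not part of the original suite): Float16 halves
    // weight storage; values exactly representable in f16 (1.5, 2.5, 0.25)
    // round-trip without loss.
    #[test]
    fn test_float16_roundtrip_sketch() {
        let postings = vec![(0u32, 0u16, 1.5f32), (3, 0, 2.5), (7, 0, 0.25)];
        let block = SparseBlock::from_postings(&postings, WeightQuantization::Float16).unwrap();
        assert_eq!(block.decode_weights(), vec![1.5, 2.5, 0.25]);
    }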

    #[test]
    fn test_doc_count_multi_value() {
        // Six postings over three unique documents (0, 5, 10).
        let postings: Vec<(DocId, u16, f32)> = vec![
            (0, 0, 1.0),
            (0, 1, 1.5),
            (0, 2, 2.0),
            (5, 0, 3.0),
            (5, 1, 3.5),
            (10, 0, 4.0),
        ];
        let list =
            BlockSparsePostingList::from_postings(&postings, WeightQuantization::Float32).unwrap();

        assert_eq!(list.doc_count(), 3);

        let decoded = list.decode_all();
        assert_eq!(decoded.len(), 6);
    }

    // Simulates the zero-copy merge path: raw serialized blocks from one
    // segment are appended to another, and only the `first_doc_id` field
    // inside each block header is patched in place.
    #[test]
    fn test_zero_copy_merge_patches_first_doc_id() {
        use crate::structures::SparseSkipEntry;

        let postings1: Vec<(DocId, u16, f32)> = (0..200).map(|i| (i * 2, 0, i as f32)).collect();
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();
        assert!(list1.num_blocks() > 1);

        let postings2: Vec<(DocId, u16, f32)> = (0..150).map(|i| (i * 3, 1, i as f32)).collect();
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        let (raw1, skip1) = list1.serialize().unwrap();
        let (raw2, skip2) = list2.serialize().unwrap();

        let doc_offset: u32 = 1000;
        let total_docs = list1.doc_count() + list2.doc_count();

        // Rebuild the skip list: segment 2 entries get shifted doc ids and
        // byte offsets relative to the concatenated block data.
        let mut merged_skip = Vec::new();
        let mut cumulative_offset = 0u64;
        for entry in &skip1 {
            merged_skip.push(SparseSkipEntry::new(
                entry.first_doc,
                entry.last_doc,
                cumulative_offset + entry.offset,
                entry.length,
                entry.max_weight,
            ));
        }
        if let Some(last) = skip1.last() {
            cumulative_offset += last.offset + last.length as u64;
        }
        for entry in &skip2 {
            merged_skip.push(SparseSkipEntry::new(
                entry.first_doc + doc_offset,
                entry.last_doc + doc_offset,
                cumulative_offset + entry.offset,
                entry.length,
                entry.max_weight,
            ));
        }

        let mut merged_block_data = Vec::new();
        merged_block_data.extend_from_slice(&raw1);

        // `first_doc_id` sits at byte 8 of each serialized block header.
        const FIRST_DOC_ID_OFFSET: usize = 8;
        let mut buf2 = raw2.to_vec();
        for entry in &skip2 {
            let off = entry.offset as usize + FIRST_DOC_ID_OFFSET;
            if off + 4 <= buf2.len() {
                let old = u32::from_le_bytes(buf2[off..off + 4].try_into().unwrap());
                let patched = (old + doc_offset).to_le_bytes();
                buf2[off..off + 4].copy_from_slice(&patched);
            }
        }
        merged_block_data.extend_from_slice(&buf2);

        let loaded =
            BlockSparsePostingList::from_parts(total_docs, &merged_block_data, &merged_skip)
                .unwrap();
        assert_eq!(loaded.doc_count(), 350);

        let mut iter = loaded.iterator();

        assert_eq!(iter.doc(), 0);
        let doc = iter.seek(100);
        assert_eq!(doc, 100);
        let doc = iter.seek(398);
        assert_eq!(doc, 398);

        let doc = iter.seek(1000);
        assert_eq!(doc, 1000, "First doc of segment 2 should be 1000");
        iter.advance();
        assert_eq!(iter.doc(), 1003, "Second doc of segment 2 should be 1003");
        let doc = iter.seek(1447);
        assert_eq!(doc, 1447, "Last doc of segment 2 should be 1447");

        iter.advance();
        assert_eq!(iter.doc(), super::TERMINATED);

        // The patched zero-copy merge must match the reference merge.
        let reference =
            BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, doc_offset)]);
        let mut ref_iter = reference.iterator();
        let mut zc_iter = loaded.iterator();
        while ref_iter.doc() != super::TERMINATED {
            assert_eq!(
                ref_iter.doc(),
                zc_iter.doc(),
                "Zero-copy and reference merge should produce identical doc_ids"
            );
            assert!(
                (ref_iter.weight() - zc_iter.weight()).abs() < 0.01,
                "Weights should match: {} vs {}",
                ref_iter.weight(),
                zc_iter.weight()
            );
            ref_iter.advance();
            zc_iter.advance();
        }
        assert_eq!(zc_iter.doc(), super::TERMINATED);
    }
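
    // Illustrative sketch (not part of the original suite):
    // `from_owned_bytes` parses one serialized block in place, so the
    // decoded content must match what `write` produced.
    #[test]
    fn test_from_owned_bytes_sketch() {
        let postings = vec![(2u32, 0u16, 1.0f32), (7, 1, 2.0)];
        let block = SparseBlock::from_postings(&postings, WeightQuantization::Float32).unwrap();
        let mut buf = Vec::new();
        block.write(&mut buf).unwrap();

        let parsed = SparseBlock::from_owned_bytes(OwnedBytes::new(buf)).unwrap();
        assert_eq!(parsed.decode_doc_ids(), vec![2, 7]);
        assert_eq!(parsed.decode_ordinals(), vec![0, 1]);
        assert_eq!(parsed.decode_weights(), vec![1.0, 2.0]);
    }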

    #[test]
    fn test_doc_count_single_value() {
        let postings: Vec<(DocId, u16, f32)> =
            vec![(0, 0, 1.0), (5, 0, 2.0), (10, 0, 3.0), (15, 0, 4.0)];
        let list =
            BlockSparsePostingList::from_postings(&postings, WeightQuantization::Float32).unwrap();

        assert_eq!(list.doc_count(), 4);
    }

    #[test]
    fn test_doc_count_multi_value_serialization_roundtrip() {
        let postings: Vec<(DocId, u16, f32)> =
            vec![(0, 0, 1.0), (0, 1, 1.5), (5, 0, 2.0), (5, 1, 2.5)];
        let list =
            BlockSparsePostingList::from_postings(&postings, WeightQuantization::Float32).unwrap();
        assert_eq!(list.doc_count(), 2);

        let (block_data, skip_entries) = list.serialize().unwrap();
        let loaded =
            BlockSparsePostingList::from_parts(list.doc_count(), &block_data, &skip_entries)
                .unwrap();
        assert_eq!(loaded.doc_count(), 2);
    }
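
    // Illustrative sketch (not part of the original suite):
    // `BlockHeader::SIZE` must match the serialized layout
    // (2 + 1 + 1 + 1 + 1 + 2 + 4 + 4 = 16 bytes), because
    // `from_owned_bytes` slices the payload at that fixed offset.
    #[test]
    fn test_block_header_size_sketch() {
        let header = BlockHeader {
            count: 3,
            doc_id_bits: 8,
            ordinal_bits: 0,
            weight_quant: WeightQuantization::Float32,
            first_doc_id: 42,
            max_weight: 1.5,
        };
        let mut buf = Vec::new();
        header.write(&mut buf).unwrap();
        assert_eq!(buf.len(), BlockHeader::SIZE);

        let decoded = BlockHeader::read(&mut std::io::Cursor::new(buf.as_slice())).unwrap();
        assert_eq!(decoded.count, 3);
        assert_eq!(decoded.first_doc_id, 42);
        assert!((decoded.max_weight - 1.5).abs() < f32::EPSILON);
    }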

    #[test]
    fn test_merge_preserves_weights_and_ordinals() {
        let postings1: Vec<(DocId, u16, f32)> = vec![(0, 0, 1.5), (5, 1, 2.5), (10, 2, 3.5)];
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();

        let postings2: Vec<(DocId, u16, f32)> = vec![(0, 0, 4.5), (3, 1, 5.5), (7, 3, 6.5)];
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 100)]);

        let (block_data, skip_entries) = merged.serialize().unwrap();
        let loaded =
            BlockSparsePostingList::from_parts(merged.doc_count(), &block_data, &skip_entries)
                .unwrap();

        let mut iter = loaded.iterator();

        assert_eq!(iter.doc(), 0);
        assert!(
            (iter.weight() - 1.5).abs() < 0.01,
            "Weight should be 1.5, got {}",
            iter.weight()
        );
        assert_eq!(iter.ordinal(), 0);

        iter.advance();
        assert_eq!(iter.doc(), 5);
        assert!(
            (iter.weight() - 2.5).abs() < 0.01,
            "Weight should be 2.5, got {}",
            iter.weight()
        );
        assert_eq!(iter.ordinal(), 1);

        iter.advance();
        assert_eq!(iter.doc(), 10);
        assert!(
            (iter.weight() - 3.5).abs() < 0.01,
            "Weight should be 3.5, got {}",
            iter.weight()
        );
        assert_eq!(iter.ordinal(), 2);

        iter.advance();
        assert_eq!(iter.doc(), 100);
        assert!(
            (iter.weight() - 4.5).abs() < 0.01,
            "Weight should be 4.5, got {}",
            iter.weight()
        );
        assert_eq!(iter.ordinal(), 0);

        iter.advance();
        assert_eq!(iter.doc(), 103);
        assert!(
            (iter.weight() - 5.5).abs() < 0.01,
            "Weight should be 5.5, got {}",
            iter.weight()
        );
        assert_eq!(iter.ordinal(), 1);

        iter.advance();
        assert_eq!(iter.doc(), 107);
        assert!(
            (iter.weight() - 6.5).abs() < 0.01,
            "Weight should be 6.5, got {}",
            iter.weight()
        );
        assert_eq!(iter.ordinal(), 3);

        iter.advance();
        assert_eq!(iter.doc(), super::TERMINATED);
    }

    #[test]
    fn test_merge_global_max_weight() {
        let postings1: Vec<(DocId, u16, f32)> = vec![
            (0, 0, 3.0),
            (1, 0, 7.0), // global maximum across both lists
            (2, 0, 2.0),
        ];
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();

        let postings2: Vec<(DocId, u16, f32)> = vec![
            (0, 0, 5.0),
            (1, 0, 4.0),
            (2, 0, 6.0), // maximum of list2
        ];
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        assert!((list1.global_max_weight() - 7.0).abs() < 0.01);
        assert!((list2.global_max_weight() - 6.0).abs() < 0.01);

        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 100)]);

        assert!(
            (merged.global_max_weight() - 7.0).abs() < 0.01,
            "Global max should be 7.0, got {}",
            merged.global_max_weight()
        );

        let (block_data, skip_entries) = merged.serialize().unwrap();
        let loaded =
            BlockSparsePostingList::from_parts(merged.doc_count(), &block_data, &skip_entries)
                .unwrap();

        assert!(
            (loaded.global_max_weight() - 7.0).abs() < 0.01,
            "After roundtrip, global max should still be 7.0, got {}",
            loaded.global_max_weight()
        );
    }

    #[test]
    fn test_scoring_simulation_after_merge() {
        let postings1: Vec<(DocId, u16, f32)> = vec![
            (0, 0, 0.5),
            (5, 0, 0.8),
        ];
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();

        let postings2: Vec<(DocId, u16, f32)> = vec![
            (0, 0, 0.6),
            (3, 0, 0.9),
        ];
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 100)]);

        let (block_data, skip_entries) = merged.serialize().unwrap();
        let loaded =
            BlockSparsePostingList::from_parts(merged.doc_count(), &block_data, &skip_entries)
                .unwrap();

        // Score each posting as query_weight * stored weight.
        let query_weight = 2.0f32;
        let mut iter = loaded.iterator();

        assert_eq!(iter.doc(), 0);
        let score = query_weight * iter.weight();
        assert!(
            (score - 1.0).abs() < 0.01,
            "Doc 0 score should be 1.0, got {}",
            score
        );

        iter.advance();
        assert_eq!(iter.doc(), 5);
        let score = query_weight * iter.weight();
        assert!(
            (score - 1.6).abs() < 0.01,
            "Doc 5 score should be 1.6, got {}",
            score
        );

        iter.advance();
        assert_eq!(iter.doc(), 100);
        let score = query_weight * iter.weight();
        assert!(
            (score - 1.2).abs() < 0.01,
            "Doc 100 score should be 1.2, got {}",
            score
        );

        iter.advance();
        assert_eq!(iter.doc(), 103);
        let score = query_weight * iter.weight();
        assert!(
            (score - 1.8).abs() < 0.01,
            "Doc 103 score should be 1.8, got {}",
            score
        );
    }
}