use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use std::io::{self, Cursor, Read, Write};

use super::config::WeightQuantization;
use crate::DocId;
use crate::structures::postings::TERMINATED;
use crate::structures::simd;

/// Maximum number of postings stored in a single block.
pub const BLOCK_SIZE: usize = 128;

/// Fixed-size (16-byte) header describing one encoded block.
#[derive(Debug, Clone, Copy)]
pub struct BlockHeader {
    pub count: u16,
    pub doc_id_bits: u8,
    pub ordinal_bits: u8,
    pub weight_quant: WeightQuantization,
    pub first_doc_id: DocId,
    pub max_weight: f32,
}

impl BlockHeader {
    pub const SIZE: usize = 16;

    pub fn write<W: Write>(&self, w: &mut W) -> io::Result<()> {
        w.write_u16::<LittleEndian>(self.count)?;
        w.write_u8(self.doc_id_bits)?;
        w.write_u8(self.ordinal_bits)?;
        w.write_u8(self.weight_quant as u8)?;
        // Reserved padding bytes keep the header at exactly SIZE bytes.
        w.write_u8(0)?;
        w.write_u16::<LittleEndian>(0)?;
        w.write_u32::<LittleEndian>(self.first_doc_id)?;
        w.write_f32::<LittleEndian>(self.max_weight)?;
        Ok(())
    }

    pub fn read<R: Read>(r: &mut R) -> io::Result<Self> {
        let count = r.read_u16::<LittleEndian>()?;
        let doc_id_bits = r.read_u8()?;
        let ordinal_bits = r.read_u8()?;
        let weight_quant_byte = r.read_u8()?;
        // Skip the reserved padding bytes.
        let _ = r.read_u8()?;
        let _ = r.read_u16::<LittleEndian>()?;
        let first_doc_id = r.read_u32::<LittleEndian>()?;
        let max_weight = r.read_f32::<LittleEndian>()?;

        let weight_quant = WeightQuantization::from_u8(weight_quant_byte)
            .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Invalid weight quant"))?;

        Ok(Self {
            count,
            doc_id_bits,
            ordinal_bits,
            weight_quant,
            first_doc_id,
            max_weight,
        })
    }
}

#[derive(Debug, Clone)]
pub struct SparseBlock {
    pub header: BlockHeader,
    pub doc_ids_data: Vec<u8>,
    pub ordinals_data: Vec<u8>,
    pub weights_data: Vec<u8>,
}

impl SparseBlock {
    pub fn from_postings(
        postings: &[(DocId, u16, f32)],
        weight_quant: WeightQuantization,
    ) -> io::Result<Self> {
        assert!(!postings.is_empty() && postings.len() <= BLOCK_SIZE);

        let count = postings.len();
        let first_doc_id = postings[0].0;

        // Delta-encode doc ids; the first doc id lives in the header, so its
        // delta slot is zero and only deltas[1..] are bit-packed.
        let mut deltas = Vec::with_capacity(count);
        let mut prev = first_doc_id;
        for &(doc_id, _, _) in postings {
            deltas.push(doc_id.saturating_sub(prev));
            prev = doc_id;
        }
        deltas[0] = 0;

        let doc_id_bits = find_optimal_bit_width(&deltas[1..]);
        let ordinals: Vec<u16> = postings.iter().map(|(_, o, _)| *o).collect();
        let max_ordinal = ordinals.iter().copied().max().unwrap_or(0);
        let ordinal_bits = if max_ordinal == 0 {
            0
        } else {
            bits_needed_u16(max_ordinal)
        };

        let weights: Vec<f32> = postings.iter().map(|(_, _, w)| *w).collect();
        let max_weight = weights.iter().copied().fold(0.0f32, f32::max);

        let doc_ids_data = pack_bit_array(&deltas[1..], doc_id_bits);
        let ordinals_data = if ordinal_bits > 0 {
            pack_bit_array_u16(&ordinals, ordinal_bits)
        } else {
            Vec::new()
        };
        let weights_data = encode_weights(&weights, weight_quant)?;

        Ok(Self {
            header: BlockHeader {
                count: count as u16,
                doc_id_bits,
                ordinal_bits,
                weight_quant,
                first_doc_id,
                max_weight,
            },
            doc_ids_data,
            ordinals_data,
            weights_data,
        })
    }

    pub fn decode_doc_ids(&self) -> Vec<DocId> {
        let count = self.header.count as usize;
        let mut doc_ids = Vec::with_capacity(count);
        doc_ids.push(self.header.first_doc_id);

        if count > 1 {
            let deltas = unpack_bit_array(&self.doc_ids_data, self.header.doc_id_bits, count - 1);
            let mut prev = self.header.first_doc_id;
            for delta in deltas {
                prev += delta;
                doc_ids.push(prev);
            }
        }
        doc_ids
    }

    pub fn decode_ordinals(&self) -> Vec<u16> {
        let count = self.header.count as usize;
        if self.header.ordinal_bits == 0 {
            vec![0u16; count]
        } else {
            unpack_bit_array_u16(&self.ordinals_data, self.header.ordinal_bits, count)
        }
    }

    pub fn decode_weights(&self) -> Vec<f32> {
        decode_weights(
            &self.weights_data,
            self.header.weight_quant,
            self.header.count as usize,
        )
    }

    pub fn write<W: Write>(&self, w: &mut W) -> io::Result<()> {
        self.header.write(w)?;
        w.write_u16::<LittleEndian>(self.doc_ids_data.len() as u16)?;
        w.write_u16::<LittleEndian>(self.ordinals_data.len() as u16)?;
        w.write_u16::<LittleEndian>(self.weights_data.len() as u16)?;
        // Reserved padding.
        w.write_u16::<LittleEndian>(0)?;
        w.write_all(&self.doc_ids_data)?;
        w.write_all(&self.ordinals_data)?;
        w.write_all(&self.weights_data)?;
        Ok(())
    }

    pub fn read<R: Read>(r: &mut R) -> io::Result<Self> {
        let header = BlockHeader::read(r)?;
        let doc_ids_len = r.read_u16::<LittleEndian>()? as usize;
        let ordinals_len = r.read_u16::<LittleEndian>()? as usize;
        let weights_len = r.read_u16::<LittleEndian>()? as usize;
        let _ = r.read_u16::<LittleEndian>()?;

        let mut doc_ids_data = vec![0u8; doc_ids_len];
        r.read_exact(&mut doc_ids_data)?;
        let mut ordinals_data = vec![0u8; ordinals_len];
        r.read_exact(&mut ordinals_data)?;
        let mut weights_data = vec![0u8; weights_len];
        r.read_exact(&mut weights_data)?;

        Ok(Self {
            header,
            doc_ids_data,
            ordinals_data,
            weights_data,
        })
    }

    /// Returns a copy of this block with `doc_offset` added to the first doc
    /// id. Deltas are relative, so the packed data is reused unchanged.
    pub fn with_doc_offset(&self, doc_offset: u32) -> Self {
        Self {
            header: BlockHeader {
                first_doc_id: self.header.first_doc_id + doc_offset,
                ..self.header
            },
            doc_ids_data: self.doc_ids_data.clone(),
            ordinals_data: self.ordinals_data.clone(),
            weights_data: self.weights_data.clone(),
        }
    }
}

/// A posting list stored as a sequence of independently decodable blocks.
#[derive(Debug, Clone)]
pub struct BlockSparsePostingList {
    pub doc_count: u32,
    pub blocks: Vec<SparseBlock>,
}

impl BlockSparsePostingList {
    pub fn from_postings_with_block_size(
        postings: &[(DocId, u16, f32)],
        weight_quant: WeightQuantization,
        block_size: usize,
    ) -> io::Result<Self> {
        if postings.is_empty() {
            return Ok(Self {
                doc_count: 0,
                blocks: Vec::new(),
            });
        }

        // Clamp to a sane minimum block size.
        let block_size = block_size.max(16);
        let mut blocks = Vec::new();
        for chunk in postings.chunks(block_size) {
            blocks.push(SparseBlock::from_postings(chunk, weight_quant)?);
        }

        // A document may appear multiple times (once per ordinal), so count
        // unique doc ids rather than postings.
        let mut unique_docs = 1u32;
        for i in 1..postings.len() {
            if postings[i].0 != postings[i - 1].0 {
                unique_docs += 1;
            }
        }

        Ok(Self {
            doc_count: unique_docs,
            blocks,
        })
    }

    pub fn from_postings(
        postings: &[(DocId, u16, f32)],
        weight_quant: WeightQuantization,
    ) -> io::Result<Self> {
        Self::from_postings_with_block_size(postings, weight_quant, BLOCK_SIZE)
    }

    pub fn doc_count(&self) -> u32 {
        self.doc_count
    }

    pub fn num_blocks(&self) -> usize {
        self.blocks.len()
    }

    pub fn global_max_weight(&self) -> f32 {
        self.blocks
            .iter()
            .map(|b| b.header.max_weight)
            .fold(0.0f32, f32::max)
    }

    pub fn block_max_weight(&self, block_idx: usize) -> Option<f32> {
        self.blocks.get(block_idx).map(|b| b.header.max_weight)
    }

    /// Approximate in-memory footprint in bytes.
    pub fn size_bytes(&self) -> usize {
        use std::mem::size_of;

        let header_size = size_of::<u32>() * 2;
        let blocks_size: usize = self
            .blocks
            .iter()
            .map(|b| {
                size_of::<BlockHeader>()
                    + b.doc_ids_data.len()
                    + b.ordinals_data.len()
                    + b.weights_data.len()
            })
            .sum();
        header_size + blocks_size
    }

    pub fn iterator(&self) -> BlockSparsePostingIterator<'_> {
        BlockSparsePostingIterator::new(self)
    }

    /// Serialized layout: doc_count (u32), global max weight (f32),
    /// num_blocks (u32), one skip entry per block, then the block payloads.
    pub fn serialize<W: Write>(&self, w: &mut W) -> io::Result<()> {
        use super::SparseSkipEntry;

        w.write_u32::<LittleEndian>(self.doc_count)?;
        w.write_f32::<LittleEndian>(self.global_max_weight())?;
        w.write_u32::<LittleEndian>(self.blocks.len() as u32)?;

        let mut block_bytes: Vec<Vec<u8>> = Vec::with_capacity(self.blocks.len());
        for block in &self.blocks {
            let mut buf = Vec::new();
            block.write(&mut buf)?;
            block_bytes.push(buf);
        }

        let mut offset = 0u32;
        for (block, bytes) in self.blocks.iter().zip(block_bytes.iter()) {
            let first_doc = block.header.first_doc_id;
            let doc_ids = block.decode_doc_ids();
            let last_doc = doc_ids.last().copied().unwrap_or(first_doc);
            let length = bytes.len() as u32;

            let entry =
                SparseSkipEntry::new(first_doc, last_doc, offset, length, block.header.max_weight);
            entry.write(w)?;
            offset += length;
        }

        for bytes in block_bytes {
            w.write_all(&bytes)?;
        }

        Ok(())
    }

    pub fn deserialize<R: Read>(r: &mut R) -> io::Result<Self> {
        use super::SparseSkipEntry;

        let doc_count = r.read_u32::<LittleEndian>()?;
        let _global_max_weight = r.read_f32::<LittleEndian>()?;
        let num_blocks = r.read_u32::<LittleEndian>()? as usize;

        // Skip entries are only needed for lazy access; the blocks themselves
        // are decoded here, so the entries are read and discarded.
        for _ in 0..num_blocks {
            let _ = SparseSkipEntry::read(r)?;
        }

        let mut blocks = Vec::with_capacity(num_blocks);
        for _ in 0..num_blocks {
            blocks.push(SparseBlock::read(r)?);
        }
        Ok(Self { doc_count, blocks })
    }

    /// Reads only the list header and skip entries, returning
    /// (doc_count, global_max_weight, skip entries, header size in bytes).
    pub fn deserialize_header<R: Read>(
        r: &mut R,
    ) -> io::Result<(u32, f32, Vec<super::SparseSkipEntry>, usize)> {
        use super::SparseSkipEntry;

        let doc_count = r.read_u32::<LittleEndian>()?;
        let global_max_weight = r.read_f32::<LittleEndian>()?;
        let num_blocks = r.read_u32::<LittleEndian>()? as usize;

        let mut entries = Vec::with_capacity(num_blocks);
        for _ in 0..num_blocks {
            entries.push(SparseSkipEntry::read(r)?);
        }

        let header_size = 4 + 4 + 4 + num_blocks * SparseSkipEntry::SIZE;

        Ok((doc_count, global_max_weight, entries, header_size))
    }

    pub fn decode_all(&self) -> Vec<(DocId, u16, f32)> {
        let total_postings: usize = self.blocks.iter().map(|b| b.header.count as usize).sum();
        let mut result = Vec::with_capacity(total_postings);
        for block in &self.blocks {
            let doc_ids = block.decode_doc_ids();
            let ordinals = block.decode_ordinals();
            let weights = block.decode_weights();
            for i in 0..block.header.count as usize {
                result.push((doc_ids[i], ordinals[i], weights[i]));
            }
        }
        result
    }

    /// Merges several posting lists by concatenating their blocks, shifting
    /// each list's doc ids by its segment offset. Assumes the offsets are
    /// non-overlapping and given in ascending doc-id order.
    pub fn merge_with_offsets(lists: &[(&BlockSparsePostingList, u32)]) -> Self {
        if lists.is_empty() {
            return Self {
                doc_count: 0,
                blocks: Vec::new(),
            };
        }

        let total_blocks: usize = lists.iter().map(|(pl, _)| pl.blocks.len()).sum();
        let total_docs: u32 = lists.iter().map(|(pl, _)| pl.doc_count).sum();

        let mut merged_blocks = Vec::with_capacity(total_blocks);

        for (posting_list, doc_offset) in lists {
            for block in &posting_list.blocks {
                merged_blocks.push(block.with_doc_offset(*doc_offset));
            }
        }

        Self {
            doc_count: total_docs,
            blocks: merged_blocks,
        }
    }

    /// Finds the index of the last block whose first doc id is <= target,
    /// i.e. the only block that can contain the target.
    fn find_block(&self, target: DocId) -> Option<usize> {
        if self.blocks.is_empty() {
            return None;
        }
        let idx = self
            .blocks
            .partition_point(|b| b.header.first_doc_id <= target);
        if idx == 0 {
            Some(0)
        } else {
            Some(idx - 1)
        }
    }
}

/// Iterator over a [`BlockSparsePostingList`] that decodes one block at a
/// time and supports forward-only seeks.
pub struct BlockSparsePostingIterator<'a> {
    posting_list: &'a BlockSparsePostingList,
    block_idx: usize,
    in_block_idx: usize,
    current_doc_ids: Vec<DocId>,
    current_ordinals: Vec<u16>,
    current_weights: Vec<f32>,
    exhausted: bool,
}

impl<'a> BlockSparsePostingIterator<'a> {
    fn new(posting_list: &'a BlockSparsePostingList) -> Self {
        let mut iter = Self {
            posting_list,
            block_idx: 0,
            in_block_idx: 0,
            current_doc_ids: Vec::new(),
            current_ordinals: Vec::new(),
            current_weights: Vec::new(),
            exhausted: posting_list.blocks.is_empty(),
        };
        if !iter.exhausted {
            iter.load_block(0);
        }
        iter
    }

    fn load_block(&mut self, block_idx: usize) {
        if let Some(block) = self.posting_list.blocks.get(block_idx) {
            self.current_doc_ids = block.decode_doc_ids();
            self.current_ordinals = block.decode_ordinals();
            self.current_weights = block.decode_weights();
            self.block_idx = block_idx;
            self.in_block_idx = 0;
        }
    }

    pub fn doc(&self) -> DocId {
        if self.exhausted {
            TERMINATED
        } else {
            self.current_doc_ids
                .get(self.in_block_idx)
                .copied()
                .unwrap_or(TERMINATED)
        }
    }

    pub fn weight(&self) -> f32 {
        self.current_weights
            .get(self.in_block_idx)
            .copied()
            .unwrap_or(0.0)
    }

    pub fn ordinal(&self) -> u16 {
        self.current_ordinals
            .get(self.in_block_idx)
            .copied()
            .unwrap_or(0)
    }

    pub fn advance(&mut self) -> DocId {
        if self.exhausted {
            return TERMINATED;
        }
        self.in_block_idx += 1;
        if self.in_block_idx >= self.current_doc_ids.len() {
            self.block_idx += 1;
            if self.block_idx >= self.posting_list.blocks.len() {
                self.exhausted = true;
            } else {
                self.load_block(self.block_idx);
            }
        }
        self.doc()
    }

    pub fn seek(&mut self, target: DocId) -> DocId {
        if self.exhausted {
            return TERMINATED;
        }
        if self.doc() >= target {
            return self.doc();
        }

        // Fast path: the target is still within the currently loaded block.
        if let Some(&last_doc) = self.current_doc_ids.last()
            && last_doc >= target
        {
            let remaining = &self.current_doc_ids[self.in_block_idx..];
            let pos = remaining.partition_point(|&d| d < target);
            self.in_block_idx += pos;
            if self.in_block_idx >= self.current_doc_ids.len() {
                self.block_idx += 1;
                if self.block_idx >= self.posting_list.blocks.len() {
                    self.exhausted = true;
                } else {
                    self.load_block(self.block_idx);
                }
            }
            return self.doc();
        }

        // Slow path: binary-search the block index, then within the block.
        if let Some(block_idx) = self.posting_list.find_block(target) {
            self.load_block(block_idx);
            let pos = self.current_doc_ids.partition_point(|&d| d < target);
            self.in_block_idx = pos;
            if self.in_block_idx >= self.current_doc_ids.len() {
                self.block_idx += 1;
                if self.block_idx >= self.posting_list.blocks.len() {
                    self.exhausted = true;
                } else {
                    self.load_block(self.block_idx);
                }
            }
        } else {
            self.exhausted = true;
        }
        self.doc()
    }

    /// Jumps to the first document of the next block, useful for block-max
    /// pruning when the current block cannot beat the score threshold.
    pub fn skip_to_next_block(&mut self) -> DocId {
        if self.exhausted {
            return TERMINATED;
        }
        let next = self.block_idx + 1;
        if next >= self.posting_list.blocks.len() {
            self.exhausted = true;
            return TERMINATED;
        }
        self.load_block(next);
        self.doc()
    }

    pub fn is_exhausted(&self) -> bool {
        self.exhausted
    }

    pub fn current_block_max_weight(&self) -> f32 {
        self.posting_list
            .blocks
            .get(self.block_idx)
            .map(|b| b.header.max_weight)
            .unwrap_or(0.0)
    }

    pub fn current_block_max_contribution(&self, query_weight: f32) -> f32 {
        query_weight * self.current_block_max_weight()
    }
}

fn find_optimal_bit_width(values: &[u32]) -> u8 {
    if values.is_empty() {
        return 0;
    }
    let max_val = values.iter().copied().max().unwrap_or(0);
    simd::bits_needed(max_val)
}

fn bits_needed_u16(val: u16) -> u8 {
    if val == 0 {
        0
    } else {
        16 - val.leading_zeros() as u8
    }
}

fn pack_bit_array(values: &[u32], bits: u8) -> Vec<u8> {
    if bits == 0 || values.is_empty() {
        return Vec::new();
    }
    // Mask computed in u64 so that bits == 32 does not overflow the shift.
    let mask = ((1u64 << bits) - 1) as u32;
    let total_bytes = (values.len() * bits as usize).div_ceil(8);
    let mut result = vec![0u8; total_bytes];
    let mut bit_pos = 0usize;
    for &val in values {
        pack_value(&mut result, bit_pos, val & mask, bits);
        bit_pos += bits as usize;
    }
    result
}

fn pack_bit_array_u16(values: &[u16], bits: u8) -> Vec<u8> {
    if bits == 0 || values.is_empty() {
        return Vec::new();
    }
    let total_bytes = (values.len() * bits as usize).div_ceil(8);
    let mut result = vec![0u8; total_bytes];
    let mut bit_pos = 0usize;
    for &val in values {
        pack_value(
            &mut result,
            bit_pos,
            (val as u32) & ((1u32 << bits) - 1),
            bits,
        );
        bit_pos += bits as usize;
    }
    result
}

#[inline]
fn pack_value(data: &mut [u8], bit_pos: usize, val: u32, bits: u8) {
    let mut remaining = bits as usize;
    let mut val = val;
    let mut byte = bit_pos / 8;
    let mut offset = bit_pos % 8;
    while remaining > 0 {
        let space = 8 - offset;
        let to_write = remaining.min(space);
        let mask = (1u32 << to_write) - 1;
        data[byte] |= ((val & mask) as u8) << offset;
        val >>= to_write;
        remaining -= to_write;
        byte += 1;
        offset = 0;
    }
}

fn unpack_bit_array(data: &[u8], bits: u8, count: usize) -> Vec<u32> {
    if bits == 0 || count == 0 {
        return vec![0; count];
    }
    let mut result = Vec::with_capacity(count);
    let mut bit_pos = 0usize;
    for _ in 0..count {
        result.push(unpack_value(data, bit_pos, bits));
        bit_pos += bits as usize;
    }
    result
}

fn unpack_bit_array_u16(data: &[u8], bits: u8, count: usize) -> Vec<u16> {
    if bits == 0 || count == 0 {
        return vec![0; count];
    }
    let mut result = Vec::with_capacity(count);
    let mut bit_pos = 0usize;
    for _ in 0..count {
        result.push(unpack_value(data, bit_pos, bits) as u16);
        bit_pos += bits as usize;
    }
    result
}

#[inline]
fn unpack_value(data: &[u8], bit_pos: usize, bits: u8) -> u32 {
    let mut val = 0u32;
    let mut remaining = bits as usize;
    let mut byte = bit_pos / 8;
    let mut offset = bit_pos % 8;
    let mut shift = 0;
    while remaining > 0 {
        let space = 8 - offset;
        let to_read = remaining.min(space);
        // Mask computed in u32: `1u8 << 8` would overflow when reading a
        // whole byte at once (to_read == 8).
        let mask = ((1u32 << to_read) - 1) as u8;
        val |= (((data.get(byte).copied().unwrap_or(0) >> offset) & mask) as u32) << shift;
        remaining -= to_read;
        shift += to_read;
        byte += 1;
        offset = 0;
    }
    val
}

fn encode_weights(weights: &[f32], quant: WeightQuantization) -> io::Result<Vec<u8>> {
    let mut data = Vec::new();
    match quant {
        WeightQuantization::Float32 => {
            for &w in weights {
                data.write_f32::<LittleEndian>(w)?;
            }
        }
        WeightQuantization::Float16 => {
            use half::f16;
            for &w in weights {
                data.write_u16::<LittleEndian>(f16::from_f32(w).to_bits())?;
            }
        }
        WeightQuantization::UInt8 => {
            // Linear quantization: store scale and min, then one byte per weight.
            let min = weights.iter().copied().fold(f32::INFINITY, f32::min);
            let max = weights.iter().copied().fold(f32::NEG_INFINITY, f32::max);
            let range = max - min;
            let scale = if range < f32::EPSILON {
                1.0
            } else {
                range / 255.0
            };
            data.write_f32::<LittleEndian>(scale)?;
            data.write_f32::<LittleEndian>(min)?;
            for &w in weights {
                data.write_u8(((w - min) / scale).round() as u8)?;
            }
        }
        WeightQuantization::UInt4 => {
            // As above, but two 4-bit codes are packed per byte, low nibble first.
            let min = weights.iter().copied().fold(f32::INFINITY, f32::min);
            let max = weights.iter().copied().fold(f32::NEG_INFINITY, f32::max);
            let range = max - min;
            let scale = if range < f32::EPSILON {
                1.0
            } else {
                range / 15.0
            };
            data.write_f32::<LittleEndian>(scale)?;
            data.write_f32::<LittleEndian>(min)?;
            let mut i = 0;
            while i < weights.len() {
                let q1 = ((weights[i] - min) / scale).round() as u8 & 0x0F;
                let q2 = if i + 1 < weights.len() {
                    ((weights[i + 1] - min) / scale).round() as u8 & 0x0F
                } else {
                    0
                };
                data.write_u8((q2 << 4) | q1)?;
                i += 2;
            }
        }
    }
    Ok(data)
}

fn decode_weights(data: &[u8], quant: WeightQuantization, count: usize) -> Vec<f32> {
    let mut cursor = Cursor::new(data);
    let mut weights = Vec::with_capacity(count);
    match quant {
        WeightQuantization::Float32 => {
            for _ in 0..count {
                weights.push(cursor.read_f32::<LittleEndian>().unwrap_or(0.0));
            }
        }
        WeightQuantization::Float16 => {
            use half::f16;
            for _ in 0..count {
                let bits = cursor.read_u16::<LittleEndian>().unwrap_or(0);
                weights.push(f16::from_bits(bits).to_f32());
            }
        }
        WeightQuantization::UInt8 => {
            let scale = cursor.read_f32::<LittleEndian>().unwrap_or(1.0);
            let min = cursor.read_f32::<LittleEndian>().unwrap_or(0.0);
            for _ in 0..count {
                let q = cursor.read_u8().unwrap_or(0);
                weights.push(q as f32 * scale + min);
            }
        }
        WeightQuantization::UInt4 => {
            let scale = cursor.read_f32::<LittleEndian>().unwrap_or(1.0);
            let min = cursor.read_f32::<LittleEndian>().unwrap_or(0.0);
            let mut i = 0;
            while i < count {
                let byte = cursor.read_u8().unwrap_or(0);
                weights.push((byte & 0x0F) as f32 * scale + min);
                i += 1;
                if i < count {
                    weights.push((byte >> 4) as f32 * scale + min);
                    i += 1;
                }
            }
        }
    }
    weights
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_block_roundtrip() {
        let postings = vec![
            (10u32, 0u16, 1.5f32),
            (15, 0, 2.0),
            (20, 1, 0.5),
            (100, 0, 3.0),
        ];
        let block = SparseBlock::from_postings(&postings, WeightQuantization::Float32).unwrap();

        assert_eq!(block.decode_doc_ids(), vec![10, 15, 20, 100]);
        assert_eq!(block.decode_ordinals(), vec![0, 0, 1, 0]);
        let weights = block.decode_weights();
        assert!((weights[0] - 1.5).abs() < 0.01);
    }
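
    // Sanity check: a BlockHeader write/read round-trip should preserve every
    // field and consume exactly BlockHeader::SIZE bytes.
    #[test]
    fn test_block_header_roundtrip() {
        let header = BlockHeader {
            count: 42,
            doc_id_bits: 7,
            ordinal_bits: 3,
            weight_quant: WeightQuantization::Float16,
            first_doc_id: 1234,
            max_weight: 2.5,
        };
        let mut buf = Vec::new();
        header.write(&mut buf).unwrap();
        assert_eq!(buf.len(), BlockHeader::SIZE);

        let decoded = BlockHeader::read(&mut Cursor::new(&buf)).unwrap();
        assert_eq!(decoded.count, 42);
        assert_eq!(decoded.doc_id_bits, 7);
        assert_eq!(decoded.ordinal_bits, 3);
        assert_eq!(decoded.first_doc_id, 1234);
        assert!((decoded.max_weight - 2.5).abs() < f32::EPSILON);
        assert_eq!(decoded.weight_quant as u8, WeightQuantization::Float16 as u8);
    }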

    #[test]
    fn test_posting_list() {
        let postings: Vec<(DocId, u16, f32)> =
            (0..300).map(|i| (i * 2, 0, i as f32 * 0.1)).collect();
        let list =
            BlockSparsePostingList::from_postings(&postings, WeightQuantization::Float32).unwrap();

        assert_eq!(list.doc_count(), 300);
        assert_eq!(list.num_blocks(), 3);

        let mut iter = list.iterator();
        assert_eq!(iter.doc(), 0);
        iter.advance();
        assert_eq!(iter.doc(), 2);
    }
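
    // Exercises skip_to_next_block: with BLOCK_SIZE postings per block and
    // doc ids equal to the posting index, each block starts at a multiple of
    // BLOCK_SIZE, and skipping past the last block exhausts the iterator.
    #[test]
    fn test_skip_to_next_block() {
        let postings: Vec<(DocId, u16, f32)> = (0..300).map(|i| (i, 0, 1.0)).collect();
        let list =
            BlockSparsePostingList::from_postings(&postings, WeightQuantization::Float32).unwrap();

        let mut iter = list.iterator();
        assert_eq!(iter.doc(), 0);
        assert_eq!(iter.skip_to_next_block(), 128);
        assert_eq!(iter.skip_to_next_block(), 256);
        assert_eq!(iter.skip_to_next_block(), TERMINATED);
        assert!(iter.is_exhausted());
    }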

    #[test]
    fn test_serialization() {
        let postings = vec![(1u32, 0u16, 0.5f32), (10, 1, 1.5), (100, 0, 2.5)];
        let list =
            BlockSparsePostingList::from_postings(&postings, WeightQuantization::UInt8).unwrap();

        let mut buf = Vec::new();
        list.serialize(&mut buf).unwrap();
        let list2 = BlockSparsePostingList::deserialize(&mut Cursor::new(&buf)).unwrap();

        assert_eq!(list.doc_count(), list2.doc_count());
    }
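
    // UInt8 quantization stores (scale, min) followed by one byte per weight,
    // so reconstruction error is bounded by half a quantization step,
    // i.e. (max - min) / 255 / 2.
    #[test]
    fn test_uint8_quantization_error_bound() {
        let weights: Vec<f32> = (0..100).map(|i| i as f32 * 0.37).collect();
        let data = encode_weights(&weights, WeightQuantization::UInt8).unwrap();
        // 4 bytes scale + 4 bytes min + 1 byte per weight.
        assert_eq!(data.len(), 8 + weights.len());

        let decoded = decode_weights(&data, WeightQuantization::UInt8, weights.len());
        let half_step = (99.0f32 * 0.37) / 255.0 / 2.0;
        for (orig, dec) in weights.iter().zip(decoded.iter()) {
            assert!((orig - dec).abs() <= half_step + 1e-4);
        }
    }

    // UInt4 packs two 4-bit codes per byte (low nibble first), so the payload
    // is ceil(n / 2) bytes after the same 8-byte (scale, min) prefix.
    #[test]
    fn test_uint4_payload_size() {
        let weights = vec![0.0f32, 1.0, 2.0, 3.0, 4.0];
        let data = encode_weights(&weights, WeightQuantization::UInt4).unwrap();
        assert_eq!(data.len(), 8 + weights.len().div_ceil(2));
        assert_eq!(
            decode_weights(&data, WeightQuantization::UInt4, weights.len()).len(),
            weights.len()
        );
    }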

    #[test]
    fn test_seek() {
        let postings: Vec<(DocId, u16, f32)> = (0..500).map(|i| (i * 3, 0, i as f32)).collect();
        let list =
            BlockSparsePostingList::from_postings(&postings, WeightQuantization::Float32).unwrap();

        let mut iter = list.iterator();
        assert_eq!(iter.seek(300), 300);
        assert_eq!(iter.seek(301), 303);
        assert_eq!(iter.seek(2000), TERMINATED);
    }
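
    // Round-trips the raw bit-packing helpers at an unaligned width to make
    // sure values crossing byte boundaries survive pack/unpack.
    #[test]
    fn test_bit_pack_roundtrip() {
        let values = vec![0u32, 1, 17, 31, 4, 9];
        let packed = pack_bit_array(&values, 5);
        // 6 values * 5 bits = 30 bits, rounded up to 4 bytes.
        assert_eq!(packed.len(), 4);
        assert_eq!(unpack_bit_array(&packed, 5, values.len()), values);

        // A full byte per value (bits == 8) is the aligned edge case.
        let bytes = pack_bit_array(&values, 8);
        assert_eq!(unpack_bit_array(&bytes, 8, values.len()), values);
    }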

    #[test]
    fn test_merge_with_offsets() {
        let postings1: Vec<(DocId, u16, f32)> = vec![(0, 0, 1.0), (5, 0, 2.0), (10, 1, 3.0)];
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();

        let postings2: Vec<(DocId, u16, f32)> = vec![(0, 0, 4.0), (3, 1, 5.0), (7, 0, 6.0)];
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 100)]);

        assert_eq!(merged.doc_count(), 6);

        let decoded = merged.decode_all();
        assert_eq!(decoded.len(), 6);

        // Segment 1 doc ids are unchanged.
        assert_eq!(decoded[0].0, 0);
        assert_eq!(decoded[1].0, 5);
        assert_eq!(decoded[2].0, 10);

        // Segment 2 doc ids are shifted by the offset of 100.
        assert_eq!(decoded[3].0, 100);
        assert_eq!(decoded[4].0, 103);
        assert_eq!(decoded[5].0, 107);

        // Weights and ordinals are carried through untouched.
        assert!((decoded[0].2 - 1.0).abs() < 0.01);
        assert!((decoded[3].2 - 4.0).abs() < 0.01);

        assert_eq!(decoded[2].1, 1);
        assert_eq!(decoded[4].1, 1);
    }

    #[test]
    fn test_merge_with_offsets_multi_block() {
        let postings1: Vec<(DocId, u16, f32)> = (0..200).map(|i| (i * 2, 0, i as f32)).collect();
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();
        assert!(list1.num_blocks() > 1, "Should have multiple blocks");

        let postings2: Vec<(DocId, u16, f32)> = (0..150).map(|i| (i * 3, 1, i as f32)).collect();
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 1000)]);

        assert_eq!(merged.doc_count(), 350);
        assert_eq!(merged.num_blocks(), list1.num_blocks() + list2.num_blocks());

        let mut iter = merged.iterator();
        assert_eq!(iter.doc(), 0);

        // Seek across the segment boundary into the offset-shifted blocks.
        let doc = iter.seek(1000);
        assert_eq!(doc, 1000);
        iter.advance();
        assert_eq!(iter.doc(), 1003);
    }

    #[test]
    fn test_merge_with_offsets_serialize_roundtrip() {
        let postings1: Vec<(DocId, u16, f32)> = vec![(0, 0, 1.0), (5, 0, 2.0), (10, 1, 3.0)];
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();

        let postings2: Vec<(DocId, u16, f32)> = vec![(0, 0, 4.0), (3, 1, 5.0), (7, 0, 6.0)];
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 100)]);

        let mut bytes = Vec::new();
        merged.serialize(&mut bytes).unwrap();

        let mut cursor = std::io::Cursor::new(&bytes);
        let loaded = BlockSparsePostingList::deserialize(&mut cursor).unwrap();

        let decoded = loaded.decode_all();
        assert_eq!(decoded.len(), 6);

        assert_eq!(decoded[0].0, 0);
        assert_eq!(decoded[1].0, 5);
        assert_eq!(decoded[2].0, 10);

        assert_eq!(decoded[3].0, 100, "First doc of seg2 should be 0+100=100");
        assert_eq!(decoded[4].0, 103, "Second doc of seg2 should be 3+100=103");
        assert_eq!(decoded[5].0, 107, "Third doc of seg2 should be 7+100=107");

        let mut iter = loaded.iterator();
        assert_eq!(iter.doc(), 0);
        iter.advance();
        assert_eq!(iter.doc(), 5);
        iter.advance();
        assert_eq!(iter.doc(), 10);
        iter.advance();
        assert_eq!(iter.doc(), 100);
        iter.advance();
        assert_eq!(iter.doc(), 103);
        iter.advance();
        assert_eq!(iter.doc(), 107);
    }

    #[test]
    fn test_merge_seek_after_roundtrip() {
        let postings1: Vec<(DocId, u16, f32)> = (0..200).map(|i| (i * 2, 0, 1.0)).collect();
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();

        let postings2: Vec<(DocId, u16, f32)> = (0..150).map(|i| (i * 3, 0, 2.0)).collect();
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 1000)]);

        let mut bytes = Vec::new();
        merged.serialize(&mut bytes).unwrap();
        let loaded =
            BlockSparsePostingList::deserialize(&mut std::io::Cursor::new(&bytes)).unwrap();

        let mut iter = loaded.iterator();

        let doc = iter.seek(100);
        assert_eq!(doc, 100, "Seek to 100 in segment 1");

        let doc = iter.seek(1000);
        assert_eq!(doc, 1000, "Seek to 1000 (first doc of segment 2)");

        let doc = iter.seek(1050);
        assert!(
            doc >= 1050,
            "Seek to 1050 should find doc >= 1050, got {}",
            doc
        );

        let doc = iter.seek(500);
        assert!(
            doc >= 1050,
            "Seek backwards should not go back, got {}",
            doc
        );

        let mut iter2 = loaded.iterator();

        let mut count = 0;
        let mut prev_doc = 0;
        while iter2.doc() != super::TERMINATED {
            let current = iter2.doc();
            if count > 0 {
                assert!(
                    current > prev_doc,
                    "Docs should be monotonically increasing: {} vs {}",
                    prev_doc,
                    current
                );
            }
            prev_doc = current;
            iter2.advance();
            count += 1;
        }
        assert_eq!(count, 350, "Should have 350 total docs");
    }

    #[test]
    fn test_doc_count_multi_value() {
        // Three unique documents, six postings (multiple ordinals per doc).
        let postings: Vec<(DocId, u16, f32)> = vec![
            (0, 0, 1.0),
            (0, 1, 1.5),
            (0, 2, 2.0),
            (5, 0, 3.0),
            (5, 1, 3.5),
            (10, 0, 4.0),
        ];
        let list =
            BlockSparsePostingList::from_postings(&postings, WeightQuantization::Float32).unwrap();

        assert_eq!(list.doc_count(), 3);

        let decoded = list.decode_all();
        assert_eq!(decoded.len(), 6);
    }

    // Emulates a zero-copy merge: concatenate the serialized bytes of two
    // lists, adjusting skip entries and patching each block's first_doc_id
    // in place instead of re-encoding the blocks.
    #[test]
    fn test_zero_copy_merge_patches_first_doc_id() {
        use crate::structures::SparseSkipEntry;

        let postings1: Vec<(DocId, u16, f32)> = (0..200).map(|i| (i * 2, 0, i as f32)).collect();
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();
        assert!(list1.num_blocks() > 1);

        let postings2: Vec<(DocId, u16, f32)> = (0..150).map(|i| (i * 3, 1, i as f32)).collect();
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        let mut bytes1 = Vec::new();
        list1.serialize(&mut bytes1).unwrap();
        let mut bytes2 = Vec::new();
        list2.serialize(&mut bytes2).unwrap();

        fn parse_raw(data: &[u8]) -> (u32, f32, Vec<SparseSkipEntry>, &[u8]) {
            let doc_count = u32::from_le_bytes(data[0..4].try_into().unwrap());
            let global_max = f32::from_le_bytes(data[4..8].try_into().unwrap());
            let num_blocks = u32::from_le_bytes(data[8..12].try_into().unwrap()) as usize;
            let mut pos = 12;
            let mut skip = Vec::new();
            for _ in 0..num_blocks {
                let first_doc = u32::from_le_bytes(data[pos..pos + 4].try_into().unwrap());
                let last_doc = u32::from_le_bytes(data[pos + 4..pos + 8].try_into().unwrap());
                let offset = u32::from_le_bytes(data[pos + 8..pos + 12].try_into().unwrap());
                let length = u32::from_le_bytes(data[pos + 12..pos + 16].try_into().unwrap());
                let max_w = f32::from_le_bytes(data[pos + 16..pos + 20].try_into().unwrap());
                skip.push(SparseSkipEntry::new(
                    first_doc, last_doc, offset, length, max_w,
                ));
                pos += 20;
            }
            (doc_count, global_max, skip, &data[pos..])
        }

        let (dc1, gm1, skip1, raw1) = parse_raw(&bytes1);
        let (dc2, gm2, skip2, raw2) = parse_raw(&bytes2);

        let doc_offset: u32 = 1000;
        let total_docs = dc1 + dc2;
        let global_max = gm1.max(gm2);
        let total_blocks = (skip1.len() + skip2.len()) as u32;

        let mut output = Vec::new();
        output.extend_from_slice(&total_docs.to_le_bytes());
        output.extend_from_slice(&global_max.to_le_bytes());
        output.extend_from_slice(&total_blocks.to_le_bytes());

        // Rewrite skip entries: shift segment 2's doc ids and block offsets.
        let mut block_data_offset = 0u32;
        for entry in &skip1 {
            let adjusted = SparseSkipEntry::new(
                entry.first_doc,
                entry.last_doc,
                block_data_offset + entry.offset,
                entry.length,
                entry.max_weight,
            );
            adjusted.write(&mut output).unwrap();
        }
        if let Some(last) = skip1.last() {
            block_data_offset += last.offset + last.length;
        }
        for entry in &skip2 {
            let adjusted = SparseSkipEntry::new(
                entry.first_doc + doc_offset,
                entry.last_doc + doc_offset,
                block_data_offset + entry.offset,
                entry.length,
                entry.max_weight,
            );
            adjusted.write(&mut output).unwrap();
        }

        output.extend_from_slice(raw1);

        // first_doc_id sits 8 bytes into each BlockHeader (after count, bit
        // widths, the quantization tag, and padding); patch it in place.
        const FIRST_DOC_ID_OFFSET: usize = 8;
        let mut buf2 = raw2.to_vec();
        for entry in &skip2 {
            let off = entry.offset as usize + FIRST_DOC_ID_OFFSET;
            if off + 4 <= buf2.len() {
                let old = u32::from_le_bytes(buf2[off..off + 4].try_into().unwrap());
                let patched = (old + doc_offset).to_le_bytes();
                buf2[off..off + 4].copy_from_slice(&patched);
            }
        }
        output.extend_from_slice(&buf2);

        let loaded = BlockSparsePostingList::deserialize(&mut Cursor::new(&output)).unwrap();
        assert_eq!(loaded.doc_count(), 350);

        let mut iter = loaded.iterator();

        assert_eq!(iter.doc(), 0);
        let doc = iter.seek(100);
        assert_eq!(doc, 100);
        let doc = iter.seek(398);
        assert_eq!(doc, 398);

        let doc = iter.seek(1000);
        assert_eq!(doc, 1000, "First doc of segment 2 should be 1000");
        iter.advance();
        assert_eq!(iter.doc(), 1003, "Second doc of segment 2 should be 1003");
        let doc = iter.seek(1447);
        assert_eq!(doc, 1447, "Last doc of segment 2 should be 1447");

        iter.advance();
        assert_eq!(iter.doc(), super::TERMINATED);

        // Cross-check against the in-memory merge path.
        let reference =
            BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, doc_offset)]);
        let mut ref_iter = reference.iterator();
        let mut zc_iter = loaded.iterator();
        while ref_iter.doc() != super::TERMINATED {
            assert_eq!(
                ref_iter.doc(),
                zc_iter.doc(),
                "Zero-copy and reference merge should produce identical doc_ids"
            );
            assert!(
                (ref_iter.weight() - zc_iter.weight()).abs() < 0.01,
                "Weights should match: {} vs {}",
                ref_iter.weight(),
                zc_iter.weight()
            );
            ref_iter.advance();
            zc_iter.advance();
        }
        assert_eq!(zc_iter.doc(), super::TERMINATED);
    }

    #[test]
    fn test_doc_count_single_value() {
        let postings: Vec<(DocId, u16, f32)> =
            vec![(0, 0, 1.0), (5, 0, 2.0), (10, 0, 3.0), (15, 0, 4.0)];
        let list =
            BlockSparsePostingList::from_postings(&postings, WeightQuantization::Float32).unwrap();

        assert_eq!(list.doc_count(), 4);
    }

    #[test]
    fn test_doc_count_multi_value_serialization_roundtrip() {
        let postings: Vec<(DocId, u16, f32)> =
            vec![(0, 0, 1.0), (0, 1, 1.5), (5, 0, 2.0), (5, 1, 2.5)];
        let list =
            BlockSparsePostingList::from_postings(&postings, WeightQuantization::Float32).unwrap();
        assert_eq!(list.doc_count(), 2);

        let mut buf = Vec::new();
        list.serialize(&mut buf).unwrap();
        let loaded = BlockSparsePostingList::deserialize(&mut Cursor::new(&buf)).unwrap();
        assert_eq!(loaded.doc_count(), 2);
    }

    #[test]
    fn test_merge_preserves_weights_and_ordinals() {
        let postings1: Vec<(DocId, u16, f32)> = vec![(0, 0, 1.5), (5, 1, 2.5), (10, 2, 3.5)];
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();

        let postings2: Vec<(DocId, u16, f32)> = vec![(0, 0, 4.5), (3, 1, 5.5), (7, 3, 6.5)];
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 100)]);

        let mut bytes = Vec::new();
        merged.serialize(&mut bytes).unwrap();
        let loaded =
            BlockSparsePostingList::deserialize(&mut std::io::Cursor::new(&bytes)).unwrap();

        let mut iter = loaded.iterator();

        assert_eq!(iter.doc(), 0);
        assert!(
            (iter.weight() - 1.5).abs() < 0.01,
            "Weight should be 1.5, got {}",
            iter.weight()
        );
        assert_eq!(iter.ordinal(), 0);

        iter.advance();
        assert_eq!(iter.doc(), 5);
        assert!(
            (iter.weight() - 2.5).abs() < 0.01,
            "Weight should be 2.5, got {}",
            iter.weight()
        );
        assert_eq!(iter.ordinal(), 1);

        iter.advance();
        assert_eq!(iter.doc(), 10);
        assert!(
            (iter.weight() - 3.5).abs() < 0.01,
            "Weight should be 3.5, got {}",
            iter.weight()
        );
        assert_eq!(iter.ordinal(), 2);

        iter.advance();
        assert_eq!(iter.doc(), 100);
        assert!(
            (iter.weight() - 4.5).abs() < 0.01,
            "Weight should be 4.5, got {}",
            iter.weight()
        );
        assert_eq!(iter.ordinal(), 0);

        iter.advance();
        assert_eq!(iter.doc(), 103);
        assert!(
            (iter.weight() - 5.5).abs() < 0.01,
            "Weight should be 5.5, got {}",
            iter.weight()
        );
        assert_eq!(iter.ordinal(), 1);

        iter.advance();
        assert_eq!(iter.doc(), 107);
        assert!(
            (iter.weight() - 6.5).abs() < 0.01,
            "Weight should be 6.5, got {}",
            iter.weight()
        );
        assert_eq!(iter.ordinal(), 3);

        iter.advance();
        assert_eq!(iter.doc(), super::TERMINATED);
    }

    #[test]
    fn test_merge_global_max_weight() {
        let postings1: Vec<(DocId, u16, f32)> = vec![
            (0, 0, 3.0),
            (1, 0, 7.0), // max across both lists
            (2, 0, 2.0),
        ];
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();

        let postings2: Vec<(DocId, u16, f32)> = vec![
            (0, 0, 5.0),
            (1, 0, 4.0),
            (2, 0, 6.0), // max within this list
        ];
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        assert!((list1.global_max_weight() - 7.0).abs() < 0.01);
        assert!((list2.global_max_weight() - 6.0).abs() < 0.01);

        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 100)]);

        assert!(
            (merged.global_max_weight() - 7.0).abs() < 0.01,
            "Global max should be 7.0, got {}",
            merged.global_max_weight()
        );

        let mut bytes = Vec::new();
        merged.serialize(&mut bytes).unwrap();
        let loaded =
            BlockSparsePostingList::deserialize(&mut std::io::Cursor::new(&bytes)).unwrap();

        assert!(
            (loaded.global_max_weight() - 7.0).abs() < 0.01,
            "After roundtrip, global max should still be 7.0, got {}",
            loaded.global_max_weight()
        );
    }

    #[test]
    fn test_scoring_simulation_after_merge() {
        let postings1: Vec<(DocId, u16, f32)> = vec![(0, 0, 0.5), (5, 0, 0.8)];
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();

        let postings2: Vec<(DocId, u16, f32)> = vec![(0, 0, 0.6), (3, 0, 0.9)];
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 100)]);

        let mut bytes = Vec::new();
        merged.serialize(&mut bytes).unwrap();
        let loaded =
            BlockSparsePostingList::deserialize(&mut std::io::Cursor::new(&bytes)).unwrap();

        let query_weight = 2.0f32;
        let mut iter = loaded.iterator();

        assert_eq!(iter.doc(), 0);
        let score = query_weight * iter.weight();
        assert!(
            (score - 1.0).abs() < 0.01,
            "Doc 0 score should be 1.0, got {}",
            score
        );

        iter.advance();
        assert_eq!(iter.doc(), 5);
        let score = query_weight * iter.weight();
        assert!(
            (score - 1.6).abs() < 0.01,
            "Doc 5 score should be 1.6, got {}",
            score
        );

        iter.advance();
        assert_eq!(iter.doc(), 100);
        let score = query_weight * iter.weight();
        assert!(
            (score - 1.2).abs() < 0.01,
            "Doc 100 score should be 1.2, got {}",
            score
        );

        iter.advance();
        assert_eq!(iter.doc(), 103);
        let score = query_weight * iter.weight();
        assert!(
            (score - 1.8).abs() < 0.01,
            "Doc 103 score should be 1.8, got {}",
            score
        );
    }
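
    // The block-max bound used for pruning: every document's contribution in
    // the current block must be <= query_weight * current_block_max_weight().
    #[test]
    fn test_block_max_contribution_is_upper_bound() {
        let postings: Vec<(DocId, u16, f32)> =
            (0..200).map(|i| (i, 0, (i % 7) as f32)).collect();
        let list =
            BlockSparsePostingList::from_postings(&postings, WeightQuantization::Float32).unwrap();

        let query_weight = 1.5f32;
        let mut iter = list.iterator();
        while !iter.is_exhausted() {
            let bound = iter.current_block_max_contribution(query_weight);
            assert!(query_weight * iter.weight() <= bound + 1e-6);
            iter.advance();
        }
    }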
}