
hermes_core/structures/postings/sparse/block.rs

//! Block-based sparse posting list with 3 sub-blocks
//!
//! Format per block (128 entries for SIMD alignment):
//! - Doc IDs: delta-encoded, bit-packed
//! - Ordinals: bit-packed small integers (lazy decode)
//! - Weights: quantized (f32/f16/u8/u4)

use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use std::io::{self, Cursor, Read, Write};

use super::config::WeightQuantization;
use crate::DocId;
use crate::structures::postings::TERMINATED;
use crate::structures::simd;

pub const BLOCK_SIZE: usize = 128;

#[derive(Debug, Clone, Copy)]
pub struct BlockHeader {
    pub count: u16,
    pub doc_id_bits: u8,
    pub ordinal_bits: u8,
    pub weight_quant: WeightQuantization,
    pub first_doc_id: DocId,
    pub max_weight: f32,
}

impl BlockHeader {
    pub const SIZE: usize = 16;

    pub fn write<W: Write>(&self, w: &mut W) -> io::Result<()> {
        w.write_u16::<LittleEndian>(self.count)?;
        w.write_u8(self.doc_id_bits)?;
        w.write_u8(self.ordinal_bits)?;
        w.write_u8(self.weight_quant as u8)?;
        w.write_u8(0)?;
        w.write_u16::<LittleEndian>(0)?;
        w.write_u32::<LittleEndian>(self.first_doc_id)?;
        w.write_f32::<LittleEndian>(self.max_weight)?;
        Ok(())
    }

    pub fn read<R: Read>(r: &mut R) -> io::Result<Self> {
        let count = r.read_u16::<LittleEndian>()?;
        let doc_id_bits = r.read_u8()?;
        let ordinal_bits = r.read_u8()?;
        let weight_quant_byte = r.read_u8()?;
        let _ = r.read_u8()?;
        let _ = r.read_u16::<LittleEndian>()?;
        let first_doc_id = r.read_u32::<LittleEndian>()?;
        let max_weight = r.read_f32::<LittleEndian>()?;

        let weight_quant = WeightQuantization::from_u8(weight_quant_byte)
            .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Invalid weight quant"))?;

        Ok(Self {
            count,
            doc_id_bits,
            ordinal_bits,
            weight_quant,
            first_doc_id,
            max_weight,
        })
    }
}
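
// On-disk layout produced by BlockHeader::write (16 bytes, little-endian),
// spelled out here for reference since the writer above is the source of truth:
//
//   offset  0..2   count         (u16)
//   offset  2      doc_id_bits   (u8)
//   offset  3      ordinal_bits  (u8)
//   offset  4      weight_quant  (u8)
//   offset  5..8   padding       (u8 + u16, written as zero)
//   offset  8..12  first_doc_id  (u32)
//   offset 12..16  max_weight    (f32)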

#[derive(Debug, Clone)]
pub struct SparseBlock {
    pub header: BlockHeader,
    pub doc_ids_data: Vec<u8>,
    pub ordinals_data: Vec<u8>,
    pub weights_data: Vec<u8>,
}

impl SparseBlock {
    pub fn from_postings(
        postings: &[(DocId, u16, f32)],
        weight_quant: WeightQuantization,
    ) -> io::Result<Self> {
        assert!(!postings.is_empty() && postings.len() <= BLOCK_SIZE);

        let count = postings.len();
        let first_doc_id = postings[0].0;

        // Delta encode doc IDs
        let mut deltas = Vec::with_capacity(count);
        let mut prev = first_doc_id;
        for &(doc_id, _, _) in postings {
            deltas.push(doc_id.saturating_sub(prev));
            prev = doc_id;
        }
        deltas[0] = 0;

        let doc_id_bits = find_optimal_bit_width(&deltas[1..]);
        let ordinals: Vec<u16> = postings.iter().map(|(_, o, _)| *o).collect();
        let max_ordinal = ordinals.iter().copied().max().unwrap_or(0);
        let ordinal_bits = if max_ordinal == 0 {
            0
        } else {
            bits_needed_u16(max_ordinal)
        };

        let weights: Vec<f32> = postings.iter().map(|(_, _, w)| *w).collect();
        let max_weight = weights.iter().copied().fold(0.0f32, f32::max);

        let doc_ids_data = pack_bit_array(&deltas[1..], doc_id_bits);
        let ordinals_data = if ordinal_bits > 0 {
            pack_bit_array_u16(&ordinals, ordinal_bits)
        } else {
            Vec::new()
        };
        let weights_data = encode_weights(&weights, weight_quant)?;

        Ok(Self {
            header: BlockHeader {
                count: count as u16,
                doc_id_bits,
                ordinal_bits,
                weight_quant,
                first_doc_id,
                max_weight,
            },
            doc_ids_data,
            ordinals_data,
            weights_data,
        })
    }
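
    // Worked example (mirrors `test_block_roundtrip` below): postings with doc IDs
    // [10, 15, 20, 100] store first_doc_id = 10 and pack the deltas [5, 5, 80]
    // (the leading zero delta is implicit), so doc_id_bits = bits_needed(80) = 7
    // and the three deltas occupy ceil(3 * 7 / 8) = 3 bytes of doc_ids_data.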

    pub fn decode_doc_ids(&self) -> Vec<DocId> {
        let mut out = Vec::with_capacity(self.header.count as usize);
        self.decode_doc_ids_into(&mut out);
        out
    }

    /// Decode doc IDs into an existing Vec (avoids allocation on reuse).
    pub fn decode_doc_ids_into(&self, out: &mut Vec<DocId>) {
        let count = self.header.count as usize;
        out.clear();
        out.push(self.header.first_doc_id);

        if count > 1 {
            let deltas = unpack_bit_array(&self.doc_ids_data, self.header.doc_id_bits, count - 1);
            let mut prev = self.header.first_doc_id;
            for delta in deltas {
                prev += delta;
                out.push(prev);
            }
        }
    }

    pub fn decode_ordinals(&self) -> Vec<u16> {
        let mut out = Vec::with_capacity(self.header.count as usize);
        self.decode_ordinals_into(&mut out);
        out
    }

    /// Decode ordinals into an existing Vec (avoids allocation on reuse).
    pub fn decode_ordinals_into(&self, out: &mut Vec<u16>) {
        let count = self.header.count as usize;
        out.clear();
        if self.header.ordinal_bits == 0 {
            out.resize(count, 0u16);
        } else {
            unpack_bit_array_u16_into(&self.ordinals_data, self.header.ordinal_bits, count, out);
        }
    }

    pub fn decode_weights(&self) -> Vec<f32> {
        let mut out = Vec::with_capacity(self.header.count as usize);
        self.decode_weights_into(&mut out);
        out
    }

    /// Decode weights into an existing Vec (avoids allocation on reuse).
    pub fn decode_weights_into(&self, out: &mut Vec<f32>) {
        out.clear();
        decode_weights_into(
            &self.weights_data,
            self.header.weight_quant,
            self.header.count as usize,
            out,
        );
    }

    pub fn write<W: Write>(&self, w: &mut W) -> io::Result<()> {
        self.header.write(w)?;
        w.write_u16::<LittleEndian>(self.doc_ids_data.len() as u16)?;
        w.write_u16::<LittleEndian>(self.ordinals_data.len() as u16)?;
        w.write_u16::<LittleEndian>(self.weights_data.len() as u16)?;
        w.write_u16::<LittleEndian>(0)?;
        w.write_all(&self.doc_ids_data)?;
        w.write_all(&self.ordinals_data)?;
        w.write_all(&self.weights_data)?;
        Ok(())
    }
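
    // Serialized block layout (as written above): the 16-byte header, then three
    // u16 section lengths plus a u16 pad (8 bytes), then the doc-ID, ordinal, and
    // weight payloads back to back. Note that first_doc_id therefore sits at byte
    // offset 8 of a serialized block, which is what the zero-copy merge path
    // patches in place (see `test_zero_copy_merge_patches_first_doc_id` below).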

    pub fn read<R: Read>(r: &mut R) -> io::Result<Self> {
        let header = BlockHeader::read(r)?;
        let doc_ids_len = r.read_u16::<LittleEndian>()? as usize;
        let ordinals_len = r.read_u16::<LittleEndian>()? as usize;
        let weights_len = r.read_u16::<LittleEndian>()? as usize;
        let _ = r.read_u16::<LittleEndian>()?;

        let mut doc_ids_data = vec![0u8; doc_ids_len];
        r.read_exact(&mut doc_ids_data)?;
        let mut ordinals_data = vec![0u8; ordinals_len];
        r.read_exact(&mut ordinals_data)?;
        let mut weights_data = vec![0u8; weights_len];
        r.read_exact(&mut weights_data)?;

        Ok(Self {
            header,
            doc_ids_data,
            ordinals_data,
            weights_data,
        })
    }

    /// Create a copy of this block with first_doc_id adjusted by offset.
    ///
    /// This is used during merge to remap doc_ids from different segments.
    /// Only the first_doc_id needs adjustment - deltas within the block
    /// remain unchanged since they're relative to the previous doc.
    pub fn with_doc_offset(&self, doc_offset: u32) -> Self {
        Self {
            header: BlockHeader {
                first_doc_id: self.header.first_doc_id + doc_offset,
                ..self.header
            },
            doc_ids_data: self.doc_ids_data.clone(),
            ordinals_data: self.ordinals_data.clone(),
            weights_data: self.weights_data.clone(),
        }
    }
}

// ============================================================================
// BlockSparsePostingList
// ============================================================================

#[derive(Debug, Clone)]
pub struct BlockSparsePostingList {
    pub doc_count: u32,
    pub blocks: Vec<SparseBlock>,
}

impl BlockSparsePostingList {
    /// Create from postings with configurable block size
    pub fn from_postings_with_block_size(
        postings: &[(DocId, u16, f32)],
        weight_quant: WeightQuantization,
        block_size: usize,
    ) -> io::Result<Self> {
        if postings.is_empty() {
            return Ok(Self {
                doc_count: 0,
                blocks: Vec::new(),
            });
        }

        let block_size = block_size.max(16); // minimum 16 for sanity
        let mut blocks = Vec::new();
        for chunk in postings.chunks(block_size) {
            blocks.push(SparseBlock::from_postings(chunk, weight_quant)?);
        }

        // Count unique document IDs (not total postings).
        // For multi-value fields, the same doc_id appears multiple times
        // with different ordinals. Postings are sorted by (doc_id, ordinal),
        // so we count transitions.
        let mut unique_docs = 1u32;
        for i in 1..postings.len() {
            if postings[i].0 != postings[i - 1].0 {
                unique_docs += 1;
            }
        }

        Ok(Self {
            doc_count: unique_docs,
            blocks,
        })
    }

    /// Create from postings with default block size (128)
    pub fn from_postings(
        postings: &[(DocId, u16, f32)],
        weight_quant: WeightQuantization,
    ) -> io::Result<Self> {
        Self::from_postings_with_block_size(postings, weight_quant, BLOCK_SIZE)
    }

    pub fn doc_count(&self) -> u32 {
        self.doc_count
    }

    pub fn num_blocks(&self) -> usize {
        self.blocks.len()
    }

    pub fn global_max_weight(&self) -> f32 {
        self.blocks
            .iter()
            .map(|b| b.header.max_weight)
            .fold(0.0f32, f32::max)
    }

    pub fn block_max_weight(&self, block_idx: usize) -> Option<f32> {
        self.blocks.get(block_idx).map(|b| b.header.max_weight)
    }

    /// Approximate memory usage in bytes
    pub fn size_bytes(&self) -> usize {
        use std::mem::size_of;

        let header_size = size_of::<u32>() * 2; // doc_count + num_blocks
        let blocks_size: usize = self
            .blocks
            .iter()
            .map(|b| {
                size_of::<BlockHeader>()
                    + b.doc_ids_data.len()
                    + b.ordinals_data.len()
                    + b.weights_data.len()
            })
            .sum();
        header_size + blocks_size
    }

    pub fn iterator(&self) -> BlockSparsePostingIterator<'_> {
        BlockSparsePostingIterator::new(self)
    }

    /// Serialize with skip list header for lazy loading
    ///
    /// Format:
    /// - doc_count: u32
    /// - global_max_weight: f32
    /// - num_blocks: u32
    /// - skip_list: [SparseSkipEntry] × num_blocks (first_doc, last_doc, offset, length, max_weight)
    /// - block_data: concatenated SparseBlock data
    pub fn serialize<W: Write>(&self, w: &mut W) -> io::Result<()> {
        use super::SparseSkipEntry;

        w.write_u32::<LittleEndian>(self.doc_count)?;
        w.write_f32::<LittleEndian>(self.global_max_weight())?;
        w.write_u32::<LittleEndian>(self.blocks.len() as u32)?;

        // First pass: serialize blocks to get their sizes
        let mut block_bytes: Vec<Vec<u8>> = Vec::with_capacity(self.blocks.len());
        for block in &self.blocks {
            let mut buf = Vec::new();
            block.write(&mut buf)?;
            block_bytes.push(buf);
        }

        // Write skip list entries
        let mut offset = 0u32;
        for (block, bytes) in self.blocks.iter().zip(block_bytes.iter()) {
            let first_doc = block.header.first_doc_id;
            let doc_ids = block.decode_doc_ids();
            let last_doc = doc_ids.last().copied().unwrap_or(first_doc);
            let length = bytes.len() as u32;

            let entry =
                SparseSkipEntry::new(first_doc, last_doc, offset, length, block.header.max_weight);
            entry.write(w)?;
            offset += length;
        }

        // Write block data
        for bytes in block_bytes {
            w.write_all(&bytes)?;
        }

        Ok(())
    }

    /// Deserialize fully (loads all blocks into memory)
    /// For lazy loading, use deserialize_header() + load_block()
    pub fn deserialize<R: Read>(r: &mut R) -> io::Result<Self> {
        use super::SparseSkipEntry;

        let doc_count = r.read_u32::<LittleEndian>()?;
        let _global_max_weight = r.read_f32::<LittleEndian>()?;
        let num_blocks = r.read_u32::<LittleEndian>()? as usize;

        // Skip the skip list entries
        for _ in 0..num_blocks {
            let _ = SparseSkipEntry::read(r)?;
        }

        // Read all blocks
        let mut blocks = Vec::with_capacity(num_blocks);
        for _ in 0..num_blocks {
            blocks.push(SparseBlock::read(r)?);
        }
        Ok(Self { doc_count, blocks })
    }

    /// Deserialize only the skip list header (for lazy loading)
    /// Returns (doc_count, global_max_weight, skip_entries, header_size)
    pub fn deserialize_header<R: Read>(
        r: &mut R,
    ) -> io::Result<(u32, f32, Vec<super::SparseSkipEntry>, usize)> {
        use super::SparseSkipEntry;

        let doc_count = r.read_u32::<LittleEndian>()?;
        let global_max_weight = r.read_f32::<LittleEndian>()?;
        let num_blocks = r.read_u32::<LittleEndian>()? as usize;

        let mut entries = Vec::with_capacity(num_blocks);
        for _ in 0..num_blocks {
            entries.push(SparseSkipEntry::read(r)?);
        }

        // Header size: 4 + 4 + 4 + num_blocks * SparseSkipEntry::SIZE
        let header_size = 4 + 4 + 4 + num_blocks * SparseSkipEntry::SIZE;

        Ok((doc_count, global_max_weight, entries, header_size))
    }
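
    // Example of the header-size arithmetic above: with 3 blocks and 20-byte skip
    // entries (five little-endian 4-byte fields, as parsed in
    // `test_zero_copy_merge_patches_first_doc_id`), header_size = 4 + 4 + 4 + 3 * 20 = 72,
    // so the data for block `i` starts at byte `header_size + skip_entries[i].offset`.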

    pub fn decode_all(&self) -> Vec<(DocId, u16, f32)> {
        let total_postings: usize = self.blocks.iter().map(|b| b.header.count as usize).sum();
        let mut result = Vec::with_capacity(total_postings);
        for block in &self.blocks {
            let doc_ids = block.decode_doc_ids();
            let ordinals = block.decode_ordinals();
            let weights = block.decode_weights();
            for i in 0..block.header.count as usize {
                result.push((doc_ids[i], ordinals[i], weights[i]));
            }
        }
        result
    }

    /// Merge multiple posting lists from different segments with doc_id offsets.
    ///
    /// This merge stacks whole blocks without decoding and re-encoding postings, so its
    /// cost is proportional to the number of blocks rather than the number of postings.
    /// Each posting list's blocks have their first_doc_id adjusted by the corresponding offset.
    ///
    /// # Arguments
    /// * `lists` - Slice of (posting_list, doc_offset) pairs from each segment
    ///
    /// # Returns
    /// A new posting list with all blocks concatenated and doc_ids remapped
    pub fn merge_with_offsets(lists: &[(&BlockSparsePostingList, u32)]) -> Self {
        if lists.is_empty() {
            return Self {
                doc_count: 0,
                blocks: Vec::new(),
            };
        }

        // Pre-calculate total capacity
        let total_blocks: usize = lists.iter().map(|(pl, _)| pl.blocks.len()).sum();
        let total_docs: u32 = lists.iter().map(|(pl, _)| pl.doc_count).sum();

        let mut merged_blocks = Vec::with_capacity(total_blocks);

        // Stack blocks from each segment with doc_id offset adjustment
        for (posting_list, doc_offset) in lists {
            for block in &posting_list.blocks {
                merged_blocks.push(block.with_doc_offset(*doc_offset));
            }
        }

        Self {
            doc_count: total_docs,
            blocks: merged_blocks,
        }
    }

    fn find_block(&self, target: DocId) -> Option<usize> {
        if self.blocks.is_empty() {
            return None;
        }
        // Binary search on first_doc_id: find the last block whose first_doc_id <= target.
        // O(log N) header comparisons — no block decode needed.
        let idx = self
            .blocks
            .partition_point(|b| b.header.first_doc_id <= target);
        if idx == 0 {
            // target < first_doc_id of block 0 — return block 0 so caller can check
            Some(0)
        } else {
            Some(idx - 1)
        }
    }
}
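
// Illustrative end-to-end usage of the API above (a sketch, not a doctest; it mirrors
// what the unit tests at the bottom of this file exercise):
//
//     let postings: Vec<(DocId, u16, f32)> = vec![(1, 0, 0.5), (10, 0, 1.5), (100, 1, 2.5)];
//     let list = BlockSparsePostingList::from_postings(&postings, WeightQuantization::Float32)?;
//
//     let mut buf = Vec::new();
//     list.serialize(&mut buf)?;
//     let loaded = BlockSparsePostingList::deserialize(&mut Cursor::new(&buf))?;
//
//     let mut it = loaded.iterator();
//     while it.doc() != TERMINATED {
//         let (doc, weight) = (it.doc(), it.weight());
//         // ... score `doc` using `weight` ...
//         it.advance();
//     }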

// ============================================================================
// Iterator
// ============================================================================

pub struct BlockSparsePostingIterator<'a> {
    posting_list: &'a BlockSparsePostingList,
    block_idx: usize,
    in_block_idx: usize,
    current_doc_ids: Vec<DocId>,
    current_ordinals: Vec<u16>,
    current_weights: Vec<f32>,
    exhausted: bool,
}

impl<'a> BlockSparsePostingIterator<'a> {
    fn new(posting_list: &'a BlockSparsePostingList) -> Self {
        let mut iter = Self {
            posting_list,
            block_idx: 0,
            in_block_idx: 0,
            current_doc_ids: Vec::new(),
            current_ordinals: Vec::new(),
            current_weights: Vec::new(),
            exhausted: posting_list.blocks.is_empty(),
        };
        if !iter.exhausted {
            iter.load_block(0);
        }
        iter
    }

    fn load_block(&mut self, block_idx: usize) {
        if let Some(block) = self.posting_list.blocks.get(block_idx) {
            block.decode_doc_ids_into(&mut self.current_doc_ids);
            block.decode_ordinals_into(&mut self.current_ordinals);
            block.decode_weights_into(&mut self.current_weights);
            self.block_idx = block_idx;
            self.in_block_idx = 0;
        }
    }

    pub fn doc(&self) -> DocId {
        if self.exhausted {
            TERMINATED
        } else {
            self.current_doc_ids
                .get(self.in_block_idx)
                .copied()
                .unwrap_or(TERMINATED)
        }
    }

    pub fn weight(&self) -> f32 {
        self.current_weights
            .get(self.in_block_idx)
            .copied()
            .unwrap_or(0.0)
    }

    pub fn ordinal(&self) -> u16 {
        self.current_ordinals
            .get(self.in_block_idx)
            .copied()
            .unwrap_or(0)
    }

    pub fn advance(&mut self) -> DocId {
        if self.exhausted {
            return TERMINATED;
        }
        self.in_block_idx += 1;
        if self.in_block_idx >= self.current_doc_ids.len() {
            self.block_idx += 1;
            if self.block_idx >= self.posting_list.blocks.len() {
                self.exhausted = true;
            } else {
                self.load_block(self.block_idx);
            }
        }
        self.doc()
    }

    pub fn seek(&mut self, target: DocId) -> DocId {
        if self.exhausted {
            return TERMINATED;
        }
        if self.doc() >= target {
            return self.doc();
        }

        // Check current block — binary search within decoded doc_ids
        if let Some(&last_doc) = self.current_doc_ids.last()
            && last_doc >= target
        {
            let remaining = &self.current_doc_ids[self.in_block_idx..];
            let pos = crate::structures::simd::find_first_ge_u32(remaining, target);
            self.in_block_idx += pos;
            if self.in_block_idx >= self.current_doc_ids.len() {
                self.block_idx += 1;
                if self.block_idx >= self.posting_list.blocks.len() {
                    self.exhausted = true;
                } else {
                    self.load_block(self.block_idx);
                }
            }
            return self.doc();
        }

        // Find correct block
        if let Some(block_idx) = self.posting_list.find_block(target) {
            self.load_block(block_idx);
            let pos = crate::structures::simd::find_first_ge_u32(&self.current_doc_ids, target);
            self.in_block_idx = pos;
            if self.in_block_idx >= self.current_doc_ids.len() {
                self.block_idx += 1;
                if self.block_idx >= self.posting_list.blocks.len() {
                    self.exhausted = true;
                } else {
                    self.load_block(self.block_idx);
                }
            }
        } else {
            self.exhausted = true;
        }
        self.doc()
    }

    /// Skip to the start of the next block, returning its first doc_id.
    /// Used by block-max WAND to skip entire blocks that can't beat threshold.
    pub fn skip_to_next_block(&mut self) -> DocId {
        if self.exhausted {
            return TERMINATED;
        }
        let next = self.block_idx + 1;
        if next >= self.posting_list.blocks.len() {
            self.exhausted = true;
            return TERMINATED;
        }
        self.load_block(next);
        self.doc()
    }

    pub fn is_exhausted(&self) -> bool {
        self.exhausted
    }

    pub fn current_block_max_weight(&self) -> f32 {
        self.posting_list
            .blocks
            .get(self.block_idx)
            .map(|b| b.header.max_weight)
            .unwrap_or(0.0)
    }

    pub fn current_block_max_contribution(&self, query_weight: f32) -> f32 {
        query_weight * self.current_block_max_weight()
    }
}
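
// Sketch of how a block-max WAND style caller might use the block-level metadata.
// `query_weight` and `threshold` are assumed to come from the caller's scorer state;
// this loop is illustrative only and is not an API provided by this module:
//
//     while !it.is_exhausted() {
//         if it.current_block_max_contribution(query_weight) <= threshold {
//             // The whole block cannot beat the current threshold: skip it.
//             it.skip_to_next_block();
//             continue;
//         }
//         let score = query_weight * it.weight();
//         // ... update top-k heap and threshold ...
//         it.advance();
//     }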

// ============================================================================
// Bit-packing utilities
// ============================================================================

fn find_optimal_bit_width(values: &[u32]) -> u8 {
    if values.is_empty() {
        return 0;
    }
    let max_val = values.iter().copied().max().unwrap_or(0);
    simd::bits_needed(max_val)
}

fn bits_needed_u16(val: u16) -> u8 {
    if val == 0 {
        0
    } else {
        16 - val.leading_zeros() as u8
    }
}

fn pack_bit_array(values: &[u32], bits: u8) -> Vec<u8> {
    if bits == 0 || values.is_empty() {
        return Vec::new();
    }
    let total_bytes = (values.len() * bits as usize).div_ceil(8);
    let mut result = vec![0u8; total_bytes];
    let mut bit_pos = 0usize;
    for &val in values {
        pack_value(&mut result, bit_pos, val & ((1u32 << bits) - 1), bits);
        bit_pos += bits as usize;
    }
    result
}

fn pack_bit_array_u16(values: &[u16], bits: u8) -> Vec<u8> {
    if bits == 0 || values.is_empty() {
        return Vec::new();
    }
    let total_bytes = (values.len() * bits as usize).div_ceil(8);
    let mut result = vec![0u8; total_bytes];
    let mut bit_pos = 0usize;
    for &val in values {
        pack_value(
            &mut result,
            bit_pos,
            (val as u32) & ((1u32 << bits) - 1),
            bits,
        );
        bit_pos += bits as usize;
    }
    result
}

#[inline]
fn pack_value(data: &mut [u8], bit_pos: usize, val: u32, bits: u8) {
    let mut remaining = bits as usize;
    let mut val = val;
    let mut byte = bit_pos / 8;
    let mut offset = bit_pos % 8;
    while remaining > 0 {
        let space = 8 - offset;
        let to_write = remaining.min(space);
        let mask = (1u32 << to_write) - 1;
        data[byte] |= ((val & mask) as u8) << offset;
        val >>= to_write;
        remaining -= to_write;
        byte += 1;
        offset = 0;
    }
}

fn unpack_bit_array(data: &[u8], bits: u8, count: usize) -> Vec<u32> {
    if bits == 0 || count == 0 {
        return vec![0; count];
    }
    let mut result = Vec::with_capacity(count);
    let mut bit_pos = 0usize;
    for _ in 0..count {
        result.push(unpack_value(data, bit_pos, bits));
        bit_pos += bits as usize;
    }
    result
}

fn unpack_bit_array_u16_into(data: &[u8], bits: u8, count: usize, out: &mut Vec<u16>) {
    if bits == 0 || count == 0 {
        out.resize(count, 0u16);
        return;
    }
    let mut bit_pos = 0usize;
    for _ in 0..count {
        out.push(unpack_value(data, bit_pos, bits) as u16);
        bit_pos += bits as usize;
    }
}

#[inline]
fn unpack_value(data: &[u8], bit_pos: usize, bits: u8) -> u32 {
    let mut val = 0u32;
    let mut remaining = bits as usize;
    let mut byte = bit_pos / 8;
    let mut offset = bit_pos % 8;
    let mut shift = 0;
    while remaining > 0 {
        let space = 8 - offset;
        let to_read = remaining.min(space);
        // Build the mask in u32: `to_read` can be 8, and `1u8 << 8` would overflow.
        let mask = (1u32 << to_read) - 1;
        let byte_val = data.get(byte).copied().unwrap_or(0) as u32;
        val |= ((byte_val >> offset) & mask) << shift;
        remaining -= to_read;
        shift += to_read;
        byte += 1;
        offset = 0;
    }
    val
}
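
// Bit-packing example: the values [5, 5, 80] at 7 bits each occupy 21 bits across
// 3 bytes. pack_value writes 5 (0b0000101) into bits 0..7, the next 5 into bits
// 7..14 (straddling bytes 0 and 1), and 80 (0b1010000) into bits 14..21;
// unpack_value reassembles each value from the partial bytes in LSB-first order.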

// ============================================================================
// Weight encoding/decoding
// ============================================================================

fn encode_weights(weights: &[f32], quant: WeightQuantization) -> io::Result<Vec<u8>> {
    let mut data = Vec::new();
    match quant {
        WeightQuantization::Float32 => {
            for &w in weights {
                data.write_f32::<LittleEndian>(w)?;
            }
        }
        WeightQuantization::Float16 => {
            use half::f16;
            for &w in weights {
                data.write_u16::<LittleEndian>(f16::from_f32(w).to_bits())?;
            }
        }
        WeightQuantization::UInt8 => {
            let min = weights.iter().copied().fold(f32::INFINITY, f32::min);
            let max = weights.iter().copied().fold(f32::NEG_INFINITY, f32::max);
            let range = max - min;
            let scale = if range < f32::EPSILON {
                1.0
            } else {
                range / 255.0
            };
            data.write_f32::<LittleEndian>(scale)?;
            data.write_f32::<LittleEndian>(min)?;
            for &w in weights {
                data.write_u8(((w - min) / scale).round() as u8)?;
            }
        }
        WeightQuantization::UInt4 => {
            let min = weights.iter().copied().fold(f32::INFINITY, f32::min);
            let max = weights.iter().copied().fold(f32::NEG_INFINITY, f32::max);
            let range = max - min;
            let scale = if range < f32::EPSILON {
                1.0
            } else {
                range / 15.0
            };
            data.write_f32::<LittleEndian>(scale)?;
            data.write_f32::<LittleEndian>(min)?;
            let mut i = 0;
            while i < weights.len() {
                let q1 = ((weights[i] - min) / scale).round() as u8 & 0x0F;
                let q2 = if i + 1 < weights.len() {
                    ((weights[i + 1] - min) / scale).round() as u8 & 0x0F
                } else {
                    0
                };
                data.write_u8((q2 << 4) | q1)?;
                i += 2;
            }
        }
    }
    Ok(data)
}
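
// Quantization example for WeightQuantization::UInt8: weights [0.5, 2.0] give
// min = 0.5, range = 1.5, scale = 1.5 / 255; each weight is stored as
// round((w - min) / scale), so 2.0 encodes to 255 and decodes back to
// 255 * scale + min = 2.0, while 0.5 encodes to 0 and round-trips exactly.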

fn decode_weights_into(data: &[u8], quant: WeightQuantization, count: usize, out: &mut Vec<f32>) {
    let mut cursor = Cursor::new(data);
    match quant {
        WeightQuantization::Float32 => {
            for _ in 0..count {
                out.push(cursor.read_f32::<LittleEndian>().unwrap_or(0.0));
            }
        }
        WeightQuantization::Float16 => {
            use half::f16;
            for _ in 0..count {
                let bits = cursor.read_u16::<LittleEndian>().unwrap_or(0);
                out.push(f16::from_bits(bits).to_f32());
            }
        }
        WeightQuantization::UInt8 => {
            let scale = cursor.read_f32::<LittleEndian>().unwrap_or(1.0);
            let min = cursor.read_f32::<LittleEndian>().unwrap_or(0.0);
            for _ in 0..count {
                let q = cursor.read_u8().unwrap_or(0);
                out.push(q as f32 * scale + min);
            }
        }
        WeightQuantization::UInt4 => {
            let scale = cursor.read_f32::<LittleEndian>().unwrap_or(1.0);
            let min = cursor.read_f32::<LittleEndian>().unwrap_or(0.0);
            let mut i = 0;
            while i < count {
                let byte = cursor.read_u8().unwrap_or(0);
                out.push((byte & 0x0F) as f32 * scale + min);
                i += 1;
                if i < count {
                    out.push((byte >> 4) as f32 * scale + min);
                    i += 1;
                }
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_block_roundtrip() {
        let postings = vec![
            (10u32, 0u16, 1.5f32),
            (15, 0, 2.0),
            (20, 1, 0.5),
            (100, 0, 3.0),
        ];
        let block = SparseBlock::from_postings(&postings, WeightQuantization::Float32).unwrap();

        assert_eq!(block.decode_doc_ids(), vec![10, 15, 20, 100]);
        assert_eq!(block.decode_ordinals(), vec![0, 0, 1, 0]);
        let weights = block.decode_weights();
        assert!((weights[0] - 1.5).abs() < 0.01);
    }

    #[test]
    fn test_posting_list() {
        let postings: Vec<(DocId, u16, f32)> =
            (0..300).map(|i| (i * 2, 0, i as f32 * 0.1)).collect();
        let list =
            BlockSparsePostingList::from_postings(&postings, WeightQuantization::Float32).unwrap();

        assert_eq!(list.doc_count(), 300);
        assert_eq!(list.num_blocks(), 3);

        let mut iter = list.iterator();
        assert_eq!(iter.doc(), 0);
        iter.advance();
        assert_eq!(iter.doc(), 2);
    }

    #[test]
    fn test_serialization() {
        let postings = vec![(1u32, 0u16, 0.5f32), (10, 1, 1.5), (100, 0, 2.5)];
        let list =
            BlockSparsePostingList::from_postings(&postings, WeightQuantization::UInt8).unwrap();

        let mut buf = Vec::new();
        list.serialize(&mut buf).unwrap();
        let list2 = BlockSparsePostingList::deserialize(&mut Cursor::new(&buf)).unwrap();

        assert_eq!(list.doc_count(), list2.doc_count());
    }

    #[test]
    fn test_seek() {
        let postings: Vec<(DocId, u16, f32)> = (0..500).map(|i| (i * 3, 0, i as f32)).collect();
        let list =
            BlockSparsePostingList::from_postings(&postings, WeightQuantization::Float32).unwrap();

        let mut iter = list.iterator();
        assert_eq!(iter.seek(300), 300);
        assert_eq!(iter.seek(301), 303);
        assert_eq!(iter.seek(2000), TERMINATED);
    }

    #[test]
    fn test_merge_with_offsets() {
        // Segment 1: docs 0, 5, 10 with weights
        let postings1: Vec<(DocId, u16, f32)> = vec![(0, 0, 1.0), (5, 0, 2.0), (10, 1, 3.0)];
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();

        // Segment 2: docs 0, 3, 7 with weights (will become 100, 103, 107 after merge)
        let postings2: Vec<(DocId, u16, f32)> = vec![(0, 0, 4.0), (3, 1, 5.0), (7, 0, 6.0)];
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        // Merge with offsets: segment 1 at offset 0, segment 2 at offset 100
        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 100)]);

        assert_eq!(merged.doc_count(), 6);

        // Verify all doc_ids are correct after merge
        let decoded = merged.decode_all();
        assert_eq!(decoded.len(), 6);

        // Segment 1 docs (offset 0)
        assert_eq!(decoded[0].0, 0);
        assert_eq!(decoded[1].0, 5);
        assert_eq!(decoded[2].0, 10);

        // Segment 2 docs (offset 100)
        assert_eq!(decoded[3].0, 100); // 0 + 100
        assert_eq!(decoded[4].0, 103); // 3 + 100
        assert_eq!(decoded[5].0, 107); // 7 + 100

        // Verify weights preserved
        assert!((decoded[0].2 - 1.0).abs() < 0.01);
        assert!((decoded[3].2 - 4.0).abs() < 0.01);

        // Verify ordinals preserved
        assert_eq!(decoded[2].1, 1); // ordinal from segment 1
        assert_eq!(decoded[4].1, 1); // ordinal from segment 2
    }

    #[test]
    fn test_merge_with_offsets_multi_block() {
        // Create posting lists that span multiple blocks
        let postings1: Vec<(DocId, u16, f32)> = (0..200).map(|i| (i * 2, 0, i as f32)).collect();
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();
        assert!(list1.num_blocks() > 1, "Should have multiple blocks");

        let postings2: Vec<(DocId, u16, f32)> = (0..150).map(|i| (i * 3, 1, i as f32)).collect();
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        // Merge with offset 1000 for segment 2
        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 1000)]);

        assert_eq!(merged.doc_count(), 350);
        assert_eq!(merged.num_blocks(), list1.num_blocks() + list2.num_blocks());

        // Verify via iterator
        let mut iter = merged.iterator();

        // First segment docs start at 0
        assert_eq!(iter.doc(), 0);

        // Seek to segment 2 (should be at offset 1000)
        let doc = iter.seek(1000);
        assert_eq!(doc, 1000); // First doc of segment 2: 0 + 1000 = 1000

        // Next doc in segment 2
        iter.advance();
        assert_eq!(iter.doc(), 1003); // 3 + 1000 = 1003
    }

    #[test]
    fn test_merge_with_offsets_serialize_roundtrip() {
        // Verify that serialization preserves adjusted doc_ids
        let postings1: Vec<(DocId, u16, f32)> = vec![(0, 0, 1.0), (5, 0, 2.0), (10, 1, 3.0)];
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();

        let postings2: Vec<(DocId, u16, f32)> = vec![(0, 0, 4.0), (3, 1, 5.0), (7, 0, 6.0)];
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        // Merge with offset 100 for segment 2
        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 100)]);

        // Serialize
        let mut bytes = Vec::new();
        merged.serialize(&mut bytes).unwrap();

        // Deserialize
        let mut cursor = std::io::Cursor::new(&bytes);
        let loaded = BlockSparsePostingList::deserialize(&mut cursor).unwrap();

        // Verify doc_ids are preserved after round-trip
        let decoded = loaded.decode_all();
        assert_eq!(decoded.len(), 6);

        // Segment 1 docs (offset 0)
        assert_eq!(decoded[0].0, 0);
        assert_eq!(decoded[1].0, 5);
        assert_eq!(decoded[2].0, 10);

        // Segment 2 docs (offset 100) - CRITICAL: these must be offset-adjusted
        assert_eq!(decoded[3].0, 100, "First doc of seg2 should be 0+100=100");
        assert_eq!(decoded[4].0, 103, "Second doc of seg2 should be 3+100=103");
        assert_eq!(decoded[5].0, 107, "Third doc of seg2 should be 7+100=107");

        // Verify iterator also works correctly
        let mut iter = loaded.iterator();
        assert_eq!(iter.doc(), 0);
        iter.advance();
        assert_eq!(iter.doc(), 5);
        iter.advance();
        assert_eq!(iter.doc(), 10);
        iter.advance();
        assert_eq!(iter.doc(), 100);
        iter.advance();
        assert_eq!(iter.doc(), 103);
        iter.advance();
        assert_eq!(iter.doc(), 107);
    }

    #[test]
    fn test_merge_seek_after_roundtrip() {
        // Create posting lists that span multiple blocks to test seek after merge
        let postings1: Vec<(DocId, u16, f32)> = (0..200).map(|i| (i * 2, 0, 1.0)).collect();
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();

        let postings2: Vec<(DocId, u16, f32)> = (0..150).map(|i| (i * 3, 0, 2.0)).collect();
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        // Merge with offset 1000 for segment 2
        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 1000)]);

        // Serialize and deserialize (simulating what happens after merge file is written)
        let mut bytes = Vec::new();
        merged.serialize(&mut bytes).unwrap();
        let loaded =
            BlockSparsePostingList::deserialize(&mut std::io::Cursor::new(&bytes)).unwrap();

        // Test seeking to various positions
        let mut iter = loaded.iterator();

        // Seek to doc in segment 1
        let doc = iter.seek(100);
        assert_eq!(doc, 100, "Seek to 100 in segment 1");

        // Seek to doc in segment 2 (1000 + offset)
        let doc = iter.seek(1000);
        assert_eq!(doc, 1000, "Seek to 1000 (first doc of segment 2)");

        // Seek to middle of segment 2
        let doc = iter.seek(1050);
        assert!(
            doc >= 1050,
            "Seek to 1050 should find doc >= 1050, got {}",
            doc
        );

        // Seek backwards should stay at current position (seek only goes forward)
        let doc = iter.seek(500);
        assert!(
            doc >= 1050,
            "Seek backwards should not go back, got {}",
            doc
        );

        // Fresh iterator - verify block boundaries work
        let mut iter2 = loaded.iterator();

        // Verify we can iterate through all docs
        let mut count = 0;
        let mut prev_doc = 0;
        while iter2.doc() != super::TERMINATED {
            let current = iter2.doc();
            if count > 0 {
                assert!(
                    current > prev_doc,
                    "Docs should be monotonically increasing: {} vs {}",
                    prev_doc,
                    current
                );
            }
            prev_doc = current;
            iter2.advance();
            count += 1;
        }
        assert_eq!(count, 350, "Should have 350 total docs");
    }

    #[test]
    fn test_doc_count_multi_value() {
        // Multi-value: same doc_id with different ordinals
        // doc 0 has 3 ordinals, doc 5 has 2, doc 10 has 1 = 3 unique docs
        let postings: Vec<(DocId, u16, f32)> = vec![
            (0, 0, 1.0),
            (0, 1, 1.5),
            (0, 2, 2.0),
            (5, 0, 3.0),
            (5, 1, 3.5),
            (10, 0, 4.0),
        ];
        let list =
            BlockSparsePostingList::from_postings(&postings, WeightQuantization::Float32).unwrap();

        // doc_count should be 3 (unique docs), not 6 (total postings)
        assert_eq!(list.doc_count(), 3);

        // But we should still have all 6 postings accessible
        let decoded = list.decode_all();
        assert_eq!(decoded.len(), 6);
    }

    /// Test the zero-copy merge path used by the actual sparse merger:
    /// serialize → parse raw skip entries + block data → patch first_doc_id → reassemble.
    /// This is the exact code path in `segment/merger/sparse_vectors.rs`.
    #[test]
    fn test_zero_copy_merge_patches_first_doc_id() {
        use crate::structures::SparseSkipEntry;

        // Build two multi-block posting lists
        let postings1: Vec<(DocId, u16, f32)> = (0..200).map(|i| (i * 2, 0, i as f32)).collect();
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();
        assert!(list1.num_blocks() > 1);

        let postings2: Vec<(DocId, u16, f32)> = (0..150).map(|i| (i * 3, 1, i as f32)).collect();
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        // Serialize both (this is what the builder writes to disk)
        let mut bytes1 = Vec::new();
        list1.serialize(&mut bytes1).unwrap();
        let mut bytes2 = Vec::new();
        list2.serialize(&mut bytes2).unwrap();

        // --- Simulate read_dim_raw: parse header + skip entries, extract raw block data ---
        fn parse_raw(data: &[u8]) -> (u32, f32, Vec<SparseSkipEntry>, &[u8]) {
            let doc_count = u32::from_le_bytes(data[0..4].try_into().unwrap());
            let global_max = f32::from_le_bytes(data[4..8].try_into().unwrap());
            let num_blocks = u32::from_le_bytes(data[8..12].try_into().unwrap()) as usize;
            let mut pos = 12;
            let mut skip = Vec::new();
            for _ in 0..num_blocks {
                let first_doc = u32::from_le_bytes(data[pos..pos + 4].try_into().unwrap());
                let last_doc = u32::from_le_bytes(data[pos + 4..pos + 8].try_into().unwrap());
                let offset = u32::from_le_bytes(data[pos + 8..pos + 12].try_into().unwrap());
                let length = u32::from_le_bytes(data[pos + 12..pos + 16].try_into().unwrap());
                let max_w = f32::from_le_bytes(data[pos + 16..pos + 20].try_into().unwrap());
                skip.push(SparseSkipEntry::new(
                    first_doc, last_doc, offset, length, max_w,
                ));
                pos += 20;
            }
            (doc_count, global_max, skip, &data[pos..])
        }

        let (dc1, gm1, skip1, raw1) = parse_raw(&bytes1);
        let (dc2, gm2, skip2, raw2) = parse_raw(&bytes2);

        // --- Simulate the merger's zero-copy reassembly ---
        let doc_offset: u32 = 1000; // segment 2 starts at doc 1000
        let total_docs = dc1 + dc2;
        let global_max = gm1.max(gm2);
        let total_blocks = (skip1.len() + skip2.len()) as u32;

        let mut output = Vec::new();
        // Write header
        output.extend_from_slice(&total_docs.to_le_bytes());
        output.extend_from_slice(&global_max.to_le_bytes());
        output.extend_from_slice(&total_blocks.to_le_bytes());

        // Write adjusted skip entries
        let mut block_data_offset = 0u32;
        for entry in &skip1 {
            let adjusted = SparseSkipEntry::new(
                entry.first_doc,
                entry.last_doc,
                block_data_offset + entry.offset,
                entry.length,
                entry.max_weight,
            );
            adjusted.write(&mut output).unwrap();
        }
        if let Some(last) = skip1.last() {
            block_data_offset += last.offset + last.length;
        }
        for entry in &skip2 {
            let adjusted = SparseSkipEntry::new(
                entry.first_doc + doc_offset,
                entry.last_doc + doc_offset,
                block_data_offset + entry.offset,
                entry.length,
                entry.max_weight,
            );
            adjusted.write(&mut output).unwrap();
        }

        // Write raw block data: source 1 verbatim, source 2 with first_doc_id patched
        output.extend_from_slice(raw1);

        const FIRST_DOC_ID_OFFSET: usize = 8;
        let mut buf2 = raw2.to_vec();
        for entry in &skip2 {
            let off = entry.offset as usize + FIRST_DOC_ID_OFFSET;
            if off + 4 <= buf2.len() {
                let old = u32::from_le_bytes(buf2[off..off + 4].try_into().unwrap());
                let patched = (old + doc_offset).to_le_bytes();
                buf2[off..off + 4].copy_from_slice(&patched);
            }
        }
        output.extend_from_slice(&buf2);

        // --- Deserialize the reassembled posting list and verify ---
        let loaded = BlockSparsePostingList::deserialize(&mut Cursor::new(&output)).unwrap();
        assert_eq!(loaded.doc_count(), 350);

        let mut iter = loaded.iterator();

        // Segment 1: docs 0, 2, 4, ..., 398
        assert_eq!(iter.doc(), 0);
        let doc = iter.seek(100);
        assert_eq!(doc, 100);
        let doc = iter.seek(398);
        assert_eq!(doc, 398);

        // Segment 2: docs 1000, 1003, 1006, ..., 1000 + 149*3 = 1447
        let doc = iter.seek(1000);
        assert_eq!(doc, 1000, "First doc of segment 2 should be 1000");
        iter.advance();
        assert_eq!(iter.doc(), 1003, "Second doc of segment 2 should be 1003");
        let doc = iter.seek(1447);
        assert_eq!(doc, 1447, "Last doc of segment 2 should be 1447");

        // Exhausted
        iter.advance();
        assert_eq!(iter.doc(), super::TERMINATED);

        // Also verify with merge_with_offsets to confirm identical results
        let reference =
            BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, doc_offset)]);
        let mut ref_iter = reference.iterator();
        let mut zc_iter = loaded.iterator();
        while ref_iter.doc() != super::TERMINATED {
            assert_eq!(
                ref_iter.doc(),
                zc_iter.doc(),
                "Zero-copy and reference merge should produce identical doc_ids"
            );
            assert!(
                (ref_iter.weight() - zc_iter.weight()).abs() < 0.01,
                "Weights should match: {} vs {}",
                ref_iter.weight(),
                zc_iter.weight()
            );
            ref_iter.advance();
            zc_iter.advance();
        }
        assert_eq!(zc_iter.doc(), super::TERMINATED);
    }

    #[test]
    fn test_doc_count_single_value() {
        // Single-value: each doc_id appears once (ordinal always 0)
        let postings: Vec<(DocId, u16, f32)> =
            vec![(0, 0, 1.0), (5, 0, 2.0), (10, 0, 3.0), (15, 0, 4.0)];
        let list =
            BlockSparsePostingList::from_postings(&postings, WeightQuantization::Float32).unwrap();

        // doc_count == total postings for single-value
        assert_eq!(list.doc_count(), 4);
    }

    #[test]
    fn test_doc_count_multi_value_serialization_roundtrip() {
        // Verify doc_count survives serialization
        let postings: Vec<(DocId, u16, f32)> =
            vec![(0, 0, 1.0), (0, 1, 1.5), (5, 0, 2.0), (5, 1, 2.5)];
        let list =
            BlockSparsePostingList::from_postings(&postings, WeightQuantization::Float32).unwrap();
        assert_eq!(list.doc_count(), 2);

        let mut buf = Vec::new();
        list.serialize(&mut buf).unwrap();
        let loaded = BlockSparsePostingList::deserialize(&mut Cursor::new(&buf)).unwrap();
        assert_eq!(loaded.doc_count(), 2);
    }

    #[test]
    fn test_merge_preserves_weights_and_ordinals() {
        // Test that weights and ordinals are preserved after merge + roundtrip
        let postings1: Vec<(DocId, u16, f32)> = vec![(0, 0, 1.5), (5, 1, 2.5), (10, 2, 3.5)];
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();

        let postings2: Vec<(DocId, u16, f32)> = vec![(0, 0, 4.5), (3, 1, 5.5), (7, 3, 6.5)];
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        // Merge with offset 100 for segment 2
        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 100)]);

        // Serialize and deserialize
        let mut bytes = Vec::new();
        merged.serialize(&mut bytes).unwrap();
        let loaded =
            BlockSparsePostingList::deserialize(&mut std::io::Cursor::new(&bytes)).unwrap();

        // Verify all postings via iterator
        let mut iter = loaded.iterator();

        // Segment 1 postings
        assert_eq!(iter.doc(), 0);
        assert!(
            (iter.weight() - 1.5).abs() < 0.01,
            "Weight should be 1.5, got {}",
            iter.weight()
        );
        assert_eq!(iter.ordinal(), 0);

        iter.advance();
        assert_eq!(iter.doc(), 5);
        assert!(
            (iter.weight() - 2.5).abs() < 0.01,
            "Weight should be 2.5, got {}",
            iter.weight()
        );
        assert_eq!(iter.ordinal(), 1);

        iter.advance();
        assert_eq!(iter.doc(), 10);
        assert!(
            (iter.weight() - 3.5).abs() < 0.01,
            "Weight should be 3.5, got {}",
            iter.weight()
        );
        assert_eq!(iter.ordinal(), 2);

        // Segment 2 postings (with offset 100)
        iter.advance();
        assert_eq!(iter.doc(), 100);
        assert!(
            (iter.weight() - 4.5).abs() < 0.01,
            "Weight should be 4.5, got {}",
            iter.weight()
        );
        assert_eq!(iter.ordinal(), 0);

        iter.advance();
        assert_eq!(iter.doc(), 103);
        assert!(
            (iter.weight() - 5.5).abs() < 0.01,
            "Weight should be 5.5, got {}",
            iter.weight()
        );
        assert_eq!(iter.ordinal(), 1);

        iter.advance();
        assert_eq!(iter.doc(), 107);
        assert!(
            (iter.weight() - 6.5).abs() < 0.01,
            "Weight should be 6.5, got {}",
            iter.weight()
        );
        assert_eq!(iter.ordinal(), 3);

        // Verify exhausted
        iter.advance();
        assert_eq!(iter.doc(), super::TERMINATED);
    }

    #[test]
    fn test_merge_global_max_weight() {
        // Verify global_max_weight is correct after merge
        let postings1: Vec<(DocId, u16, f32)> = vec![
            (0, 0, 3.0),
            (1, 0, 7.0), // max in segment 1
            (2, 0, 2.0),
        ];
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();

        let postings2: Vec<(DocId, u16, f32)> = vec![
            (0, 0, 5.0),
            (1, 0, 4.0),
            (2, 0, 6.0), // max in segment 2
        ];
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        // Verify original global max weights
        assert!((list1.global_max_weight() - 7.0).abs() < 0.01);
        assert!((list2.global_max_weight() - 6.0).abs() < 0.01);

        // Merge
        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 100)]);

        // Global max should be 7.0 (from segment 1)
        assert!(
            (merged.global_max_weight() - 7.0).abs() < 0.01,
            "Global max should be 7.0, got {}",
            merged.global_max_weight()
        );

        // Roundtrip
        let mut bytes = Vec::new();
        merged.serialize(&mut bytes).unwrap();
        let loaded =
            BlockSparsePostingList::deserialize(&mut std::io::Cursor::new(&bytes)).unwrap();

        assert!(
            (loaded.global_max_weight() - 7.0).abs() < 0.01,
            "After roundtrip, global max should still be 7.0, got {}",
            loaded.global_max_weight()
        );
    }

    #[test]
    fn test_scoring_simulation_after_merge() {
        // Simulate what SparseTermScorer does - compute query_weight * stored_weight
        let postings1: Vec<(DocId, u16, f32)> = vec![
            (0, 0, 0.5), // doc 0, weight 0.5
            (5, 0, 0.8), // doc 5, weight 0.8
        ];
        let list1 =
            BlockSparsePostingList::from_postings(&postings1, WeightQuantization::Float32).unwrap();

        let postings2: Vec<(DocId, u16, f32)> = vec![
            (0, 0, 0.6), // doc 100 after offset, weight 0.6
            (3, 0, 0.9), // doc 103 after offset, weight 0.9
        ];
        let list2 =
            BlockSparsePostingList::from_postings(&postings2, WeightQuantization::Float32).unwrap();

        // Merge with offset 100
        let merged = BlockSparsePostingList::merge_with_offsets(&[(&list1, 0), (&list2, 100)]);

        // Roundtrip
        let mut bytes = Vec::new();
        merged.serialize(&mut bytes).unwrap();
        let loaded =
            BlockSparsePostingList::deserialize(&mut std::io::Cursor::new(&bytes)).unwrap();

        // Simulate scoring with query_weight = 2.0
        let query_weight = 2.0f32;
        let mut iter = loaded.iterator();

        // Expected scores: query_weight * stored_weight
        // Doc 0: 2.0 * 0.5 = 1.0
        assert_eq!(iter.doc(), 0);
        let score = query_weight * iter.weight();
        assert!(
            (score - 1.0).abs() < 0.01,
            "Doc 0 score should be 1.0, got {}",
            score
        );

        iter.advance();
        // Doc 5: 2.0 * 0.8 = 1.6
        assert_eq!(iter.doc(), 5);
        let score = query_weight * iter.weight();
        assert!(
            (score - 1.6).abs() < 0.01,
            "Doc 5 score should be 1.6, got {}",
            score
        );

        iter.advance();
        // Doc 100: 2.0 * 0.6 = 1.2
        assert_eq!(iter.doc(), 100);
        let score = query_weight * iter.weight();
        assert!(
            (score - 1.2).abs() < 0.01,
            "Doc 100 score should be 1.2, got {}",
            score
        );

        iter.advance();
        // Doc 103: 2.0 * 0.9 = 1.8
        assert_eq!(iter.doc(), 103);
        let score = query_weight * iter.weight();
        assert!(
            (score - 1.8).abs() < 0.01,
            "Doc 103 score should be 1.8, got {}",
            score
        );
    }
}