1use byteorder::{LittleEndian, WriteBytesExt};
9use std::io::{self, Read, Write};
10
11use super::posting_common::{read_vint, write_vint};
12use crate::DocId;
13use crate::directories::OwnedBytes;
14use crate::structures::simd;
15
/// A single posting: one (document id, term frequency) pair.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Posting {
    pub doc_id: DocId,
    pub term_freq: u32,
}
22
/// An uncompressed, in-memory posting list, kept sorted by ascending doc id.
#[derive(Debug, Clone, Default)]
pub struct PostingList {
    // Invariant: strictly increasing `doc_id` (enforced in `push`, maintained by `add`).
    postings: Vec<Posting>,
}
28
29impl PostingList {
30 pub fn new() -> Self {
31 Self::default()
32 }
33
34 pub fn with_capacity(capacity: usize) -> Self {
35 Self {
36 postings: Vec::with_capacity(capacity),
37 }
38 }
39
40 pub fn push(&mut self, doc_id: DocId, term_freq: u32) {
42 debug_assert!(
43 self.postings.is_empty() || self.postings.last().unwrap().doc_id < doc_id,
44 "Postings must be added in sorted order"
45 );
46 self.postings.push(Posting { doc_id, term_freq });
47 }
48
49 pub fn add(&mut self, doc_id: DocId, term_freq: u32) {
51 if let Some(last) = self.postings.last_mut()
52 && last.doc_id == doc_id
53 {
54 last.term_freq += term_freq;
55 return;
56 }
57 self.postings.push(Posting { doc_id, term_freq });
58 }
59
60 pub fn doc_count(&self) -> u32 {
62 self.postings.len() as u32
63 }
64
65 pub fn len(&self) -> usize {
66 self.postings.len()
67 }
68
69 pub fn is_empty(&self) -> bool {
70 self.postings.is_empty()
71 }
72
73 pub fn iter(&self) -> impl Iterator<Item = &Posting> {
74 self.postings.iter()
75 }
76
77 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
79 write_vint(writer, self.postings.len() as u64)?;
81
82 let mut prev_doc_id = 0u32;
83 for posting in &self.postings {
84 let delta = posting.doc_id - prev_doc_id;
86 write_vint(writer, delta as u64)?;
87 write_vint(writer, posting.term_freq as u64)?;
88 prev_doc_id = posting.doc_id;
89 }
90
91 Ok(())
92 }
93
94 pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
96 let count = read_vint(reader)? as usize;
97 let mut postings = Vec::with_capacity(count);
98
99 let mut prev_doc_id = 0u32;
100 for _ in 0..count {
101 let delta = read_vint(reader)? as u32;
102 let term_freq = read_vint(reader)? as u32;
103 let doc_id = prev_doc_id + delta;
104 postings.push(Posting { doc_id, term_freq });
105 prev_doc_id = doc_id;
106 }
107
108 Ok(Self { postings })
109 }
110}
111
/// Cursor over an uncompressed `PostingList` with doc/advance/seek access.
pub struct PostingListIterator<'a> {
    postings: &'a [Posting],
    // Index of the current posting; `>= postings.len()` means exhausted.
    position: usize,
}
117
118impl<'a> PostingListIterator<'a> {
119 pub fn new(posting_list: &'a PostingList) -> Self {
120 Self {
121 postings: &posting_list.postings,
122 position: 0,
123 }
124 }
125
126 pub fn doc(&self) -> DocId {
128 if self.position < self.postings.len() {
129 self.postings[self.position].doc_id
130 } else {
131 TERMINATED
132 }
133 }
134
135 pub fn term_freq(&self) -> u32 {
137 if self.position < self.postings.len() {
138 self.postings[self.position].term_freq
139 } else {
140 0
141 }
142 }
143
144 pub fn advance(&mut self) -> DocId {
146 self.position += 1;
147 self.doc()
148 }
149
150 pub fn seek(&mut self, target: DocId) -> DocId {
152 let remaining = &self.postings[self.position..];
153 let offset = remaining.partition_point(|p| p.doc_id < target);
154 self.position += offset;
155 self.doc()
156 }
157
158 pub fn size_hint(&self) -> usize {
160 self.postings.len().saturating_sub(self.position)
161 }
162}
163
/// Sentinel doc id returned by iterators once they are exhausted.
pub const TERMINATED: DocId = DocId::MAX;

/// Maximum number of postings packed into a single compressed block.
pub const BLOCK_SIZE: usize = 128;

/// One L1 skip entry is emitted for every `L1_INTERVAL` L0 entries (blocks).
const L1_INTERVAL: usize = 8;

/// Byte size of one L0 skip entry:
/// first_doc (u32) + last_doc (u32) + stream offset (u32) + max weight (f32).
const L0_SIZE: usize = 16;

/// Byte size of one L1 skip entry (the last doc id of the block group, u32).
const L1_SIZE: usize = 4;

/// Byte size of the serialized footer:
/// stream_len (u64) + l0_count (u32) + l1_count (u32) + doc_count (u32) + max_tf (u32).
const FOOTER_SIZE: usize = 24;
189
190#[inline]
194fn read_l0(bytes: &[u8], idx: usize) -> (u32, u32, u32, f32) {
195 let b = &bytes[idx * L0_SIZE..][..L0_SIZE];
196 let first_doc = u32::from_le_bytes([b[0], b[1], b[2], b[3]]);
197 let last_doc = u32::from_le_bytes([b[4], b[5], b[6], b[7]]);
198 let offset = u32::from_le_bytes([b[8], b[9], b[10], b[11]]);
199 let max_weight = f32::from_le_bytes([b[12], b[13], b[14], b[15]]);
200 (first_doc, last_doc, offset, max_weight)
201}
202
/// Appends one 16-byte L0 skip entry to `buf`: first/last doc id, the block's
/// byte offset in the stream, and the block's maximum weight — all little-endian.
#[inline]
fn write_l0(buf: &mut Vec<u8>, first_doc: u32, last_doc: u32, offset: u32, max_weight: f32) {
    for word in [first_doc, last_doc, offset] {
        buf.extend_from_slice(&word.to_le_bytes());
    }
    buf.extend_from_slice(&max_weight.to_le_bytes());
}
211
212#[inline]
217fn block_data_size(stream: &[u8], pos: usize) -> usize {
218 let count = u16::from_le_bytes(stream[pos..pos + 2].try_into().unwrap()) as usize;
219 let doc_rounded = simd::RoundedBitWidth::from_u8(stream[pos + 6]);
220 let tf_rounded = simd::RoundedBitWidth::from_u8(stream[pos + 7]);
221 let delta_bytes = if count > 1 {
222 (count - 1) * doc_rounded.bytes_per_value()
223 } else {
224 0
225 };
226 8 + delta_bytes + count * tf_rounded.bytes_per_value()
227}
228
/// A block-compressed posting list with two-level skip data.
#[derive(Debug, Clone)]
pub struct BlockPostingList {
    // Concatenated compressed blocks: per block an 8-byte header followed by
    // bit-packed doc-id deltas and bit-packed term frequencies.
    stream: OwnedBytes,
    // Packed L0 skip entries, one 16-byte entry per block
    // (first doc, last doc, stream offset, max weight).
    l0_bytes: OwnedBytes,
    // Number of blocks (= number of L0 entries).
    l0_count: usize,
    // L1 skip entries: the last doc id of each group of blocks.
    l1_docs: Vec<u32>,
    // Total number of postings across all blocks.
    doc_count: u32,
    // Maximum term frequency over all postings.
    max_tf: u32,
}
246
247impl BlockPostingList {
248 #[inline]
250 fn read_l0_entry(&self, idx: usize) -> (u32, u32, u32, f32) {
251 read_l0(&self.l0_bytes, idx)
252 }
253
254 pub fn from_posting_list(list: &PostingList) -> io::Result<Self> {
263 let mut stream: Vec<u8> = Vec::new();
264 let mut l0_buf: Vec<u8> = Vec::new();
265 let mut l1_docs: Vec<u32> = Vec::new();
266 let mut l0_count = 0usize;
267 let mut max_tf = 0u32;
268
269 let postings = &list.postings;
270 let mut i = 0;
271
272 let mut deltas = Vec::with_capacity(BLOCK_SIZE);
274 let mut tf_buf = Vec::with_capacity(BLOCK_SIZE);
275
276 while i < postings.len() {
277 if stream.len() > u32::MAX as usize {
278 return Err(io::Error::new(
279 io::ErrorKind::InvalidData,
280 "posting list stream exceeds u32::MAX bytes",
281 ));
282 }
283 let block_start = stream.len() as u32;
284 let block_end = (i + BLOCK_SIZE).min(postings.len());
285 let block = &postings[i..block_end];
286 let count = block.len();
287
288 let block_max_tf = block.iter().map(|p| p.term_freq).max().unwrap_or(0);
290 max_tf = max_tf.max(block_max_tf);
291
292 let base_doc_id = block.first().unwrap().doc_id;
293 let last_doc_id = block.last().unwrap().doc_id;
294
295 deltas.clear();
297 let mut prev = base_doc_id;
298 for posting in block.iter().skip(1) {
299 deltas.push(posting.doc_id - prev);
300 prev = posting.doc_id;
301 }
302 let max_delta = deltas.iter().copied().max().unwrap_or(0);
303 let doc_id_bits = simd::round_bit_width(simd::bits_needed(max_delta));
304
305 tf_buf.clear();
307 tf_buf.extend(block.iter().map(|p| p.term_freq));
308 let tf_bits = simd::round_bit_width(simd::bits_needed(block_max_tf));
309
310 stream.write_u16::<LittleEndian>(count as u16)?;
312 stream.write_u32::<LittleEndian>(base_doc_id)?;
313 stream.push(doc_id_bits);
314 stream.push(tf_bits);
315
316 if count > 1 {
318 let rounded = simd::RoundedBitWidth::from_u8(doc_id_bits);
319 let byte_count = (count - 1) * rounded.bytes_per_value();
320 let start = stream.len();
321 stream.resize(start + byte_count, 0);
322 simd::pack_rounded(&deltas, rounded, &mut stream[start..]);
323 }
324
325 {
327 let rounded = simd::RoundedBitWidth::from_u8(tf_bits);
328 let byte_count = count * rounded.bytes_per_value();
329 let start = stream.len();
330 stream.resize(start + byte_count, 0);
331 simd::pack_rounded(&tf_buf, rounded, &mut stream[start..]);
332 }
333
334 write_l0(
336 &mut l0_buf,
337 base_doc_id,
338 last_doc_id,
339 block_start,
340 block_max_tf as f32,
341 );
342 l0_count += 1;
343
344 if l0_count.is_multiple_of(L1_INTERVAL) {
346 l1_docs.push(last_doc_id);
347 }
348
349 i = block_end;
350 }
351
352 if !l0_count.is_multiple_of(L1_INTERVAL) && l0_count > 0 {
354 let (_, last_doc, _, _) = read_l0(&l0_buf, l0_count - 1);
355 l1_docs.push(last_doc);
356 }
357
358 Ok(Self {
359 stream: OwnedBytes::new(stream),
360 l0_bytes: OwnedBytes::new(l0_buf),
361 l0_count,
362 l1_docs,
363 doc_count: postings.len() as u32,
364 max_tf,
365 })
366 }
367
368 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
378 writer.write_all(&self.stream)?;
379 writer.write_all(&self.l0_bytes)?;
380 for &doc in &self.l1_docs {
381 writer.write_u32::<LittleEndian>(doc)?;
382 }
383
384 writer.write_u64::<LittleEndian>(self.stream.len() as u64)?;
386 writer.write_u32::<LittleEndian>(self.l0_count as u32)?;
387 writer.write_u32::<LittleEndian>(self.l1_docs.len() as u32)?;
388 writer.write_u32::<LittleEndian>(self.doc_count)?;
389 writer.write_u32::<LittleEndian>(self.max_tf)?;
390
391 Ok(())
392 }
393
394 pub fn deserialize(raw: &[u8]) -> io::Result<Self> {
396 if raw.len() < FOOTER_SIZE {
397 return Err(io::Error::new(
398 io::ErrorKind::InvalidData,
399 "posting data too short",
400 ));
401 }
402
403 let f = raw.len() - FOOTER_SIZE;
404 let stream_len = u64::from_le_bytes(raw[f..f + 8].try_into().unwrap()) as usize;
405 let l0_count = u32::from_le_bytes(raw[f + 8..f + 12].try_into().unwrap()) as usize;
406 let l1_count = u32::from_le_bytes(raw[f + 12..f + 16].try_into().unwrap()) as usize;
407 let doc_count = u32::from_le_bytes(raw[f + 16..f + 20].try_into().unwrap());
408 let max_tf = u32::from_le_bytes(raw[f + 20..f + 24].try_into().unwrap());
409
410 let l0_start = stream_len;
411 let l0_end = l0_start + l0_count * L0_SIZE;
412 let l1_start = l0_end;
413
414 let l1_docs = Self::extract_l1_docs(&raw[l1_start..], l1_count);
415
416 Ok(Self {
417 stream: OwnedBytes::new(raw[..stream_len].to_vec()),
418 l0_bytes: OwnedBytes::new(raw[l0_start..l0_end].to_vec()),
419 l0_count,
420 l1_docs,
421 doc_count,
422 max_tf,
423 })
424 }
425
426 pub fn deserialize_zero_copy(raw: OwnedBytes) -> io::Result<Self> {
430 if raw.len() < FOOTER_SIZE {
431 return Err(io::Error::new(
432 io::ErrorKind::InvalidData,
433 "posting data too short",
434 ));
435 }
436
437 let f = raw.len() - FOOTER_SIZE;
438 let stream_len = u64::from_le_bytes(raw[f..f + 8].try_into().unwrap()) as usize;
439 let l0_count = u32::from_le_bytes(raw[f + 8..f + 12].try_into().unwrap()) as usize;
440 let l1_count = u32::from_le_bytes(raw[f + 12..f + 16].try_into().unwrap()) as usize;
441 let doc_count = u32::from_le_bytes(raw[f + 16..f + 20].try_into().unwrap());
442 let max_tf = u32::from_le_bytes(raw[f + 20..f + 24].try_into().unwrap());
443
444 let l0_start = stream_len;
445 let l0_end = l0_start + l0_count * L0_SIZE;
446 let l1_start = l0_end;
447
448 let l1_docs = Self::extract_l1_docs(&raw[l1_start..], l1_count);
449
450 Ok(Self {
451 stream: raw.slice(0..stream_len),
452 l0_bytes: raw.slice(l0_start..l0_end),
453 l0_count,
454 l1_docs,
455 doc_count,
456 max_tf,
457 })
458 }
459
460 fn extract_l1_docs(bytes: &[u8], count: usize) -> Vec<u32> {
462 let mut docs = Vec::with_capacity(count);
463 for i in 0..count {
464 let p = i * L1_SIZE;
465 docs.push(u32::from_le_bytes(bytes[p..p + 4].try_into().unwrap()));
466 }
467 docs
468 }
469
470 pub fn doc_count(&self) -> u32 {
471 self.doc_count
472 }
473
474 pub fn max_tf(&self) -> u32 {
476 self.max_tf
477 }
478
479 pub fn num_blocks(&self) -> usize {
481 self.l0_count
482 }
483
484 pub fn block_max_tf(&self, block_idx: usize) -> Option<u32> {
486 if block_idx >= self.l0_count {
487 return None;
488 }
489 let (_, _, _, max_weight) = self.read_l0_entry(block_idx);
490 Some(max_weight as u32)
491 }
492
493 pub fn concatenate_blocks(sources: &[(BlockPostingList, u32)]) -> io::Result<Self> {
496 let mut stream: Vec<u8> = Vec::new();
497 let mut l0_buf: Vec<u8> = Vec::new();
498 let mut l1_docs: Vec<u32> = Vec::new();
499 let mut l0_count = 0usize;
500 let mut total_docs = 0u32;
501 let mut max_tf = 0u32;
502
503 for (source, doc_offset) in sources {
504 max_tf = max_tf.max(source.max_tf);
505 for block_idx in 0..source.num_blocks() {
506 let (first_doc, last_doc, offset, max_weight) = source.read_l0_entry(block_idx);
507 let blk_size = block_data_size(&source.stream, offset as usize);
508 let block_bytes = &source.stream[offset as usize..offset as usize + blk_size];
509
510 let count = u16::from_le_bytes(block_bytes[0..2].try_into().unwrap());
511 if stream.len() > u32::MAX as usize {
512 return Err(io::Error::new(
513 io::ErrorKind::InvalidData,
514 "posting list stream exceeds u32::MAX bytes during concatenation",
515 ));
516 }
517 let new_offset = stream.len() as u32;
518
519 stream.write_u16::<LittleEndian>(count)?;
521 stream.write_u32::<LittleEndian>(first_doc + doc_offset)?;
522 stream.extend_from_slice(&block_bytes[6..]);
523
524 let new_last = last_doc + doc_offset;
525 write_l0(
526 &mut l0_buf,
527 first_doc + doc_offset,
528 new_last,
529 new_offset,
530 max_weight,
531 );
532 l0_count += 1;
533 total_docs += count as u32;
534
535 if l0_count.is_multiple_of(L1_INTERVAL) {
536 l1_docs.push(new_last);
537 }
538 }
539 }
540
541 if !l0_count.is_multiple_of(L1_INTERVAL) && l0_count > 0 {
543 let (_, last_doc, _, _) = read_l0(&l0_buf, l0_count - 1);
544 l1_docs.push(last_doc);
545 }
546
547 Ok(Self {
548 stream: OwnedBytes::new(stream),
549 l0_bytes: OwnedBytes::new(l0_buf),
550 l0_count,
551 l1_docs,
552 doc_count: total_docs,
553 max_tf,
554 })
555 }
556
557 pub fn concatenate_streaming<W: Write>(
568 sources: &[(&[u8], u32)], writer: &mut W,
570 ) -> io::Result<(u32, usize)> {
571 struct SourceMeta {
572 stream_len: usize,
573 l0_count: usize,
574 }
575
576 let mut metas: Vec<SourceMeta> = Vec::with_capacity(sources.len());
577 let mut total_docs = 0u32;
578 let mut merged_max_tf = 0u32;
579
580 for (raw, _) in sources {
581 if raw.len() < FOOTER_SIZE {
582 continue;
583 }
584 let f = raw.len() - FOOTER_SIZE;
585 let stream_len = u64::from_le_bytes(raw[f..f + 8].try_into().unwrap()) as usize;
586 let l0_count = u32::from_le_bytes(raw[f + 8..f + 12].try_into().unwrap()) as usize;
587 let doc_count = u32::from_le_bytes(raw[f + 16..f + 20].try_into().unwrap());
589 let max_tf = u32::from_le_bytes(raw[f + 20..f + 24].try_into().unwrap());
590 total_docs += doc_count;
591 merged_max_tf = merged_max_tf.max(max_tf);
592 metas.push(SourceMeta {
593 stream_len,
594 l0_count,
595 });
596 }
597
598 let mut out_l0: Vec<u8> = Vec::new();
601 let mut out_l1_docs: Vec<u32> = Vec::new();
602 let mut out_l0_count = 0usize;
603 let mut stream_written = 0u64;
604 let mut patch_buf = [0u8; 8];
605
606 for (src_idx, meta) in metas.iter().enumerate() {
607 let (raw, doc_offset) = &sources[src_idx];
608 let l0_base = meta.stream_len; let src_stream = &raw[..meta.stream_len];
610
611 for i in 0..meta.l0_count {
612 let (first_doc, last_doc, offset, max_weight) = read_l0(&raw[l0_base..], i);
614
615 let blk_size = block_data_size(src_stream, offset as usize);
617 let block = &src_stream[offset as usize..offset as usize + blk_size];
618
619 let new_last = last_doc + doc_offset;
621 if stream_written > u32::MAX as u64 {
622 return Err(io::Error::new(
623 io::ErrorKind::InvalidData,
624 "posting list stream exceeds u32::MAX bytes during streaming merge",
625 ));
626 }
627 write_l0(
628 &mut out_l0,
629 first_doc + doc_offset,
630 new_last,
631 stream_written as u32,
632 max_weight,
633 );
634 out_l0_count += 1;
635
636 if out_l0_count.is_multiple_of(L1_INTERVAL) {
638 out_l1_docs.push(new_last);
639 }
640
641 patch_buf.copy_from_slice(&block[0..8]);
643 let blk_first = u32::from_le_bytes(patch_buf[2..6].try_into().unwrap());
644 patch_buf[2..6].copy_from_slice(&(blk_first + doc_offset).to_le_bytes());
645 writer.write_all(&patch_buf)?;
646 writer.write_all(&block[8..])?;
647
648 stream_written += blk_size as u64;
649 }
650 }
651
652 if !out_l0_count.is_multiple_of(L1_INTERVAL) && out_l0_count > 0 {
654 let (_, last_doc, _, _) = read_l0(&out_l0, out_l0_count - 1);
655 out_l1_docs.push(last_doc);
656 }
657
658 writer.write_all(&out_l0)?;
660 for &doc in &out_l1_docs {
661 writer.write_u32::<LittleEndian>(doc)?;
662 }
663
664 writer.write_u64::<LittleEndian>(stream_written)?;
665 writer.write_u32::<LittleEndian>(out_l0_count as u32)?;
666 writer.write_u32::<LittleEndian>(out_l1_docs.len() as u32)?;
667 writer.write_u32::<LittleEndian>(total_docs)?;
668 writer.write_u32::<LittleEndian>(merged_max_tf)?;
669
670 let l1_bytes_len = out_l1_docs.len() * L1_SIZE;
671 let total_bytes = stream_written as usize + out_l0.len() + l1_bytes_len + FOOTER_SIZE;
672 Ok((total_docs, total_bytes))
673 }
674
675 pub fn decode_block_into(
682 &self,
683 block_idx: usize,
684 doc_ids: &mut Vec<u32>,
685 tfs: &mut Vec<u32>,
686 ) -> bool {
687 if let Some((offset, tf_start, count)) = self.decode_block_doc_ids_only(block_idx, doc_ids)
688 {
689 self.decode_block_tfs_deferred(offset, tf_start, count, tfs);
690 true
691 } else {
692 false
693 }
694 }
695
696 pub fn decode_block_doc_ids_only(
701 &self,
702 block_idx: usize,
703 doc_ids: &mut Vec<u32>,
704 ) -> Option<(usize, usize, usize)> {
705 if block_idx >= self.l0_count {
706 return None;
707 }
708
709 let (_, _, offset, _) = self.read_l0_entry(block_idx);
710 let pos = offset as usize;
711 let blk_size = block_data_size(&self.stream, pos);
712 let block_data = &self.stream[pos..pos + blk_size];
713
714 let count = u16::from_le_bytes(block_data[0..2].try_into().unwrap()) as usize;
716 let first_doc = u32::from_le_bytes(block_data[2..6].try_into().unwrap());
717 let doc_id_bits = block_data[6];
718
719 doc_ids.clear();
720 doc_ids.resize(count, 0);
721 doc_ids[0] = first_doc;
722
723 let doc_rounded = simd::RoundedBitWidth::from_u8(doc_id_bits);
724 let deltas_bytes = if count > 1 {
725 (count - 1) * doc_rounded.bytes_per_value()
726 } else {
727 0
728 };
729
730 if count > 1 {
731 simd::unpack_rounded(
732 &block_data[8..8 + deltas_bytes],
733 doc_rounded,
734 &mut doc_ids[1..],
735 count - 1,
736 );
737 for i in 1..count {
738 doc_ids[i] += doc_ids[i - 1];
739 }
740 }
741
742 let tfs_start = 8 + deltas_bytes;
743 Some((pos, tfs_start, count))
744 }
745
746 pub fn decode_block_tfs_deferred(
750 &self,
751 block_offset: usize,
752 tf_start: usize,
753 count: usize,
754 tfs: &mut Vec<u32>,
755 ) {
756 let blk_size = block_data_size(&self.stream, block_offset);
757 let block_data = &self.stream[block_offset..block_offset + blk_size];
758 let tf_bits = block_data[7];
759 let tf_rounded = simd::RoundedBitWidth::from_u8(tf_bits);
760
761 tfs.clear();
762 tfs.resize(count, 0);
763 simd::unpack_rounded(
764 &block_data[tf_start..tf_start + count * tf_rounded.bytes_per_value()],
765 tf_rounded,
766 tfs,
767 count,
768 );
769 }
770
771 #[inline]
773 pub fn block_first_doc(&self, block_idx: usize) -> Option<DocId> {
774 if block_idx >= self.l0_count {
775 return None;
776 }
777 let (first_doc, _, _, _) = self.read_l0_entry(block_idx);
778 Some(first_doc)
779 }
780
781 #[inline]
783 pub fn block_last_doc(&self, block_idx: usize) -> Option<DocId> {
784 if block_idx >= self.l0_count {
785 return None;
786 }
787 let (_, last_doc, _, _) = self.read_l0_entry(block_idx);
788 Some(last_doc)
789 }
790
791 pub fn seek_block(&self, target: DocId, from_block: usize) -> Option<usize> {
799 if from_block >= self.l0_count {
800 return None;
801 }
802
803 let from_l1 = from_block / L1_INTERVAL;
804
805 let l1_idx = if !self.l1_docs.is_empty() {
807 let idx = from_l1 + simd::find_first_ge_u32(&self.l1_docs[from_l1..], target);
808 if idx >= self.l1_docs.len() {
809 return None;
810 }
811 idx
812 } else {
813 return None;
814 };
815
816 let start = (l1_idx * L1_INTERVAL).max(from_block);
818 let end = ((l1_idx + 1) * L1_INTERVAL).min(self.l0_count);
819 let count = end - start;
820
821 let mut last_docs = [u32::MAX; L1_INTERVAL];
822 for (j, idx) in (start..end).enumerate() {
823 let (_, ld, _, _) = read_l0(&self.l0_bytes, idx);
824 last_docs[j] = ld;
825 }
826 let within = simd::find_first_ge_u32(&last_docs[..count], target);
827 let block_idx = start + within;
828
829 if block_idx < self.l0_count {
830 Some(block_idx)
831 } else {
832 None
833 }
834 }
835
836 pub fn iterator(&self) -> BlockPostingIterator<'_> {
838 BlockPostingIterator::new(self)
839 }
840
841 pub fn into_iterator(self) -> BlockPostingIterator<'static> {
843 BlockPostingIterator::owned(self)
844 }
845}
846
/// Block-at-a-time cursor over a `BlockPostingList`, decoding one block of
/// doc ids and term frequencies at a time.
pub struct BlockPostingIterator<'a> {
    // Borrowed or owned source list, so iterators can outlive the caller's
    // reference when constructed via `into_iterator`.
    block_list: std::borrow::Cow<'a, BlockPostingList>,
    current_block: usize,
    // Decoded doc ids of the current block.
    block_doc_ids: Vec<u32>,
    // Decoded term frequencies of the current block.
    block_tfs: Vec<u32>,
    // Cursor within the decoded block.
    position_in_block: usize,
    exhausted: bool,
}
861
862impl<'a> BlockPostingIterator<'a> {
863 fn new(block_list: &'a BlockPostingList) -> Self {
864 let exhausted = block_list.l0_count == 0;
865 let mut iter = Self {
866 block_list: std::borrow::Cow::Borrowed(block_list),
867 current_block: 0,
868 block_doc_ids: Vec::with_capacity(BLOCK_SIZE),
869 block_tfs: Vec::with_capacity(BLOCK_SIZE),
870 position_in_block: 0,
871 exhausted,
872 };
873 if !iter.exhausted {
874 iter.load_block(0);
875 }
876 iter
877 }
878
879 fn owned(block_list: BlockPostingList) -> BlockPostingIterator<'static> {
880 let exhausted = block_list.l0_count == 0;
881 let mut iter = BlockPostingIterator {
882 block_list: std::borrow::Cow::Owned(block_list),
883 current_block: 0,
884 block_doc_ids: Vec::with_capacity(BLOCK_SIZE),
885 block_tfs: Vec::with_capacity(BLOCK_SIZE),
886 position_in_block: 0,
887 exhausted,
888 };
889 if !iter.exhausted {
890 iter.load_block(0);
891 }
892 iter
893 }
894
895 fn load_block(&mut self, block_idx: usize) {
896 if block_idx >= self.block_list.l0_count {
897 self.exhausted = true;
898 return;
899 }
900
901 self.current_block = block_idx;
902 self.position_in_block = 0;
903
904 self.block_list
905 .decode_block_into(block_idx, &mut self.block_doc_ids, &mut self.block_tfs);
906 }
907
908 pub fn doc(&self) -> DocId {
909 if self.exhausted {
910 TERMINATED
911 } else if self.position_in_block < self.block_doc_ids.len() {
912 self.block_doc_ids[self.position_in_block]
913 } else {
914 TERMINATED
915 }
916 }
917
918 pub fn term_freq(&self) -> u32 {
919 if self.exhausted || self.position_in_block >= self.block_tfs.len() {
920 0
921 } else {
922 self.block_tfs[self.position_in_block]
923 }
924 }
925
926 pub fn advance(&mut self) -> DocId {
927 if self.exhausted {
928 return TERMINATED;
929 }
930
931 self.position_in_block += 1;
932 if self.position_in_block >= self.block_doc_ids.len() {
933 self.load_block(self.current_block + 1);
934 }
935 self.doc()
936 }
937
938 pub fn seek(&mut self, target: DocId) -> DocId {
939 if self.exhausted {
940 return TERMINATED;
941 }
942
943 let block_idx = match self.block_list.seek_block(target, self.current_block) {
945 Some(idx) => idx,
946 None => {
947 self.exhausted = true;
948 return TERMINATED;
949 }
950 };
951
952 if block_idx != self.current_block {
953 self.load_block(block_idx);
954 }
955
956 let remaining = &self.block_doc_ids[self.position_in_block..];
958 let pos = crate::structures::simd::find_first_ge_u32(remaining, target);
959 self.position_in_block += pos;
960
961 if self.position_in_block >= self.block_doc_ids.len() {
962 self.load_block(self.current_block + 1);
963 }
964 self.doc()
965 }
966
967 pub fn skip_to_next_block(&mut self) -> DocId {
971 if self.exhausted {
972 return TERMINATED;
973 }
974 self.load_block(self.current_block + 1);
975 self.doc()
976 }
977
978 #[inline]
980 pub fn current_block_idx(&self) -> usize {
981 self.current_block
982 }
983
984 #[inline]
986 pub fn num_blocks(&self) -> usize {
987 self.block_list.l0_count
988 }
989
990 #[inline]
992 pub fn current_block_max_tf(&self) -> u32 {
993 if self.exhausted || self.current_block >= self.block_list.l0_count {
994 0
995 } else {
996 let (_, _, _, max_weight) = self.block_list.read_l0_entry(self.current_block);
997 max_weight as u32
998 }
999 }
1000}
1001
1002#[cfg(test)]
1003mod tests {
1004 use super::*;
1005
1006 #[test]
1007 fn test_posting_list_basic() {
1008 let mut list = PostingList::new();
1009 list.push(1, 2);
1010 list.push(5, 1);
1011 list.push(10, 3);
1012
1013 assert_eq!(list.len(), 3);
1014
1015 let mut iter = PostingListIterator::new(&list);
1016 assert_eq!(iter.doc(), 1);
1017 assert_eq!(iter.term_freq(), 2);
1018
1019 assert_eq!(iter.advance(), 5);
1020 assert_eq!(iter.term_freq(), 1);
1021
1022 assert_eq!(iter.advance(), 10);
1023 assert_eq!(iter.term_freq(), 3);
1024
1025 assert_eq!(iter.advance(), TERMINATED);
1026 }
1027
1028 #[test]
1029 fn test_posting_list_serialization() {
1030 let mut list = PostingList::new();
1031 for i in 0..100 {
1032 list.push(i * 3, (i % 5) + 1);
1033 }
1034
1035 let mut buffer = Vec::new();
1036 list.serialize(&mut buffer).unwrap();
1037
1038 let deserialized = PostingList::deserialize(&mut &buffer[..]).unwrap();
1039 assert_eq!(deserialized.len(), list.len());
1040
1041 for (a, b) in list.iter().zip(deserialized.iter()) {
1042 assert_eq!(a, b);
1043 }
1044 }
1045
1046 #[test]
1047 fn test_posting_list_seek() {
1048 let mut list = PostingList::new();
1049 for i in 0..100 {
1050 list.push(i * 2, 1);
1051 }
1052
1053 let mut iter = PostingListIterator::new(&list);
1054
1055 assert_eq!(iter.seek(50), 50);
1056 assert_eq!(iter.seek(51), 52);
1057 assert_eq!(iter.seek(200), TERMINATED);
1058 }
1059
1060 #[test]
1061 fn test_block_posting_list() {
1062 let mut list = PostingList::new();
1063 for i in 0..500 {
1064 list.push(i * 2, (i % 10) + 1);
1065 }
1066
1067 let block_list = BlockPostingList::from_posting_list(&list).unwrap();
1068 assert_eq!(block_list.doc_count(), 500);
1069
1070 let mut iter = block_list.iterator();
1071 assert_eq!(iter.doc(), 0);
1072 assert_eq!(iter.term_freq(), 1);
1073
1074 assert_eq!(iter.seek(500), 500);
1076 assert_eq!(iter.seek(998), 998);
1077 assert_eq!(iter.seek(1000), TERMINATED);
1078 }
1079
1080 #[test]
1081 fn test_block_posting_list_serialization() {
1082 let mut list = PostingList::new();
1083 for i in 0..300 {
1084 list.push(i * 3, i + 1);
1085 }
1086
1087 let block_list = BlockPostingList::from_posting_list(&list).unwrap();
1088
1089 let mut buffer = Vec::new();
1090 block_list.serialize(&mut buffer).unwrap();
1091
1092 let deserialized = BlockPostingList::deserialize(&buffer[..]).unwrap();
1093 assert_eq!(deserialized.doc_count(), block_list.doc_count());
1094
1095 let mut iter1 = block_list.iterator();
1097 let mut iter2 = deserialized.iterator();
1098
1099 while iter1.doc() != TERMINATED {
1100 assert_eq!(iter1.doc(), iter2.doc());
1101 assert_eq!(iter1.term_freq(), iter2.term_freq());
1102 iter1.advance();
1103 iter2.advance();
1104 }
1105 assert_eq!(iter2.doc(), TERMINATED);
1106 }
1107
1108 fn collect_postings(bpl: &BlockPostingList) -> Vec<(u32, u32)> {
1110 let mut result = Vec::new();
1111 let mut it = bpl.iterator();
1112 while it.doc() != TERMINATED {
1113 result.push((it.doc(), it.term_freq()));
1114 it.advance();
1115 }
1116 result
1117 }
1118
1119 fn build_bpl(postings: &[(u32, u32)]) -> BlockPostingList {
1121 let mut pl = PostingList::new();
1122 for &(doc_id, tf) in postings {
1123 pl.push(doc_id, tf);
1124 }
1125 BlockPostingList::from_posting_list(&pl).unwrap()
1126 }
1127
1128 fn serialize_bpl(bpl: &BlockPostingList) -> Vec<u8> {
1130 let mut buf = Vec::new();
1131 bpl.serialize(&mut buf).unwrap();
1132 buf
1133 }
1134
1135 #[test]
1136 fn test_concatenate_blocks_two_segments() {
1137 let a: Vec<(u32, u32)> = (0..100).map(|i| (i * 2, i + 1)).collect();
1139 let bpl_a = build_bpl(&a);
1140
1141 let b: Vec<(u32, u32)> = (0..100).map(|i| (i * 3, i + 2)).collect();
1143 let bpl_b = build_bpl(&b);
1144
1145 let merged =
1147 BlockPostingList::concatenate_blocks(&[(bpl_a.clone(), 0), (bpl_b.clone(), 200)])
1148 .unwrap();
1149
1150 assert_eq!(merged.doc_count(), 200);
1151
1152 let postings = collect_postings(&merged);
1153 assert_eq!(postings.len(), 200);
1154
1155 for (i, p) in postings.iter().enumerate().take(100) {
1157 assert_eq!(*p, (i as u32 * 2, i as u32 + 1));
1158 }
1159 for i in 0..100 {
1161 assert_eq!(postings[100 + i], (i as u32 * 3 + 200, i as u32 + 2));
1162 }
1163 }
1164
1165 #[test]
1166 fn test_concatenate_streaming_matches_blocks() {
1167 let seg_a: Vec<(u32, u32)> = (0..250).map(|i| (i * 2, (i % 7) + 1)).collect();
1169 let seg_b: Vec<(u32, u32)> = (0..180).map(|i| (i * 5, (i % 3) + 1)).collect();
1170 let seg_c: Vec<(u32, u32)> = (0..90).map(|i| (i * 10, (i % 11) + 1)).collect();
1171
1172 let bpl_a = build_bpl(&seg_a);
1173 let bpl_b = build_bpl(&seg_b);
1174 let bpl_c = build_bpl(&seg_c);
1175
1176 let offset_b = 1000u32;
1177 let offset_c = 2000u32;
1178
1179 let ref_merged = BlockPostingList::concatenate_blocks(&[
1181 (bpl_a.clone(), 0),
1182 (bpl_b.clone(), offset_b),
1183 (bpl_c.clone(), offset_c),
1184 ])
1185 .unwrap();
1186 let mut ref_buf = Vec::new();
1187 ref_merged.serialize(&mut ref_buf).unwrap();
1188
1189 let bytes_a = serialize_bpl(&bpl_a);
1191 let bytes_b = serialize_bpl(&bpl_b);
1192 let bytes_c = serialize_bpl(&bpl_c);
1193
1194 let sources: Vec<(&[u8], u32)> =
1195 vec![(&bytes_a, 0), (&bytes_b, offset_b), (&bytes_c, offset_c)];
1196 let mut stream_buf = Vec::new();
1197 let (doc_count, bytes_written) =
1198 BlockPostingList::concatenate_streaming(&sources, &mut stream_buf).unwrap();
1199
1200 assert_eq!(doc_count, 520); assert_eq!(bytes_written, stream_buf.len());
1202
1203 let ref_postings = collect_postings(&BlockPostingList::deserialize(&ref_buf).unwrap());
1205 let stream_postings =
1206 collect_postings(&BlockPostingList::deserialize(&stream_buf).unwrap());
1207
1208 assert_eq!(ref_postings.len(), stream_postings.len());
1209 for (i, (r, s)) in ref_postings.iter().zip(stream_postings.iter()).enumerate() {
1210 assert_eq!(r, s, "mismatch at posting {}", i);
1211 }
1212 }
1213
1214 #[test]
1215 fn test_multi_round_merge() {
1216 let segments: Vec<Vec<(u32, u32)>> = (0..4)
1223 .map(|seg| (0..200).map(|i| (i * 3, (i + seg * 7) % 10 + 1)).collect())
1224 .collect();
1225
1226 let bpls: Vec<BlockPostingList> = segments.iter().map(|s| build_bpl(s)).collect();
1227 let serialized: Vec<Vec<u8>> = bpls.iter().map(serialize_bpl).collect();
1228
1229 let mut merged_01 = Vec::new();
1231 let sources_01: Vec<(&[u8], u32)> = vec![(&serialized[0], 0), (&serialized[1], 600)];
1232 let (dc_01, _) =
1233 BlockPostingList::concatenate_streaming(&sources_01, &mut merged_01).unwrap();
1234 assert_eq!(dc_01, 400);
1235
1236 let mut merged_23 = Vec::new();
1237 let sources_23: Vec<(&[u8], u32)> = vec![(&serialized[2], 0), (&serialized[3], 600)];
1238 let (dc_23, _) =
1239 BlockPostingList::concatenate_streaming(&sources_23, &mut merged_23).unwrap();
1240 assert_eq!(dc_23, 400);
1241
1242 let mut final_merged = Vec::new();
1244 let sources_final: Vec<(&[u8], u32)> = vec![(&merged_01, 0), (&merged_23, 1200)];
1245 let (dc_final, _) =
1246 BlockPostingList::concatenate_streaming(&sources_final, &mut final_merged).unwrap();
1247 assert_eq!(dc_final, 800);
1248
1249 let final_bpl = BlockPostingList::deserialize(&final_merged).unwrap();
1251 let postings = collect_postings(&final_bpl);
1252 assert_eq!(postings.len(), 800);
1253
1254 assert_eq!(postings[0].0, 0); assert_eq!(postings[199].0, 597); assert_eq!(postings[200].0, 600); assert_eq!(postings[399].0, 1197); assert_eq!(postings[400].0, 1200); assert_eq!(postings[799].0, 2397); for seg in 0u32..4 {
1267 for i in 0u32..200 {
1268 let idx = (seg * 200 + i) as usize;
1269 assert_eq!(
1270 postings[idx].1,
1271 (i + seg * 7) % 10 + 1,
1272 "seg{} tf[{}]",
1273 seg,
1274 i
1275 );
1276 }
1277 }
1278
1279 let mut it = final_bpl.iterator();
1281 assert_eq!(it.seek(600), 600);
1282 assert_eq!(it.seek(1200), 1200);
1283 assert_eq!(it.seek(2397), 2397);
1284 assert_eq!(it.seek(2398), TERMINATED);
1285 }
1286
1287 #[test]
1288 fn test_large_scale_merge() {
1289 let num_segments = 5;
1292 let docs_per_segment = 2000;
1293 let docs_gap = 3; let segments: Vec<Vec<(u32, u32)>> = (0..num_segments)
1296 .map(|seg| {
1297 (0..docs_per_segment)
1298 .map(|i| (i as u32 * docs_gap, (i as u32 + seg as u32) % 20 + 1))
1299 .collect()
1300 })
1301 .collect();
1302
1303 let bpls: Vec<BlockPostingList> = segments.iter().map(|s| build_bpl(s)).collect();
1304
1305 for bpl in &bpls {
1307 assert!(
1308 bpl.num_blocks() >= 15,
1309 "expected >=15 blocks, got {}",
1310 bpl.num_blocks()
1311 );
1312 }
1313
1314 let serialized: Vec<Vec<u8>> = bpls.iter().map(serialize_bpl).collect();
1315
1316 let max_doc_per_seg = (docs_per_segment as u32 - 1) * docs_gap;
1318 let offsets: Vec<u32> = (0..num_segments)
1319 .map(|i| i as u32 * (max_doc_per_seg + 1))
1320 .collect();
1321
1322 let sources: Vec<(&[u8], u32)> = serialized
1323 .iter()
1324 .zip(offsets.iter())
1325 .map(|(b, o)| (b.as_slice(), *o))
1326 .collect();
1327
1328 let mut merged = Vec::new();
1329 let (doc_count, _) =
1330 BlockPostingList::concatenate_streaming(&sources, &mut merged).unwrap();
1331 assert_eq!(doc_count, (num_segments * docs_per_segment) as u32);
1332
1333 let merged_bpl = BlockPostingList::deserialize(&merged).unwrap();
1335 let postings = collect_postings(&merged_bpl);
1336 assert_eq!(postings.len(), num_segments * docs_per_segment);
1337
1338 for i in 1..postings.len() {
1340 assert!(
1341 postings[i].0 > postings[i - 1].0 || (i % docs_per_segment == 0), "doc_id not increasing at {}: {} vs {}",
1343 i,
1344 postings[i - 1].0,
1345 postings[i].0,
1346 );
1347 }
1348
1349 let mut it = merged_bpl.iterator();
1351 for (seg, &expected_first) in offsets.iter().enumerate() {
1352 assert_eq!(
1353 it.seek(expected_first),
1354 expected_first,
1355 "seek to segment {} start",
1356 seg
1357 );
1358 }
1359 }
1360
1361 #[test]
1362 fn test_merge_edge_cases() {
1363 let bpl_a = build_bpl(&[(0, 5)]);
1365 let bpl_b = build_bpl(&[(0, 3)]);
1366
1367 let merged =
1368 BlockPostingList::concatenate_blocks(&[(bpl_a.clone(), 0), (bpl_b.clone(), 1)])
1369 .unwrap();
1370 assert_eq!(merged.doc_count(), 2);
1371 let p = collect_postings(&merged);
1372 assert_eq!(p, vec![(0, 5), (1, 3)]);
1373
1374 let exact_block: Vec<(u32, u32)> = (0..BLOCK_SIZE as u32).map(|i| (i, i % 5 + 1)).collect();
1376 let bpl_exact = build_bpl(&exact_block);
1377 assert_eq!(bpl_exact.num_blocks(), 1);
1378
1379 let bytes = serialize_bpl(&bpl_exact);
1380 let mut out = Vec::new();
1381 let sources: Vec<(&[u8], u32)> = vec![(&bytes, 0), (&bytes, BLOCK_SIZE as u32)];
1382 let (dc, _) = BlockPostingList::concatenate_streaming(&sources, &mut out).unwrap();
1383 assert_eq!(dc, BLOCK_SIZE as u32 * 2);
1384
1385 let merged = BlockPostingList::deserialize(&out).unwrap();
1386 let postings = collect_postings(&merged);
1387 assert_eq!(postings.len(), BLOCK_SIZE * 2);
1388 assert_eq!(postings[BLOCK_SIZE].0, BLOCK_SIZE as u32);
1390
1391 let over_block: Vec<(u32, u32)> = (0..BLOCK_SIZE as u32 + 1).map(|i| (i * 2, 1)).collect();
1393 let bpl_over = build_bpl(&over_block);
1394 assert_eq!(bpl_over.num_blocks(), 2);
1395 }
1396
1397 #[test]
1398 fn test_streaming_roundtrip_single_source() {
1399 let docs: Vec<(u32, u32)> = (0..500).map(|i| (i * 7, i % 15 + 1)).collect();
1401 let bpl = build_bpl(&docs);
1402 let direct = serialize_bpl(&bpl);
1403
1404 let sources: Vec<(&[u8], u32)> = vec![(&direct, 0)];
1405 let mut streamed = Vec::new();
1406 BlockPostingList::concatenate_streaming(&sources, &mut streamed).unwrap();
1407
1408 let p1 = collect_postings(&BlockPostingList::deserialize(&direct).unwrap());
1410 let p2 = collect_postings(&BlockPostingList::deserialize(&streamed).unwrap());
1411 assert_eq!(p1, p2);
1412 }
1413
1414 #[test]
1415 fn test_max_tf_preserved_through_merge() {
1416 let mut a = Vec::new();
1418 for i in 0..200 {
1419 a.push((i * 2, if i == 100 { 50 } else { 1 }));
1420 }
1421 let bpl_a = build_bpl(&a);
1422 assert_eq!(bpl_a.max_tf(), 50);
1423
1424 let mut b = Vec::new();
1426 for i in 0..200 {
1427 b.push((i * 2, if i == 50 { 30 } else { 2 }));
1428 }
1429 let bpl_b = build_bpl(&b);
1430 assert_eq!(bpl_b.max_tf(), 30);
1431
1432 let bytes_a = serialize_bpl(&bpl_a);
1434 let bytes_b = serialize_bpl(&bpl_b);
1435 let sources: Vec<(&[u8], u32)> = vec![(&bytes_a, 0), (&bytes_b, 1000)];
1436 let mut out = Vec::new();
1437 BlockPostingList::concatenate_streaming(&sources, &mut out).unwrap();
1438
1439 let merged = BlockPostingList::deserialize(&out).unwrap();
1440 assert_eq!(merged.max_tf(), 50);
1441 assert_eq!(merged.doc_count(), 400);
1442 }
1443
1444 #[test]
1447 fn test_l0_l1_counts() {
1448 let bpl = build_bpl(&(0..50u32).map(|i| (i, 1)).collect::<Vec<_>>());
1450 assert_eq!(bpl.num_blocks(), 1);
1451 assert_eq!(bpl.l1_docs.len(), 1);
1452
1453 let n = BLOCK_SIZE * L1_INTERVAL;
1455 let bpl = build_bpl(&(0..n as u32).map(|i| (i * 2, 1)).collect::<Vec<_>>());
1456 assert_eq!(bpl.num_blocks(), L1_INTERVAL);
1457 assert_eq!(bpl.l1_docs.len(), 1);
1458
1459 let n = BLOCK_SIZE * L1_INTERVAL + 1;
1461 let bpl = build_bpl(&(0..n as u32).map(|i| (i * 2, 1)).collect::<Vec<_>>());
1462 assert_eq!(bpl.num_blocks(), L1_INTERVAL + 1);
1463 assert_eq!(bpl.l1_docs.len(), 2);
1464
1465 let n = BLOCK_SIZE * L1_INTERVAL * 3;
1467 let bpl = build_bpl(&(0..n as u32).map(|i| (i, 1)).collect::<Vec<_>>());
1468 assert_eq!(bpl.num_blocks(), L1_INTERVAL * 3);
1469 assert_eq!(bpl.l1_docs.len(), 3);
1470 }
1471
1472 #[test]
1473 fn test_l1_last_doc_values() {
1474 let n = BLOCK_SIZE * 20;
1476 let docs: Vec<(u32, u32)> = (0..n as u32).map(|i| (i * 3, 1)).collect();
1477 let bpl = build_bpl(&docs);
1478 assert_eq!(bpl.num_blocks(), 20);
1479 assert_eq!(bpl.l1_docs.len(), 3); let expected_l1_0 = bpl.block_last_doc(7).unwrap();
1483 assert_eq!(bpl.l1_docs[0], expected_l1_0);
1484
1485 let expected_l1_1 = bpl.block_last_doc(15).unwrap();
1487 assert_eq!(bpl.l1_docs[1], expected_l1_1);
1488
1489 let expected_l1_2 = bpl.block_last_doc(19).unwrap();
1491 assert_eq!(bpl.l1_docs[2], expected_l1_2);
1492 }
1493
1494 #[test]
1495 fn test_seek_block_basic() {
1496 let n = BLOCK_SIZE * 20;
1498 let docs: Vec<(u32, u32)> = (0..n as u32).map(|i| (i * 10, 1)).collect();
1499 let bpl = build_bpl(&docs);
1500
1501 assert_eq!(bpl.seek_block(0, 0), Some(0));
1503
1504 for blk in 0..20 {
1506 let first = bpl.block_first_doc(blk).unwrap();
1507 assert_eq!(
1508 bpl.seek_block(first, 0),
1509 Some(blk),
1510 "seek to block {} first_doc",
1511 blk
1512 );
1513 }
1514
1515 for blk in 0..20 {
1517 let last = bpl.block_last_doc(blk).unwrap();
1518 assert_eq!(
1519 bpl.seek_block(last, 0),
1520 Some(blk),
1521 "seek to block {} last_doc",
1522 blk
1523 );
1524 }
1525
1526 let max_doc = bpl.block_last_doc(19).unwrap();
1528 assert_eq!(bpl.seek_block(max_doc + 1, 0), None);
1529
1530 let mid_doc = bpl.block_first_doc(10).unwrap();
1532 assert_eq!(bpl.seek_block(mid_doc, 10), Some(10));
1533 assert_eq!(
1534 bpl.seek_block(mid_doc, 11),
1535 Some(11).or(bpl.seek_block(mid_doc, 11))
1536 );
1537 }
1538
1539 #[test]
1540 fn test_seek_block_across_l1_boundaries() {
1541 let n = BLOCK_SIZE * 24;
1543 let docs: Vec<(u32, u32)> = (0..n as u32).map(|i| (i * 5, 1)).collect();
1544 let bpl = build_bpl(&docs);
1545 assert_eq!(bpl.l1_docs.len(), 3);
1546
1547 for group in 0..3 {
1549 let blk = group * L1_INTERVAL;
1550 let target = bpl.block_first_doc(blk).unwrap();
1551 assert_eq!(
1552 bpl.seek_block(target, 0),
1553 Some(blk),
1554 "seek to group {} block {}",
1555 group,
1556 blk
1557 );
1558 }
1559
1560 let target = bpl.block_first_doc(20).unwrap() + 1;
1562 assert_eq!(bpl.seek_block(target, 0), Some(20));
1563 }
1564
1565 #[test]
1566 fn test_block_data_size_helper() {
1567 let docs: Vec<(u32, u32)> = (0..500u32).map(|i| (i * 7, (i % 20) + 1)).collect();
1569 let bpl = build_bpl(&docs);
1570
1571 for blk in 0..bpl.num_blocks() {
1572 let (_, _, offset, _) = bpl.read_l0_entry(blk);
1573 let computed_size = block_data_size(&bpl.stream, offset as usize);
1574
1575 if blk + 1 < bpl.num_blocks() {
1578 let (_, _, next_offset, _) = bpl.read_l0_entry(blk + 1);
1579 assert_eq!(
1580 computed_size,
1581 (next_offset - offset) as usize,
1582 "block_data_size mismatch at block {}",
1583 blk
1584 );
1585 } else {
1586 assert_eq!(
1588 offset as usize + computed_size,
1589 bpl.stream.len(),
1590 "last block size mismatch"
1591 );
1592 }
1593 }
1594 }
1595
1596 #[test]
1597 fn test_l0_entry_roundtrip() {
1598 let docs: Vec<(u32, u32)> = (0..1000u32).map(|i| (i * 3, (i % 10) + 1)).collect();
1600 let bpl = build_bpl(&docs);
1601
1602 let bytes = serialize_bpl(&bpl);
1603 let bpl2 = BlockPostingList::deserialize(&bytes).unwrap();
1604
1605 assert_eq!(bpl.num_blocks(), bpl2.num_blocks());
1606 for blk in 0..bpl.num_blocks() {
1607 assert_eq!(
1608 bpl.read_l0_entry(blk),
1609 bpl2.read_l0_entry(blk),
1610 "L0 entry mismatch at block {}",
1611 blk
1612 );
1613 }
1614
1615 assert_eq!(bpl.l1_docs, bpl2.l1_docs);
1617 }
1618
1619 #[test]
1620 fn test_zero_copy_deserialize_matches() {
1621 let docs: Vec<(u32, u32)> = (0..2000u32).map(|i| (i * 2, (i % 5) + 1)).collect();
1622 let bpl = build_bpl(&docs);
1623 let bytes = serialize_bpl(&bpl);
1624
1625 let copied = BlockPostingList::deserialize(&bytes).unwrap();
1626 let zero_copy =
1627 BlockPostingList::deserialize_zero_copy(OwnedBytes::new(bytes.clone())).unwrap();
1628
1629 assert_eq!(copied.l0_count, zero_copy.l0_count);
1631 assert_eq!(copied.l1_docs, zero_copy.l1_docs);
1632 assert_eq!(copied.doc_count, zero_copy.doc_count);
1633 assert_eq!(copied.max_tf, zero_copy.max_tf);
1634
1635 let p1 = collect_postings(&copied);
1637 let p2 = collect_postings(&zero_copy);
1638 assert_eq!(p1, p2);
1639 }
1640
1641 #[test]
1642 fn test_l1_preserved_through_streaming_merge() {
1643 let seg_a = build_bpl(&(0..1000u32).map(|i| (i * 2, 1)).collect::<Vec<_>>());
1645 let seg_b = build_bpl(&(0..800u32).map(|i| (i * 3, 2)).collect::<Vec<_>>());
1646 let seg_c = build_bpl(&(0..500u32).map(|i| (i * 5, 3)).collect::<Vec<_>>());
1647
1648 let bytes_a = serialize_bpl(&seg_a);
1649 let bytes_b = serialize_bpl(&seg_b);
1650 let bytes_c = serialize_bpl(&seg_c);
1651
1652 let sources: Vec<(&[u8], u32)> = vec![(&bytes_a, 0), (&bytes_b, 10000), (&bytes_c, 20000)];
1653 let mut out = Vec::new();
1654 BlockPostingList::concatenate_streaming(&sources, &mut out).unwrap();
1655
1656 let merged = BlockPostingList::deserialize(&out).unwrap();
1657 let expected_l1_count = merged.num_blocks().div_ceil(L1_INTERVAL);
1658 assert_eq!(merged.l1_docs.len(), expected_l1_count);
1659
1660 for (i, &l1_doc) in merged.l1_docs.iter().enumerate() {
1662 let last_block_in_group = ((i + 1) * L1_INTERVAL - 1).min(merged.num_blocks() - 1);
1663 let expected = merged.block_last_doc(last_block_in_group).unwrap();
1664 assert_eq!(l1_doc, expected, "L1[{}] mismatch", i);
1665 }
1666
1667 for blk in 0..merged.num_blocks() {
1669 let first = merged.block_first_doc(blk).unwrap();
1670 assert_eq!(merged.seek_block(first, 0), Some(blk));
1671 }
1672 }
1673
1674 #[test]
1675 fn test_seek_block_single_block() {
1676 let bpl = build_bpl(&[(0, 1), (10, 2), (20, 3)]);
1678 assert_eq!(bpl.num_blocks(), 1);
1679 assert_eq!(bpl.l1_docs.len(), 1);
1680
1681 assert_eq!(bpl.seek_block(0, 0), Some(0));
1682 assert_eq!(bpl.seek_block(10, 0), Some(0));
1683 assert_eq!(bpl.seek_block(20, 0), Some(0));
1684 assert_eq!(bpl.seek_block(21, 0), None);
1685 }
1686
1687 #[test]
1688 fn test_footer_size() {
1689 let docs: Vec<(u32, u32)> = (0..500u32).map(|i| (i * 2, 1)).collect();
1691 let bpl = build_bpl(&docs);
1692 let bytes = serialize_bpl(&bpl);
1693
1694 let expected =
1695 bpl.stream.len() + bpl.l0_count * L0_SIZE + bpl.l1_docs.len() * L1_SIZE + FOOTER_SIZE;
1696 assert_eq!(bytes.len(), expected);
1697 }
1698
1699 #[test]
1700 fn test_seek_block_from_block_skips_earlier() {
1701 let n = BLOCK_SIZE * 16;
1703 let docs: Vec<(u32, u32)> = (0..n as u32).map(|i| (i * 3, 1)).collect();
1704 let bpl = build_bpl(&docs);
1705
1706 let target_in_5 = bpl.block_first_doc(5).unwrap() + 1;
1708 let result = bpl.seek_block(target_in_5, 8);
1711 assert!(result.is_some());
1712 assert!(result.unwrap() >= 8);
1713 }
1714}