luci/inverted/
postings.rs

1//! Posting list encoding and decoding.
2//!
3//! Encodes sorted `(DocId, term_frequency)` pairs using delta-encoded doc IDs
4//! and VByte encoding. Format:
5//!
6//! ```text
7//! [num_docs: u32] [delta_0: vbyte] [tf_0: vbyte] [delta_1: vbyte] [tf_1: vbyte] ...
8//! ```
9//!
10//! See [[inverted-index]] and [[architecture-overview#Step 1]].
11
12use crate::core::DocId;
13
14// --- VByte encoding ---
15
16/// Encode a u32 as a variable-length byte sequence.
17/// Uses 1-5 bytes. Each byte stores 7 data bits; the high bit indicates
18/// continuation (1 = more bytes follow, 0 = last byte).
19fn encode_vbyte(mut value: u32, out: &mut Vec<u8>) {
20    loop {
21        let byte = (value & 0x7F) as u8;
22        value >>= 7;
23        if value == 0 {
24            out.push(byte); // high bit clear = last byte
25            break;
26        }
27        out.push(byte | 0x80); // high bit set = more bytes follow
28    }
29}
30
31/// Decode a VByte-encoded u32 from the given position. Returns (value, bytes_consumed).
32fn decode_vbyte(data: &[u8], pos: usize) -> (u32, usize) {
33    let mut result: u32 = 0;
34    let mut shift = 0;
35    let mut i = pos;
36    loop {
37        let byte = data[i];
38        result |= ((byte & 0x7F) as u32) << shift;
39        i += 1;
40        if byte & 0x80 == 0 {
41            break;
42        }
43        shift += 7;
44    }
45    (result, i - pos)
46}
47
48// --- PostingListWriter ---
49
50/// Builds a posting list from sorted (doc_id, tf) pairs.
51///
52/// Callers must add documents in strictly increasing doc_id order.
53pub struct PostingListWriter {
54    buf: Vec<u8>,
55    count: u32,
56    last_doc_id: u32,
57}
58
59impl PostingListWriter {
60    pub fn new() -> Self {
61        Self {
62            buf: Vec::new(),
63            count: 0,
64            last_doc_id: 0,
65        }
66    }
67
68    /// Add a posting. `doc_id` must be strictly greater than the previous one.
69    pub fn add(&mut self, doc_id: DocId, tf: u32) {
70        let id = doc_id.as_u32();
71        let delta = if self.count == 0 {
72            id
73        } else {
74            debug_assert!(id > self.last_doc_id, "doc IDs must be strictly increasing");
75            id - self.last_doc_id
76        };
77
78        encode_vbyte(delta, &mut self.buf);
79        encode_vbyte(tf, &mut self.buf);
80
81        self.last_doc_id = id;
82        self.count += 1;
83    }
84
85    /// Finalize and return the encoded posting list bytes.
86    pub fn finish(self) -> Vec<u8> {
87        let mut result = Vec::with_capacity(5 + self.buf.len());
88        result.extend_from_slice(&self.count.to_le_bytes());
89        result.push(0x00); // flags: no positions
90        result.extend_from_slice(&self.buf);
91        result
92    }
93}
94
95impl Default for PostingListWriter {
96    fn default() -> Self {
97        Self::new()
98    }
99}
100
101// --- PostingListReader ---
102
103/// Iterates over a posting list, yielding (DocId, term_frequency) pairs.
104///
105/// Handles both the original format and the position-aware format. When
106/// reading position-aware data, positions are skipped automatically.
107pub struct PostingListReader<'a> {
108    data: &'a [u8],
109    pos: usize,
110    remaining: u32,
111    current_doc_id: u32,
112    has_positions: bool,
113}
114
115impl<'a> PostingListReader<'a> {
116    /// Create a reader from encoded posting list bytes.
117    ///
118    /// Handles both the original format (no positions) and the position-aware
119    /// format (with flags byte). When reading position-aware data, positions
120    /// are skipped — only doc_id and tf are returned. Use
121    /// `PositionPostingListReader` to read positions.
122    pub fn new(data: &'a [u8]) -> Self {
123        let count = if data.len() >= 5 {
124            u32::from_le_bytes([data[0], data[1], data[2], data[3]])
125        } else {
126            0
127        };
128        let has_pos = has_positions(data);
129
130        // Determine start of posting data based on format
131        let pos = if data.len() >= 5 && data[4] == FLAG_BLOCK_MAX {
132            // Block-max format: skip [num_docs(4)][flags(1)][num_blocks(2)][headers]
133            let num_blocks = u16::from_le_bytes(data[5..7].try_into().unwrap()) as usize;
134            7 + num_blocks * 8 // skip block headers (8 bytes each)
135        } else {
136            5 // skip num_docs (4) + flags (1)
137        };
138
139        Self {
140            data,
141            pos,
142            remaining: count,
143            current_doc_id: 0,
144            has_positions: has_pos,
145        }
146    }
147
148    /// Number of postings in this list.
149    pub fn len(&self) -> u32 {
150        if self.data.len() >= 5 {
151            u32::from_le_bytes([self.data[0], self.data[1], self.data[2], self.data[3]])
152        } else {
153            0
154        }
155    }
156
157    /// Whether the posting list is empty.
158    pub fn is_empty(&self) -> bool {
159        self.len() == 0
160    }
161
162    /// Read the next (DocId, tf) pair, or None if exhausted.
163    /// If the underlying data has positions, they are skipped.
164    pub fn next(&mut self) -> Option<(DocId, u32)> {
165        if self.remaining == 0 {
166            return None;
167        }
168
169        let (delta, consumed) = decode_vbyte(self.data, self.pos);
170        self.pos += consumed;
171
172        let (tf, consumed) = decode_vbyte(self.data, self.pos);
173        self.pos += consumed;
174
175        // Skip positions if present
176        if self.has_positions {
177            for _ in 0..tf {
178                let (_, consumed) = decode_vbyte(self.data, self.pos);
179                self.pos += consumed;
180            }
181        }
182
183        self.current_doc_id += delta;
184        self.remaining -= 1;
185
186        Some((DocId(self.current_doc_id), tf))
187    }
188}
189
190// --- Position-aware posting list ---
191
192/// Builds a posting list that includes term positions (for phrase queries).
193///
194/// Format:
195/// ```text
196/// [num_docs: u32] [flags: u8 = 0x01]
197/// Per doc: [delta: vbyte] [tf: vbyte] [pos_0: vbyte] [pos_delta_1: vbyte] ...
198/// ```
199///
200/// Positions are delta-encoded within each document.
201pub struct PositionPostingListWriter {
202    buf: Vec<u8>,
203    count: u32,
204    last_doc_id: u32,
205}
206
207/// Flags byte values.
208const FLAG_HAS_POSITIONS: u8 = 0x01;
209const FLAG_BLOCK_MAX: u8 = 0x02;
210
211/// Block size for block-max posting lists (matches Lucene convention).
212const BLOCK_SIZE: usize = 128;
213
214impl PositionPostingListWriter {
215    pub fn new() -> Self {
216        Self {
217            buf: Vec::new(),
218            count: 0,
219            last_doc_id: 0,
220        }
221    }
222
223    /// Add a posting with positions. `doc_id` must be strictly increasing.
224    /// `positions` must be sorted ascending.
225    pub fn add(&mut self, doc_id: DocId, positions: &[u32]) {
226        let id = doc_id.as_u32();
227        let delta = if self.count == 0 {
228            id
229        } else {
230            debug_assert!(id > self.last_doc_id);
231            id - self.last_doc_id
232        };
233
234        let tf = positions.len() as u32;
235        encode_vbyte(delta, &mut self.buf);
236        encode_vbyte(tf, &mut self.buf);
237
238        // Delta-encode positions
239        let mut last_pos = 0u32;
240        for &pos in positions {
241            encode_vbyte(pos - last_pos, &mut self.buf);
242            last_pos = pos;
243        }
244
245        self.last_doc_id = id;
246        self.count += 1;
247    }
248
249    pub fn finish(self) -> Vec<u8> {
250        let mut result = Vec::with_capacity(5 + self.buf.len());
251        result.extend_from_slice(&self.count.to_le_bytes());
252        result.push(FLAG_HAS_POSITIONS);
253        result.extend_from_slice(&self.buf);
254        result
255    }
256}
257
258impl Default for PositionPostingListWriter {
259    fn default() -> Self {
260        Self::new()
261    }
262}
263
264/// Iterates over a position-aware posting list.
265///
266/// Uses a reusable internal buffer for positions to avoid per-document
267/// heap allocation. Supports `advance()` to skip intermediate documents
268/// without decoding their position data.
269pub struct PositionPostingListReader<'a> {
270    data: &'a [u8],
271    pos: usize,
272    remaining: u32,
273    current_doc_id: u32,
274    /// Reusable position buffer — cleared and refilled on each next/advance.
275    position_buf: Vec<u32>,
276    /// Cached first position and TF for the fast TF=1 path.
277    cached_first_pos: u32,
278    cached_tf: u32,
279}
280
281impl<'a> PositionPostingListReader<'a> {
282    /// Create a reader from position-aware posting list bytes.
283    pub fn new(data: &'a [u8]) -> Self {
284        let count = if data.len() >= 5 {
285            u32::from_le_bytes([data[0], data[1], data[2], data[3]])
286        } else {
287            0
288        };
289        Self {
290            data,
291            pos: 5, // skip num_docs (4) + flags (1)
292            remaining: count,
293            current_doc_id: 0,
294            position_buf: Vec::new(),
295            cached_first_pos: 0,
296            cached_tf: 0,
297        }
298    }
299
300    pub fn len(&self) -> u32 {
301        if self.data.len() >= 4 {
302            u32::from_le_bytes([self.data[0], self.data[1], self.data[2], self.data[3]])
303        } else {
304            0
305        }
306    }
307
308    pub fn is_empty(&self) -> bool {
309        self.len() == 0
310    }
311
312    /// Current document's positions (valid after next() or advance()).
313    pub fn positions(&self) -> &[u32] {
314        &self.position_buf
315    }
316
317    /// The first position for the current document. For TF=1 docs (common
318    /// in phrase queries), this is the only position. Uses cached value,
319    /// no Vec access.
320    #[inline(always)]
321    pub fn first_position(&self) -> u32 {
322        self.cached_first_pos
323    }
324
325    /// Term frequency of the current document (valid after next/advance).
326    #[inline(always)]
327    pub fn current_tf(&self) -> u32 {
328        self.cached_tf
329    }
330
331    /// Decode positions for the current document into the internal buffer.
332    /// For TF=1, only caches the single position (no Vec push).
333    fn decode_positions(&mut self, tf: u32) {
334        self.cached_tf = tf;
335        let (first_delta, consumed) = decode_vbyte(self.data, self.pos);
336        self.pos += consumed;
337        self.cached_first_pos = first_delta;
338
339        if tf == 1 {
340            // Fast path: single position, no Vec needed
341            self.position_buf.clear();
342            self.position_buf.push(first_delta);
343        } else {
344            // General path: decode remaining positions into Vec
345            self.position_buf.clear();
346            self.position_buf.push(first_delta);
347            let mut last_pos = first_delta;
348            for _ in 1..tf {
349                let (pos_delta, consumed) = decode_vbyte(self.data, self.pos);
350                self.pos += consumed;
351                last_pos += pos_delta;
352                self.position_buf.push(last_pos);
353            }
354        }
355    }
356
357    /// Skip positions for a document without decoding values.
358    fn skip_positions(&mut self, tf: u32) {
359        for _ in 0..tf {
360            let (_, consumed) = decode_vbyte(self.data, self.pos);
361            self.pos += consumed;
362        }
363    }
364
365    /// Read the next posting with positions.
366    pub fn next(&mut self) -> Option<(DocId, Vec<u32>)> {
367        if self.remaining == 0 {
368            return None;
369        }
370
371        let (delta, consumed) = decode_vbyte(self.data, self.pos);
372        self.pos += consumed;
373
374        let (tf, consumed) = decode_vbyte(self.data, self.pos);
375        self.pos += consumed;
376
377        self.current_doc_id += delta;
378        self.decode_positions(tf);
379        self.remaining -= 1;
380
381        Some((DocId(self.current_doc_id), self.position_buf.clone()))
382    }
383
384    /// Advance to the first document >= target, decoding its positions.
385    /// Skips intermediate documents without decoding their position data.
386    /// Returns the doc_id if found, or None if exhausted.
387    /// After calling, use `positions()` to access the decoded positions.
388    pub fn advance(&mut self, target: DocId) -> Option<DocId> {
389        let target_val = target.as_u32();
390        while self.remaining > 0 {
391            let (delta, consumed) = decode_vbyte(self.data, self.pos);
392            self.pos += consumed;
393
394            let (tf, consumed) = decode_vbyte(self.data, self.pos);
395            self.pos += consumed;
396
397            self.current_doc_id += delta;
398            self.remaining -= 1;
399
400            if self.current_doc_id >= target_val {
401                self.decode_positions(tf);
402                return Some(DocId(self.current_doc_id));
403            }
404
405            // Skip positions without decoding
406            self.skip_positions(tf);
407        }
408        None
409    }
410
411    /// Advance to the next document sequentially (no target, no skip check).
412    /// Faster than advance() when iterating all docs in order.
413    /// For TF=1 (most common), skips Vec operations entirely.
414    #[inline(always)]
415    pub fn next_doc(&mut self) -> Option<DocId> {
416        if self.remaining == 0 {
417            return None;
418        }
419
420        let (delta, consumed) = decode_vbyte(self.data, self.pos);
421        self.pos += consumed;
422
423        let (tf, consumed) = decode_vbyte(self.data, self.pos);
424        self.pos += consumed;
425
426        self.current_doc_id += delta;
427        self.cached_tf = tf;
428
429        // Decode first position (always needed)
430        let (first_delta, consumed) = decode_vbyte(self.data, self.pos);
431        self.pos += consumed;
432        self.cached_first_pos = first_delta;
433
434        if tf > 1 {
435            // Rare: multi-position doc, decode remaining into Vec
436            self.position_buf.clear();
437            self.position_buf.push(first_delta);
438            let mut last_pos = first_delta;
439            for _ in 1..tf {
440                let (pos_delta, consumed) = decode_vbyte(self.data, self.pos);
441                self.pos += consumed;
442                last_pos += pos_delta;
443                self.position_buf.push(last_pos);
444            }
445        }
446        // For TF=1, skip Vec entirely — use cached_first_pos
447
448        self.remaining -= 1;
449        Some(DocId(self.current_doc_id))
450    }
451
452    /// Current document ID (valid after next() or advance()).
453    pub fn current_doc_id(&self) -> u32 {
454        self.current_doc_id
455    }
456}
457
458// --- Block-Max posting list ---
459
460/// Builds a posting list with per-block max TF metadata for WAND optimization.
461///
462/// Format:
463/// ```text
464/// [num_docs: u32] [flags: u8 = 0x02] [num_blocks: u16]
465/// [block_headers: (last_doc_id: u32, max_tf: u16, data_len: u16) × num_blocks]
466/// [block_data: delta-encoded doc_ids + TFs per block]
467/// ```
468///
469/// See [[architecture-query-execution#WAND / MaxScore Optimization]].
470pub struct BlockMaxPostingListWriter {
471    entries: Vec<(u32, u32)>, // (doc_id, tf)
472}
473
474impl BlockMaxPostingListWriter {
475    pub fn new() -> Self {
476        Self {
477            entries: Vec::new(),
478        }
479    }
480
481    pub fn add(&mut self, doc_id: DocId, tf: u32) {
482        self.entries.push((doc_id.as_u32(), tf));
483    }
484
485    pub fn finish(self) -> Vec<u8> {
486        let num_docs = self.entries.len() as u32;
487        let num_blocks = if self.entries.is_empty() {
488            0u16
489        } else {
490            ((self.entries.len() + BLOCK_SIZE - 1) / BLOCK_SIZE) as u16
491        };
492
493        // Build block data and headers
494        let mut block_headers: Vec<(u32, u16, u16)> = Vec::with_capacity(num_blocks as usize);
495        let mut block_data_bufs: Vec<Vec<u8>> = Vec::with_capacity(num_blocks as usize);
496
497        for block_idx in 0..num_blocks as usize {
498            let start = block_idx * BLOCK_SIZE;
499            let end = ((block_idx + 1) * BLOCK_SIZE).min(self.entries.len());
500            let block_entries = &self.entries[start..end];
501
502            let last_doc_id = block_entries.last().unwrap().0;
503            let max_tf = block_entries.iter().map(|e| e.1).max().unwrap();
504            let max_tf_u16 = if max_tf > u16::MAX as u32 {
505                u16::MAX
506            } else {
507                max_tf as u16
508            };
509
510            // Delta-encode within block (delta from previous block's last doc, or 0)
511            let mut buf = Vec::new();
512            let base_doc_id = if block_idx == 0 {
513                0u32
514            } else {
515                self.entries[start - 1].0
516            };
517            let mut prev = base_doc_id;
518            for &(doc_id, tf) in block_entries {
519                encode_vbyte(doc_id - prev, &mut buf);
520                encode_vbyte(tf, &mut buf);
521                prev = doc_id;
522            }
523
524            let data_len = buf.len() as u16;
525            block_headers.push((last_doc_id, max_tf_u16, data_len));
526            block_data_bufs.push(buf);
527        }
528
529        // Assemble final output
530        let header_bytes = num_blocks as usize * 8;
531        let data_bytes: usize = block_data_bufs.iter().map(|b| b.len()).sum();
532        let mut result = Vec::with_capacity(4 + 1 + 2 + header_bytes + data_bytes);
533
534        result.extend_from_slice(&num_docs.to_le_bytes());
535        result.push(FLAG_BLOCK_MAX);
536        result.extend_from_slice(&num_blocks.to_le_bytes());
537
538        // Block headers
539        for &(last_doc_id, max_tf, data_len) in &block_headers {
540            result.extend_from_slice(&last_doc_id.to_le_bytes());
541            result.extend_from_slice(&max_tf.to_le_bytes());
542            result.extend_from_slice(&data_len.to_le_bytes());
543        }
544
545        // Block data
546        for buf in block_data_bufs {
547            result.extend_from_slice(&buf);
548        }
549
550        result
551    }
552}
553
554impl Default for BlockMaxPostingListWriter {
555    fn default() -> Self {
556        Self::new()
557    }
558}
559
560/// Iterates over a block-max posting list, yielding (DocId, TF) pairs.
561///
562/// Supports block-level skipping via `advance_to_block()` for WAND optimization.
563pub struct BlockMaxPostingListReader<'a> {
564    data: &'a [u8],
565    num_docs: u32,
566    num_blocks: u16,
567    headers_start: usize,
568
569    // Per-block cumulative data offsets (computed once at construction)
570    block_data_offsets: Vec<usize>,
571
572    // Iteration state
573    current_block: u16,
574    pos_in_data: usize,
575    remaining_in_block: u16,
576    current_doc_id: u32,
577    total_remaining: u32,
578}
579
580impl<'a> BlockMaxPostingListReader<'a> {
581    pub fn new(data: &'a [u8]) -> Self {
582        let num_docs = u32::from_le_bytes(data[0..4].try_into().unwrap());
583        debug_assert_eq!(data[4], FLAG_BLOCK_MAX);
584        let num_blocks = u16::from_le_bytes(data[5..7].try_into().unwrap());
585
586        let headers_start = 7;
587        let block_data_start = headers_start + num_blocks as usize * 8;
588
589        // Precompute cumulative block data offsets
590        let mut block_data_offsets = Vec::with_capacity(num_blocks as usize + 1);
591        let mut offset = block_data_start;
592        for i in 0..num_blocks as usize {
593            block_data_offsets.push(offset);
594            let hdr_pos = headers_start + i * 8;
595            let data_len =
596                u16::from_le_bytes(data[hdr_pos + 6..hdr_pos + 8].try_into().unwrap()) as usize;
597            offset += data_len;
598        }
599        block_data_offsets.push(offset); // sentinel
600
601        let first_block_docs = if num_blocks > 0 {
602            let total = num_docs as usize;
603            let _full_blocks = if num_blocks > 1 {
604                (num_blocks as usize - 1) * BLOCK_SIZE
605            } else {
606                0
607            };
608            if num_blocks == 1 {
609                total as u16
610            } else {
611                BLOCK_SIZE as u16
612            }
613        } else {
614            0
615        };
616
617        Self {
618            data,
619            num_docs,
620            num_blocks,
621            headers_start,
622            block_data_offsets,
623            current_block: 0,
624            pos_in_data: block_data_start,
625            remaining_in_block: first_block_docs,
626            current_doc_id: 0,
627            total_remaining: num_docs,
628        }
629    }
630
631    /// Number of postings in this list.
632    pub fn len(&self) -> u32 {
633        self.num_docs
634    }
635
636    pub fn is_empty(&self) -> bool {
637        self.num_docs == 0
638    }
639
640    pub fn num_blocks(&self) -> u16 {
641        self.num_blocks
642    }
643
644    /// Max TF for a given block.
645    pub fn block_max_tf(&self, block: u16) -> u16 {
646        let hdr_pos = self.headers_start + block as usize * 8;
647        u16::from_le_bytes(self.data[hdr_pos + 4..hdr_pos + 6].try_into().unwrap())
648    }
649
650    /// Last doc ID in a given block.
651    pub fn block_last_doc(&self, block: u16) -> u32 {
652        let hdr_pos = self.headers_start + block as usize * 8;
653        u32::from_le_bytes(self.data[hdr_pos..hdr_pos + 4].try_into().unwrap())
654    }
655
656    /// Number of docs in a given block.
657    fn block_doc_count(&self, block: u16) -> u16 {
658        if (block as usize) < self.num_blocks as usize - 1 {
659            BLOCK_SIZE as u16
660        } else {
661            // Last block: remaining docs
662            let full_blocks = (self.num_blocks as usize - 1) * BLOCK_SIZE;
663            (self.num_docs as usize - full_blocks) as u16
664        }
665    }
666
667    /// Position the block pointer at the block containing `target` without
668    /// decoding individual postings. Used by WAND to read `block_max_tf()`
669    /// for the correct block.
670    ///
671    /// This only updates `current_block` — it does NOT advance the doc
672    /// iteration state. Call `advance_to_block()` for full block skipping.
673    pub fn advance_shallow(&mut self, target: DocId) {
674        let target_val = target.as_u32();
675        // Binary search over block headers
676        let mut lo = self.current_block as usize;
677        let mut hi = self.num_blocks as usize;
678        while lo < hi {
679            let mid = lo + (hi - lo) / 2;
680            if self.block_last_doc(mid as u16) < target_val {
681                lo = mid + 1;
682            } else {
683                hi = mid;
684            }
685        }
686        if lo < self.num_blocks as usize {
687            self.current_block = lo as u16;
688        }
689    }
690
691    /// Skip to the first block whose last_doc_id >= target.
692    /// Positions the reader at the start of that block.
693    pub fn advance_to_block(&mut self, target: DocId) {
694        let target_val = target.as_u32();
695
696        // Binary search over block headers
697        let mut lo = self.current_block as usize;
698        let mut hi = self.num_blocks as usize;
699        while lo < hi {
700            let mid = lo + (hi - lo) / 2;
701            if self.block_last_doc(mid as u16) < target_val {
702                lo = mid + 1;
703            } else {
704                hi = mid;
705            }
706        }
707
708        if lo >= self.num_blocks as usize {
709            // Past all blocks
710            self.total_remaining = 0;
711            self.remaining_in_block = 0;
712            return;
713        }
714
715        // Skip to block `lo`
716        self.seek_to_block(lo as u16);
717    }
718
719    /// Position reader at the start of a specific block.
720    fn seek_to_block(&mut self, block: u16) {
721        if block >= self.num_blocks {
722            self.total_remaining = 0;
723            self.remaining_in_block = 0;
724            return;
725        }
726
727        // Compute docs remaining from this block onward
728        let docs_before_block = block as usize * BLOCK_SIZE;
729        self.total_remaining = self.num_docs.saturating_sub(docs_before_block as u32);
730        self.remaining_in_block = self.block_doc_count(block);
731        self.current_block = block;
732        self.pos_in_data = self.block_data_offsets[block as usize];
733
734        // Set current_doc_id to the end of the previous block
735        if block == 0 {
736            self.current_doc_id = 0;
737        } else {
738            self.current_doc_id = self.block_last_doc(block - 1);
739        }
740    }
741
742    /// Read the next (DocId, TF) pair, or None if exhausted.
743    pub fn next(&mut self) -> Option<(DocId, u32)> {
744        if self.total_remaining == 0 {
745            return None;
746        }
747
748        // Move to next block if current block is exhausted
749        if self.remaining_in_block == 0 {
750            let next_block = self.current_block + 1;
751            if next_block >= self.num_blocks {
752                self.total_remaining = 0;
753                return None;
754            }
755            self.current_block = next_block;
756            self.remaining_in_block = self.block_doc_count(next_block);
757            self.pos_in_data = self.block_data_offsets[next_block as usize];
758        }
759
760        let (delta, consumed) = decode_vbyte(self.data, self.pos_in_data);
761        self.pos_in_data += consumed;
762        let (tf, consumed) = decode_vbyte(self.data, self.pos_in_data);
763        self.pos_in_data += consumed;
764
765        self.current_doc_id += delta;
766        self.remaining_in_block -= 1;
767        self.total_remaining -= 1;
768
769        Some((DocId(self.current_doc_id), tf))
770    }
771
772    /// Which block is currently being read.
773    pub fn current_block_idx(&self) -> u16 {
774        self.current_block
775    }
776}
777
778/// Check whether a posting list uses the block-max format.
779pub fn has_block_max(data: &[u8]) -> bool {
780    data.len() >= 5 && data[4] == FLAG_BLOCK_MAX
781}
782
783/// Check whether a posting list byte slice has positions encoded.
784pub fn has_positions(data: &[u8]) -> bool {
785    data.len() >= 5 && data[4] == FLAG_HAS_POSITIONS
786}
787
788#[cfg(test)]
789mod tests {
790    use super::*;
791
792    #[test]
793    fn vbyte_single_byte() {
794        // Values 0-127 fit in one byte
795        let mut buf = Vec::new();
796        encode_vbyte(0, &mut buf);
797        assert_eq!(buf.len(), 1);
798        let (val, consumed) = decode_vbyte(&buf, 0);
799        assert_eq!(val, 0);
800        assert_eq!(consumed, 1);
801    }
802
803    #[test]
804    fn vbyte_boundary_127() {
805        let mut buf = Vec::new();
806        encode_vbyte(127, &mut buf);
807        assert_eq!(buf.len(), 1);
808        let (val, _) = decode_vbyte(&buf, 0);
809        assert_eq!(val, 127);
810    }
811
812    #[test]
813    fn vbyte_boundary_128() {
814        let mut buf = Vec::new();
815        encode_vbyte(128, &mut buf);
816        assert_eq!(buf.len(), 2);
817        let (val, consumed) = decode_vbyte(&buf, 0);
818        assert_eq!(val, 128);
819        assert_eq!(consumed, 2);
820    }
821
822    #[test]
823    fn vbyte_16383() {
824        let mut buf = Vec::new();
825        encode_vbyte(16383, &mut buf);
826        assert_eq!(buf.len(), 2);
827        let (val, _) = decode_vbyte(&buf, 0);
828        assert_eq!(val, 16383);
829    }
830
831    #[test]
832    fn vbyte_16384() {
833        let mut buf = Vec::new();
834        encode_vbyte(16384, &mut buf);
835        assert_eq!(buf.len(), 3);
836        let (val, _) = decode_vbyte(&buf, 0);
837        assert_eq!(val, 16384);
838    }
839
840    #[test]
841    fn vbyte_large_value() {
842        let mut buf = Vec::new();
843        let val = u32::MAX - 1;
844        encode_vbyte(val, &mut buf);
845        assert_eq!(buf.len(), 5);
846        let (decoded, _) = decode_vbyte(&buf, 0);
847        assert_eq!(decoded, val);
848    }
849
850    #[test]
851    fn vbyte_max_value() {
852        let mut buf = Vec::new();
853        encode_vbyte(u32::MAX, &mut buf);
854        let (decoded, _) = decode_vbyte(&buf, 0);
855        assert_eq!(decoded, u32::MAX);
856    }
857
858    #[test]
859    fn round_trip_single_doc() {
860        let mut writer = PostingListWriter::new();
861        writer.add(DocId(5), 3);
862        let data = writer.finish();
863
864        let mut reader = PostingListReader::new(&data);
865        assert_eq!(reader.len(), 1);
866        assert_eq!(reader.next(), Some((DocId(5), 3)));
867        assert_eq!(reader.next(), None);
868    }
869
870    #[test]
871    fn round_trip_multiple_docs() {
872        let mut writer = PostingListWriter::new();
873        writer.add(DocId(1), 2);
874        writer.add(DocId(5), 1);
875        writer.add(DocId(100), 4);
876        writer.add(DocId(1000), 1);
877        let data = writer.finish();
878
879        let mut reader = PostingListReader::new(&data);
880        assert_eq!(reader.len(), 4);
881        assert_eq!(reader.next(), Some((DocId(1), 2)));
882        assert_eq!(reader.next(), Some((DocId(5), 1)));
883        assert_eq!(reader.next(), Some((DocId(100), 4)));
884        assert_eq!(reader.next(), Some((DocId(1000), 1)));
885        assert_eq!(reader.next(), None);
886    }
887
888    #[test]
889    fn round_trip_consecutive_doc_ids() {
890        let mut writer = PostingListWriter::new();
891        for i in 0..10 {
892            writer.add(DocId(i), 1);
893        }
894        let data = writer.finish();
895
896        let mut reader = PostingListReader::new(&data);
897        assert_eq!(reader.len(), 10);
898        for i in 0..10 {
899            assert_eq!(reader.next(), Some((DocId(i), 1)));
900        }
901        assert_eq!(reader.next(), None);
902    }
903
904    #[test]
905    fn round_trip_varying_tf() {
906        let mut writer = PostingListWriter::new();
907        writer.add(DocId(0), 0);
908        writer.add(DocId(1), 1);
909        writer.add(DocId(2), 127);
910        writer.add(DocId(3), 128);
911        writer.add(DocId(4), 10000);
912        let data = writer.finish();
913
914        let mut reader = PostingListReader::new(&data);
915        assert_eq!(reader.next(), Some((DocId(0), 0)));
916        assert_eq!(reader.next(), Some((DocId(1), 1)));
917        assert_eq!(reader.next(), Some((DocId(2), 127)));
918        assert_eq!(reader.next(), Some((DocId(3), 128)));
919        assert_eq!(reader.next(), Some((DocId(4), 10000)));
920    }
921
922    #[test]
923    fn empty_posting_list() {
924        let writer = PostingListWriter::new();
925        let data = writer.finish();
926
927        let mut reader = PostingListReader::new(&data);
928        assert_eq!(reader.len(), 0);
929        assert!(reader.is_empty());
930        assert_eq!(reader.next(), None);
931    }
932
933    #[test]
934    fn large_posting_list() {
935        let count = 10_000;
936        let mut writer = PostingListWriter::new();
937        for i in 0..count {
938            writer.add(DocId(i * 3), (i % 10) + 1);
939        }
940        let data = writer.finish();
941
942        let mut reader = PostingListReader::new(&data);
943        assert_eq!(reader.len(), count);
944        for i in 0..count {
945            let (doc_id, tf) = reader.next().unwrap();
946            assert_eq!(doc_id, DocId(i * 3));
947            assert_eq!(tf, (i % 10) + 1);
948        }
949        assert_eq!(reader.next(), None);
950    }
951
952    #[test]
953    fn delta_encoding_compresses() {
954        // Consecutive IDs should compress well (delta=1 each, 1 byte per delta).
955        let mut writer = PostingListWriter::new();
956        for i in 0..1000 {
957            writer.add(DocId(i), 1);
958        }
959        let data = writer.finish();
960
961        // 4 bytes header + ~1 byte per delta + ~1 byte per tf = ~2004 bytes.
962        // Without delta encoding it would be ~5000+ bytes.
963        assert!(data.len() < 3000);
964    }
965
966    #[test]
967    fn wide_gap_doc_ids() {
968        let mut writer = PostingListWriter::new();
969        writer.add(DocId(0), 1);
970        writer.add(DocId(1_000_000), 1);
971        writer.add(DocId(2_000_000), 1);
972        let data = writer.finish();
973
974        let mut reader = PostingListReader::new(&data);
975        assert_eq!(reader.next(), Some((DocId(0), 1)));
976        assert_eq!(reader.next(), Some((DocId(1_000_000), 1)));
977        assert_eq!(reader.next(), Some((DocId(2_000_000), 1)));
978        assert_eq!(reader.next(), None);
979    }
980
981    #[test]
982    fn iterator_exhaustion_is_stable() {
983        let mut writer = PostingListWriter::new();
984        writer.add(DocId(1), 1);
985        let data = writer.finish();
986
987        let mut reader = PostingListReader::new(&data);
988        assert!(reader.next().is_some());
989        assert_eq!(reader.next(), None);
990        assert_eq!(reader.next(), None); // calling again is safe
991        assert_eq!(reader.next(), None);
992    }
993
994    #[test]
995    fn doc_id_starting_at_zero() {
996        let mut writer = PostingListWriter::new();
997        writer.add(DocId(0), 5);
998        let data = writer.finish();
999
1000        let mut reader = PostingListReader::new(&data);
1001        assert_eq!(reader.next(), Some((DocId(0), 5)));
1002    }
1003
1004    // --- Position-aware posting list tests ---
1005
1006    #[test]
1007    fn position_round_trip_single_doc() {
1008        let mut writer = PositionPostingListWriter::new();
1009        writer.add(DocId(0), &[0, 3, 7]);
1010        let data = writer.finish();
1011
1012        assert!(has_positions(&data));
1013        let mut reader = PositionPostingListReader::new(&data);
1014        assert_eq!(reader.len(), 1);
1015        let (doc_id, positions) = reader.next().unwrap();
1016        assert_eq!(doc_id, DocId(0));
1017        assert_eq!(positions, vec![0, 3, 7]);
1018        assert!(reader.next().is_none());
1019    }
1020
1021    #[test]
1022    fn position_round_trip_multiple_docs() {
1023        let mut writer = PositionPostingListWriter::new();
1024        writer.add(DocId(1), &[0, 1]);
1025        writer.add(DocId(5), &[2, 5, 8]);
1026        writer.add(DocId(10), &[0]);
1027        let data = writer.finish();
1028
1029        let mut reader = PositionPostingListReader::new(&data);
1030        assert_eq!(reader.len(), 3);
1031
1032        let (id, pos) = reader.next().unwrap();
1033        assert_eq!(id, DocId(1));
1034        assert_eq!(pos, vec![0, 1]);
1035
1036        let (id, pos) = reader.next().unwrap();
1037        assert_eq!(id, DocId(5));
1038        assert_eq!(pos, vec![2, 5, 8]);
1039
1040        let (id, pos) = reader.next().unwrap();
1041        assert_eq!(id, DocId(10));
1042        assert_eq!(pos, vec![0]);
1043
1044        assert!(reader.next().is_none());
1045    }
1046
1047    #[test]
1048    fn position_consecutive_positions() {
1049        let mut writer = PositionPostingListWriter::new();
1050        writer.add(DocId(0), &[0, 1, 2, 3, 4]);
1051        let data = writer.finish();
1052
1053        let mut reader = PositionPostingListReader::new(&data);
1054        let (_, pos) = reader.next().unwrap();
1055        assert_eq!(pos, vec![0, 1, 2, 3, 4]);
1056    }
1057
1058    #[test]
1059    fn position_gapped_positions() {
1060        let mut writer = PositionPostingListWriter::new();
1061        writer.add(DocId(0), &[0, 100, 200, 5000]);
1062        let data = writer.finish();
1063
1064        let mut reader = PositionPostingListReader::new(&data);
1065        let (_, pos) = reader.next().unwrap();
1066        assert_eq!(pos, vec![0, 100, 200, 5000]);
1067    }
1068
1069    #[test]
1070    fn position_empty_list() {
1071        let writer = PositionPostingListWriter::new();
1072        let data = writer.finish();
1073
1074        let mut reader = PositionPostingListReader::new(&data);
1075        assert_eq!(reader.len(), 0);
1076        assert!(reader.is_empty());
1077        assert!(reader.next().is_none());
1078    }
1079
1080    #[test]
1081    fn position_single_position() {
1082        let mut writer = PositionPostingListWriter::new();
1083        writer.add(DocId(42), &[7]);
1084        let data = writer.finish();
1085
1086        let mut reader = PositionPostingListReader::new(&data);
1087        let (id, pos) = reader.next().unwrap();
1088        assert_eq!(id, DocId(42));
1089        assert_eq!(pos, vec![7]);
1090    }
1091
1092    #[test]
1093    fn position_many_docs() {
1094        let mut writer = PositionPostingListWriter::new();
1095        for i in 0..1000u32 {
1096            writer.add(DocId(i), &[i * 2, i * 2 + 1]);
1097        }
1098        let data = writer.finish();
1099
1100        let mut reader = PositionPostingListReader::new(&data);
1101        assert_eq!(reader.len(), 1000);
1102        for i in 0..1000u32 {
1103            let (id, pos) = reader.next().unwrap();
1104            assert_eq!(id, DocId(i));
1105            assert_eq!(pos, vec![i * 2, i * 2 + 1]);
1106        }
1107        assert!(reader.next().is_none());
1108    }
1109
1110    #[test]
1111    fn has_positions_flag() {
1112        // Position-aware format has the flag
1113        let mut pw = PositionPostingListWriter::new();
1114        pw.add(DocId(0), &[0]);
1115        let pdata = pw.finish();
1116        assert!(has_positions(&pdata));
1117
1118        // Original format does not
1119        let mut w = PostingListWriter::new();
1120        w.add(DocId(0), 1);
1121        let data = w.finish();
1122        assert!(!has_positions(&data));
1123    }
1124
1125    #[test]
1126    fn position_exhaustion_stable() {
1127        let mut writer = PositionPostingListWriter::new();
1128        writer.add(DocId(0), &[0]);
1129        let data = writer.finish();
1130
1131        let mut reader = PositionPostingListReader::new(&data);
1132        assert!(reader.next().is_some());
1133        assert!(reader.next().is_none());
1134        assert!(reader.next().is_none());
1135    }
1136
1137    // --- Block-max posting list tests ---
1138
1139    #[test]
1140    fn block_max_single_doc() {
1141        let mut writer = BlockMaxPostingListWriter::new();
1142        writer.add(DocId(5), 3);
1143        let data = writer.finish();
1144
1145        assert!(has_block_max(&data));
1146        let mut reader = BlockMaxPostingListReader::new(&data);
1147        assert_eq!(reader.len(), 1);
1148        assert_eq!(reader.num_blocks(), 1);
1149        assert_eq!(reader.block_last_doc(0), 5);
1150        assert_eq!(reader.block_max_tf(0), 3);
1151        assert_eq!(reader.next(), Some((DocId(5), 3)));
1152        assert_eq!(reader.next(), None);
1153    }
1154
1155    #[test]
1156    fn block_max_under_block_size() {
1157        let mut writer = BlockMaxPostingListWriter::new();
1158        for i in 0..50 {
1159            writer.add(DocId(i * 2), (i % 5) + 1);
1160        }
1161        let data = writer.finish();
1162
1163        let mut reader = BlockMaxPostingListReader::new(&data);
1164        assert_eq!(reader.len(), 50);
1165        assert_eq!(reader.num_blocks(), 1);
1166        assert_eq!(reader.block_last_doc(0), 98);
1167        assert_eq!(reader.block_max_tf(0), 5); // max of (i%5)+1
1168
1169        for i in 0..50 {
1170            assert_eq!(reader.next(), Some((DocId(i * 2), (i % 5) + 1)));
1171        }
1172        assert_eq!(reader.next(), None);
1173    }
1174
1175    #[test]
1176    fn block_max_exact_block_size() {
1177        let mut writer = BlockMaxPostingListWriter::new();
1178        for i in 0..128 {
1179            writer.add(DocId(i), 1);
1180        }
1181        let data = writer.finish();
1182
1183        let mut reader = BlockMaxPostingListReader::new(&data);
1184        assert_eq!(reader.len(), 128);
1185        assert_eq!(reader.num_blocks(), 1);
1186        assert_eq!(reader.block_last_doc(0), 127);
1187
1188        for i in 0..128 {
1189            assert_eq!(reader.next(), Some((DocId(i), 1)));
1190        }
1191        assert_eq!(reader.next(), None);
1192    }
1193
1194    #[test]
1195    fn block_max_multi_block() {
1196        // 300 docs = 2 full blocks (128) + 1 partial (44)
1197        let mut writer = BlockMaxPostingListWriter::new();
1198        for i in 0..300u32 {
1199            writer.add(DocId(i), (i % 10) + 1);
1200        }
1201        let data = writer.finish();
1202
1203        let mut reader = BlockMaxPostingListReader::new(&data);
1204        assert_eq!(reader.len(), 300);
1205        assert_eq!(reader.num_blocks(), 3);
1206
1207        // Block 0: docs 0-127
1208        assert_eq!(reader.block_last_doc(0), 127);
1209        assert_eq!(reader.block_max_tf(0), 10); // max of (0..128 % 10) + 1
1210
1211        // Block 1: docs 128-255
1212        assert_eq!(reader.block_last_doc(1), 255);
1213
1214        // Block 2: docs 256-299
1215        assert_eq!(reader.block_last_doc(2), 299);
1216
1217        // Full iteration
1218        for i in 0..300u32 {
1219            assert_eq!(reader.next(), Some((DocId(i), (i % 10) + 1)));
1220        }
1221        assert_eq!(reader.next(), None);
1222    }
1223
1224    #[test]
1225    fn block_max_advance_to_block() {
1226        let mut writer = BlockMaxPostingListWriter::new();
1227        for i in 0..300u32 {
1228            writer.add(DocId(i * 3), 1);
1229        }
1230        let data = writer.finish();
1231
1232        let mut reader = BlockMaxPostingListReader::new(&data);
1233        // Block 0: docs 0..127*3 = 0..381
1234        // Block 1: docs 128*3..255*3 = 384..765
1235        // Block 2: docs 256*3..299*3 = 768..897
1236
1237        // Advance to block containing doc >= 400
1238        reader.advance_to_block(DocId(400));
1239        // Block 1 has last_doc = 255*3 = 765, which is >= 400
1240        let (doc, _) = reader.next().unwrap();
1241        assert_eq!(doc, DocId(384)); // first doc of block 1
1242
1243        // Advance to block containing doc >= 800
1244        reader.advance_to_block(DocId(800));
1245        let (doc, _) = reader.next().unwrap();
1246        assert_eq!(doc, DocId(768)); // first doc of block 2
1247    }
1248
1249    #[test]
1250    fn block_max_advance_past_end() {
1251        let mut writer = BlockMaxPostingListWriter::new();
1252        for i in 0..10 {
1253            writer.add(DocId(i), 1);
1254        }
1255        let data = writer.finish();
1256
1257        let mut reader = BlockMaxPostingListReader::new(&data);
1258        reader.advance_to_block(DocId(100));
1259        assert_eq!(reader.next(), None);
1260    }
1261
1262    #[test]
1263    fn block_max_empty() {
1264        let writer = BlockMaxPostingListWriter::new();
1265        let data = writer.finish();
1266
1267        let mut reader = BlockMaxPostingListReader::new(&data);
1268        assert_eq!(reader.len(), 0);
1269        assert!(reader.is_empty());
1270        assert_eq!(reader.num_blocks(), 0);
1271        assert_eq!(reader.next(), None);
1272    }
1273
1274    #[test]
1275    fn block_max_large_tf_clamped() {
1276        let mut writer = BlockMaxPostingListWriter::new();
1277        writer.add(DocId(0), 70000); // > u16::MAX
1278        let data = writer.finish();
1279
1280        let mut reader = BlockMaxPostingListReader::new(&data);
1281        assert_eq!(reader.block_max_tf(0), u16::MAX); // clamped
1282        // But actual TF is preserved
1283        assert_eq!(reader.next(), Some((DocId(0), 70000)));
1284    }
1285
1286    #[test]
1287    fn block_max_flag_detected() {
1288        let mut bm = BlockMaxPostingListWriter::new();
1289        bm.add(DocId(0), 1);
1290        let bm_data = bm.finish();
1291        assert!(has_block_max(&bm_data));
1292        assert!(!has_positions(&bm_data));
1293
1294        let mut basic = PostingListWriter::new();
1295        basic.add(DocId(0), 1);
1296        let basic_data = basic.finish();
1297        assert!(!has_block_max(&basic_data));
1298    }
1299}
luci/inverted/postings.rs

luci/inverted/
postings.rs