hermes_core/structures/postings/positions.rs

//! Position-aware posting list for phrase queries and multi-field element tracking
//!
//! Positions are encoded as: (element_ordinal << 20) | token_position
//! This allows up to 4096 elements per field and ~1M tokens per element.
//!
//! ## Block Format
//!
//! Uses a block-based format with a skip list for efficient binary search by doc_id:
//! - Skip list enables O(log n) lookup by doc_id
//! - Blocks of up to 128 documents for cache efficiency
//! - Delta encoding within blocks for compression
//! - Stackable for fast merge (just adjust base_doc_id per block)
//!
//! Serialized layout (footer-based, data first; see `serialize`):
//! ```text
//! Data:
//!   - blocks: each block is a count (u32) + first_doc (u32) prefix followed by
//!     delta-encoded postings with positions
//! Skip list:
//!   - one 20-byte entry per block: (base_doc_id, last_doc_id, offset, length)
//! Footer:
//!   - data_len (u64) + skip_count (u32) + doc_count (u32) = 16 bytes
//! ```
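//!
//! For example, token 7 of element 2 encodes as `(2 << 20) | 7` = `0x0020_0007`,
//! and the decoders below recover `2` and `7` by shifting and masking. Within a
//! block, doc_ids 10, 12, 15 are stored as `first_doc = 10` followed by the
//! vint deltas `2` and `3`.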

use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use std::io::{self, Write};

use super::posting_common::{read_vint, write_vint};
use crate::DocId;

/// Block size for position posting list (same as regular posting list)
pub const POSITION_BLOCK_SIZE: usize = 128;

/// Maximum token position within an element (20 bits = 1,048,575)
pub const MAX_TOKEN_POSITION: u32 = (1 << 20) - 1;

/// Maximum element ordinal (12 bits = 4095)
pub const MAX_ELEMENT_ORDINAL: u32 = (1 << 12) - 1;

/// Encode element ordinal and token position into a single u32
#[inline]
pub fn encode_position(element_ordinal: u32, token_position: u32) -> u32 {
    debug_assert!(
        element_ordinal <= MAX_ELEMENT_ORDINAL,
        "Element ordinal {} exceeds maximum {}",
        element_ordinal,
        MAX_ELEMENT_ORDINAL
    );
    debug_assert!(
        token_position <= MAX_TOKEN_POSITION,
        "Token position {} exceeds maximum {}",
        token_position,
        MAX_TOKEN_POSITION
    );
    (element_ordinal << 20) | (token_position & MAX_TOKEN_POSITION)
}

/// Decode element ordinal from encoded position
#[inline]
pub fn decode_element_ordinal(position: u32) -> u32 {
    position >> 20
}

/// Decode token position from encoded position
#[inline]
pub fn decode_token_position(position: u32) -> u32 {
    position & MAX_TOKEN_POSITION
}

/// A posting entry with positions (used during building)
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PostingWithPositions {
    pub doc_id: DocId,
    pub term_freq: u32,
    /// Encoded positions: (element_ordinal << 20) | token_position
    pub positions: Vec<u32>,
}

/// Block-based position posting list with skip list for O(log n) doc_id lookup
///
/// Similar to BlockPostingList but stores positions per document.
/// Uses binary search on skip list to find the right block, then linear scan within block.
#[derive(Debug, Clone)]
pub struct PositionPostingList {
    /// Skip list: (base_doc_id, last_doc_id, byte_offset)
    /// Enables binary search to find the right block
    skip_list: Vec<(DocId, DocId, u64)>,
    /// Compressed block data
    data: Vec<u8>,
    /// Total document count
    doc_count: u32,
}

impl Default for PositionPostingList {
    fn default() -> Self {
        Self::new()
    }
}

impl PositionPostingList {
    pub fn new() -> Self {
        Self {
            skip_list: Vec::new(),
            data: Vec::new(),
            doc_count: 0,
        }
    }

    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            skip_list: Vec::with_capacity(capacity / POSITION_BLOCK_SIZE + 1),
            data: Vec::with_capacity(capacity * 8), // rough estimate
            doc_count: 0,
        }
    }

    /// Build from a list of postings with positions
    pub fn from_postings(postings: &[PostingWithPositions]) -> io::Result<Self> {
        if postings.is_empty() {
            return Ok(Self::new());
        }

        let mut skip_list = Vec::new();
        let mut data = Vec::new();
        let mut i = 0;

        while i < postings.len() {
            let block_start = data.len() as u64;
            let block_end = (i + POSITION_BLOCK_SIZE).min(postings.len());
            let block = &postings[i..block_end];

            // Record skip entry
            let base_doc_id = block.first().unwrap().doc_id;
            let last_doc_id = block.last().unwrap().doc_id;
            skip_list.push((base_doc_id, last_doc_id, block_start));

            // Write block: fixed u32 count + first_doc (8-byte prefix), then vint deltas
            data.write_u32::<LittleEndian>(block.len() as u32)?;
            data.write_u32::<LittleEndian>(base_doc_id)?;

            let mut prev_doc_id = base_doc_id;
            for (j, posting) in block.iter().enumerate() {
                if j > 0 {
                    let delta = posting.doc_id - prev_doc_id;
                    write_vint(&mut data, delta as u64)?;
                }
                prev_doc_id = posting.doc_id;

                // Write position count, then positions. Positions are stored
                // absolute: delta encoding gains little once element ordinals
                // occupy the high bits.
                write_vint(&mut data, posting.positions.len() as u64)?;
                for &pos in &posting.positions {
                    write_vint(&mut data, pos as u64)?;
                }
            }

            i = block_end;
        }

        Ok(Self {
            skip_list,
            data,
            doc_count: postings.len() as u32,
        })
    }

    /// Add a posting with positions, building the block format incrementally
    pub fn push(&mut self, doc_id: DocId, positions: Vec<u32>) {
        // Incremental building: append to the current block and patch its count
        // header in place; start a new block every POSITION_BLOCK_SIZE docs.
        let posting = PostingWithPositions {
            doc_id,
            term_freq: positions.len() as u32,
            positions,
        };

        // Serialize this posting to data buffer
        let block_start = self.data.len() as u64;

        // Start a new block if this is the first posting or the current block is full
        let need_new_block =
            self.skip_list.is_empty() || self.doc_count.is_multiple_of(POSITION_BLOCK_SIZE as u32);

        if need_new_block {
            // Start new block: fixed u32 count + first_doc (8-byte prefix)
            self.skip_list.push((doc_id, doc_id, block_start));
            self.data.write_u32::<LittleEndian>(1u32).unwrap();
            self.data.write_u32::<LittleEndian>(doc_id).unwrap();
        } else {
            // Add to existing block — update count in-place + add delta
            let last_block = self.skip_list.last_mut().unwrap();
            let prev_doc_id = last_block.1;
            last_block.1 = doc_id;

            // Patch count u32 at block start
            let count_offset = last_block.2 as usize;
            let old_count = u32::from_le_bytes(
                self.data[count_offset..count_offset + 4]
                    .try_into()
                    .unwrap(),
            );
            self.data[count_offset..count_offset + 4]
                .copy_from_slice(&(old_count + 1).to_le_bytes());

            let delta = doc_id - prev_doc_id;
            write_vint(&mut self.data, delta as u64).unwrap();
        }

        // Write positions stored absolute (same rationale as in from_postings)
        write_vint(&mut self.data, posting.positions.len() as u64).unwrap();
        for &pos in &posting.positions {
            write_vint(&mut self.data, pos as u64).unwrap();
        }

        self.doc_count += 1;
    }

    pub fn doc_count(&self) -> u32 {
        self.doc_count
    }

    pub fn len(&self) -> usize {
        self.doc_count as usize
    }

    pub fn is_empty(&self) -> bool {
        self.doc_count == 0
    }

    /// Get positions for a specific document into a reusable buffer (zero allocation).
    /// Returns true if the document was found.
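    ///
    /// A usage sketch (hypothetical `list` and `candidate_docs` bindings),
    /// reusing one buffer across lookups:
    ///
    /// ```ignore
    /// let mut buf = Vec::new();
    /// for doc in candidate_docs {
    ///     if list.get_positions_into(doc, &mut buf) {
    ///         // `buf` now holds the encoded positions for `doc`
    ///     }
    /// }
    /// ```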
    pub fn get_positions_into(&self, target_doc_id: DocId, out: &mut Vec<u32>) -> bool {
        out.clear();
        self.get_positions_impl(target_doc_id, Some(out)).is_some()
    }

    /// Get positions for a specific document using binary search on skip list
    pub fn get_positions(&self, target_doc_id: DocId) -> Option<Vec<u32>> {
        self.get_positions_impl(target_doc_id, None)
    }

    fn get_positions_impl(
        &self,
        target_doc_id: DocId,
        mut out: Option<&mut Vec<u32>>,
    ) -> Option<Vec<u32>> {
        if self.skip_list.is_empty() {
            return None;
        }

        // Binary search on skip list to find the right block
        let block_idx = match self.skip_list.binary_search_by(|&(base, last, _)| {
            if target_doc_id < base {
                std::cmp::Ordering::Greater
            } else if target_doc_id > last {
                std::cmp::Ordering::Less
            } else {
                std::cmp::Ordering::Equal
            }
        }) {
            Ok(idx) => idx,
            Err(_) => return None, // doc_id not in any block range
        };

        // Decode block and search for doc_id
        let offset = self.skip_list[block_idx].2 as usize;
        let mut reader = &self.data[offset..];

        // Fixed 8-byte prefix: count(u32) + first_doc(u32)
        let count = reader.read_u32::<LittleEndian>().ok()? as usize;
        let first_doc = reader.read_u32::<LittleEndian>().ok()?;
        let mut prev_doc_id = first_doc;

        for i in 0..count {
            let doc_id = if i == 0 {
                first_doc
            } else {
                let delta = read_vint(&mut reader).ok()? as u32;
                prev_doc_id + delta
            };
            prev_doc_id = doc_id;

            let num_positions = read_vint(&mut reader).ok()? as usize;

            if doc_id == target_doc_id {
                // Found it! Read positions (stored absolute)
                if let Some(buf) = &mut out {
                    // Buffer-based path: write into caller's reusable buffer
                    buf.reserve(num_positions);
                    for _ in 0..num_positions {
                        let pos = read_vint(&mut reader).ok()? as u32;
                        buf.push(pos);
                    }
                    return Some(Vec::new()); // sentinel: actual data is in `out`
                }
                let mut positions = Vec::with_capacity(num_positions);
                for _ in 0..num_positions {
                    let pos = read_vint(&mut reader).ok()? as u32;
                    positions.push(pos);
                }
                return Some(positions);
            } else {
                // Skip positions
                for _ in 0..num_positions {
                    if read_vint(&mut reader).is_err() {
                        return None;
                    }
                }
            }
        }

        None
    }

    /// Size of one serialized skip entry:
    /// first_doc(4) + last_doc(4) + offset(8) + length(4) = 20 bytes
    const SKIP_ENTRY_SIZE: usize = 20;

    /// Serialize to bytes (footer-based: data first).
    ///
    /// Format:
    /// ```text
    /// [block data: data_len bytes]
    /// [skip entries: N × 20 bytes (base_doc, last_doc, offset, length)]
    /// [footer: data_len(8) + skip_count(4) + doc_count(4) = 16 bytes]
    /// ```
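    ///
    /// For example, a list with two blocks serializes as the raw block data,
    /// then 2 × 20 = 40 bytes of skip entries, then the 16-byte footer.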
    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
        // Data first
        writer.write_all(&self.data)?;

        // Skip list — compute length from adjacent entries
        for (i, (base_doc_id, last_doc_id, offset)) in self.skip_list.iter().enumerate() {
            let next_offset = if i + 1 < self.skip_list.len() {
                self.skip_list[i + 1].2
            } else {
                self.data.len() as u64
            };
            let length = (next_offset - offset) as u32;
            writer.write_u32::<LittleEndian>(*base_doc_id)?;
            writer.write_u32::<LittleEndian>(*last_doc_id)?;
            writer.write_u64::<LittleEndian>(*offset)?;
            writer.write_u32::<LittleEndian>(length)?;
        }

        // Footer
        writer.write_u64::<LittleEndian>(self.data.len() as u64)?;
        writer.write_u32::<LittleEndian>(self.skip_list.len() as u32)?;
        writer.write_u32::<LittleEndian>(self.doc_count)?;

        Ok(())
    }

    /// Deserialize from a byte slice (footer-based format).
    pub fn deserialize(raw: &[u8]) -> io::Result<Self> {
        if raw.len() < 16 {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "position data too short",
            ));
        }

        // Parse footer (last 16 bytes)
        let f = raw.len() - 16;
        let data_len = u64::from_le_bytes(raw[f..f + 8].try_into().unwrap()) as usize;
        let skip_count = u32::from_le_bytes(raw[f + 8..f + 12].try_into().unwrap()) as usize;
        let doc_count = u32::from_le_bytes(raw[f + 12..f + 16].try_into().unwrap());

        // Parse skip list (20-byte entries; length field not stored in-memory)
        let mut skip_list = Vec::with_capacity(skip_count);
        let mut pos = data_len;
        for _ in 0..skip_count {
            let base = u32::from_le_bytes(raw[pos..pos + 4].try_into().unwrap());
            let last = u32::from_le_bytes(raw[pos + 4..pos + 8].try_into().unwrap());
            let offset = u64::from_le_bytes(raw[pos + 8..pos + 16].try_into().unwrap());
            // pos + 16..pos + 20 = length (not needed in-memory; adjacent entries suffice)
            skip_list.push((base, last, offset));
            pos += Self::SKIP_ENTRY_SIZE;
        }

        let data = raw[..data_len].to_vec();

        Ok(Self {
            skip_list,
            data,
            doc_count,
        })
    }

    /// Concatenate blocks from multiple position lists with doc_id remapping (for merge)
    pub fn concatenate_blocks(sources: &[(PositionPostingList, u32)]) -> io::Result<Self> {
        let mut skip_list = Vec::new();
        let mut data = Vec::new();
        let mut total_docs = 0u32;

        for (source, doc_offset) in sources {
            for block_idx in 0..source.skip_list.len() {
                let (base, last, src_offset) = source.skip_list[block_idx];
                let next_offset = if block_idx + 1 < source.skip_list.len() {
                    source.skip_list[block_idx + 1].2 as usize
                } else {
                    source.data.len()
                };

                let new_base = base + doc_offset;
                let new_last = last + doc_offset;
                let new_offset = data.len() as u64;

                // Copy and adjust block data
                let block_bytes = &source.data[src_offset as usize..next_offset];

                // Fixed 8-byte prefix: count(u32) + first_doc(u32)
                let count = u32::from_le_bytes(block_bytes[0..4].try_into().unwrap());
                let first_doc = u32::from_le_bytes(block_bytes[4..8].try_into().unwrap());

                // Write patched prefix + copy rest verbatim
                data.write_u32::<LittleEndian>(count)?;
                data.write_u32::<LittleEndian>(first_doc + doc_offset)?;
                data.extend_from_slice(&block_bytes[8..]);

                skip_list.push((new_base, new_last, new_offset));
                total_docs += count;
            }
        }

        Ok(Self {
            skip_list,
            data,
            doc_count: total_docs,
        })
    }

    /// Streaming merge: write blocks directly to output writer (bounded memory).
    ///
    /// **Zero-materializing**: reads skip entries directly from source bytes
    /// without parsing into Vecs. Explicit `length` field in each 20-byte
    /// entry eliminates adjacent-entry lookups.
    ///
    /// Only output skip bytes are buffered (bounded O(total_blocks × 20)).
    /// Block data flows source → output writer without intermediate buffering.
    ///
    /// Returns `(doc_count, bytes_written)`.
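    ///
    /// A minimal usage sketch (names are illustrative), merging two serialized
    /// lists where the second segment's doc_ids are shifted past the first's:
    ///
    /// ```ignore
    /// let sources: Vec<(&[u8], u32)> = vec![(&bytes_a, 0), (&bytes_b, docs_in_a)];
    /// let mut out = Vec::new();
    /// let (docs, bytes) = PositionPostingList::concatenate_streaming(&sources, &mut out)?;
    /// assert_eq!(bytes, out.len());
    /// ```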
    pub fn concatenate_streaming<W: Write>(
        sources: &[(&[u8], u32)],
        writer: &mut W,
    ) -> io::Result<(u32, usize)> {
        // Parse only footers (16 bytes each) — no skip entries materialized
        struct SourceMeta {
            /// Index into `sources`; kept explicitly so that skipped
            /// (too-short) inputs cannot misalign metas with sources.
            src_idx: usize,
            data_len: usize,
            skip_count: usize,
        }

        let mut metas: Vec<SourceMeta> = Vec::with_capacity(sources.len());
        let mut total_docs = 0u32;

        for (src_idx, (raw, _)) in sources.iter().enumerate() {
            if raw.len() < 16 {
                continue;
            }
            let f = raw.len() - 16;
            let data_len = u64::from_le_bytes(raw[f..f + 8].try_into().unwrap()) as usize;
            let skip_count = u32::from_le_bytes(raw[f + 8..f + 12].try_into().unwrap()) as usize;
            let doc_count = u32::from_le_bytes(raw[f + 12..f + 16].try_into().unwrap());
            total_docs += doc_count;
            metas.push(SourceMeta {
                src_idx,
                data_len,
                skip_count,
            });
        }

        // Phase 1: Stream block data, reading skip entries on-the-fly.
        // Accumulate output skip bytes (bounded: 20 bytes × total_blocks).
        let mut out_skip: Vec<u8> = Vec::new();
        let mut out_skip_count = 0u32;
        let mut data_written = 0u64;
        let mut patch_buf = [0u8; 8];
        let es = Self::SKIP_ENTRY_SIZE;

        for meta in &metas {
            let (raw, doc_offset) = &sources[meta.src_idx];
            let skip_base = meta.data_len;
            let data = &raw[..meta.data_len];

            for i in 0..meta.skip_count {
                // Read source skip entry directly from raw bytes
                let p = skip_base + i * es;
                let base = u32::from_le_bytes(raw[p..p + 4].try_into().unwrap());
                let last = u32::from_le_bytes(raw[p + 4..p + 8].try_into().unwrap());
                let offset = u64::from_le_bytes(raw[p + 8..p + 16].try_into().unwrap());
                let length = u32::from_le_bytes(raw[p + 16..p + 20].try_into().unwrap());

                let block = &data[offset as usize..(offset as usize + length as usize)];

                // Write output skip entry
                out_skip.extend_from_slice(&(base + doc_offset).to_le_bytes());
                out_skip.extend_from_slice(&(last + doc_offset).to_le_bytes());
                out_skip.extend_from_slice(&data_written.to_le_bytes());
                out_skip.extend_from_slice(&length.to_le_bytes());
                out_skip_count += 1;

                // Write patched 8-byte prefix + rest of block verbatim
                patch_buf[0..4].copy_from_slice(&block[0..4]);
                let first_doc = u32::from_le_bytes(block[4..8].try_into().unwrap());
                patch_buf[4..8].copy_from_slice(&(first_doc + doc_offset).to_le_bytes());
                writer.write_all(&patch_buf)?;
                writer.write_all(&block[8..])?;

                data_written += block.len() as u64;
            }
        }

        // Phase 2: Write skip entries + footer
        writer.write_all(&out_skip)?;

        writer.write_u64::<LittleEndian>(data_written)?;
        writer.write_u32::<LittleEndian>(out_skip_count)?;
        writer.write_u32::<LittleEndian>(total_docs)?;

        let total_bytes = data_written as usize + out_skip.len() + 16;
        Ok((total_docs, total_bytes))
    }

    /// Get iterator over all postings (for phrase queries)
    pub fn iter(&self) -> PositionPostingIterator<'_> {
        PositionPostingIterator::new(self)
    }
}

/// Iterator over block-based position posting list
///
/// Uses flat buffers instead of per-posting `Vec<u32>` to avoid heap
/// allocations on every block load. Positions are concatenated into a
/// single `Vec<u32>` with offset/length pairs for O(1) slicing.
pub struct PositionPostingIterator<'a> {
    list: &'a PositionPostingList,
    current_block: usize,
    position_in_block: usize,
    /// Number of postings in the current block
    block_count: usize,
    /// Doc IDs for each posting in the block
    block_doc_ids: Vec<DocId>,
    /// Term frequency for each posting in the block
    block_term_freqs: Vec<u32>,
    /// All positions concatenated (flat buffer, reused across blocks)
    block_positions: Vec<u32>,
    /// Start offset into block_positions for each posting
    block_pos_offsets: Vec<usize>,
    exhausted: bool,
}

impl<'a> PositionPostingIterator<'a> {
    pub fn new(list: &'a PositionPostingList) -> Self {
        let exhausted = list.skip_list.is_empty();
        let mut iter = Self {
            list,
            current_block: 0,
            position_in_block: 0,
            block_count: 0,
            block_doc_ids: Vec::with_capacity(POSITION_BLOCK_SIZE),
            block_term_freqs: Vec::with_capacity(POSITION_BLOCK_SIZE),
            block_positions: Vec::new(),
            block_pos_offsets: Vec::with_capacity(POSITION_BLOCK_SIZE + 1),
            exhausted,
        };
        if !iter.exhausted {
            iter.load_block(0);
        }
        iter
    }

    fn load_block(&mut self, block_idx: usize) {
        if block_idx >= self.list.skip_list.len() {
            self.exhausted = true;
            return;
        }

        self.current_block = block_idx;
        self.position_in_block = 0;

        let offset = self.list.skip_list[block_idx].2 as usize;
        let mut reader = &self.list.data[offset..];

        // Fixed 8-byte prefix: count(u32) + first_doc(u32)
        let count = reader.read_u32::<LittleEndian>().unwrap_or(0) as usize;
        let first_doc = reader.read_u32::<LittleEndian>().unwrap_or(0);

        self.block_count = count;
        self.block_doc_ids.clear();
        self.block_term_freqs.clear();
        self.block_positions.clear();
        self.block_pos_offsets.clear();

        let mut prev_doc_id = first_doc;

        for i in 0..count {
            let doc_id = if i == 0 {
                first_doc
            } else {
                let delta = read_vint(&mut reader).unwrap_or(0) as u32;
                prev_doc_id + delta
            };
            prev_doc_id = doc_id;

            let num_positions = read_vint(&mut reader).unwrap_or(0) as usize;
            self.block_doc_ids.push(doc_id);
            self.block_term_freqs.push(num_positions as u32);
            self.block_pos_offsets.push(self.block_positions.len());
            for _ in 0..num_positions {
                let pos = read_vint(&mut reader).unwrap_or(0) as u32;
                self.block_positions.push(pos);
            }
        }
        // Sentinel offset for the last posting's end
        self.block_pos_offsets.push(self.block_positions.len());
    }

    pub fn doc(&self) -> DocId {
        if self.exhausted || self.position_in_block >= self.block_count {
            u32::MAX
        } else {
            self.block_doc_ids[self.position_in_block]
        }
    }

    pub fn term_freq(&self) -> u32 {
        if self.exhausted || self.position_in_block >= self.block_count {
            0
        } else {
            self.block_term_freqs[self.position_in_block]
        }
    }

    pub fn positions(&self) -> &[u32] {
        if self.exhausted || self.position_in_block >= self.block_count {
            &[]
        } else {
            let start = self.block_pos_offsets[self.position_in_block];
            let end = self.block_pos_offsets[self.position_in_block + 1];
            &self.block_positions[start..end]
        }
    }

    pub fn advance(&mut self) {
        if self.exhausted {
            return;
        }

        self.position_in_block += 1;
        if self.position_in_block >= self.block_count {
            self.load_block(self.current_block + 1);
        }
    }

    pub fn seek(&mut self, target: DocId) {
        if self.exhausted {
            return;
        }

        // Check if target is in current block
        if let Some((_, last, _)) = self.list.skip_list.get(self.current_block)
            && target <= *last
        {
            // SIMD-accelerated scan within block
            let remaining = &self.block_doc_ids[self.position_in_block..self.block_count];
            let offset = crate::structures::simd::find_first_ge_u32(remaining, target);
            self.position_in_block += offset;
            if self.position_in_block >= self.block_count {
                self.load_block(self.current_block + 1);
                self.seek(target); // Continue seeking in next block
            }
            return;
        }

        // Binary search on skip list to find the right block
        let block_idx = match self.list.skip_list.binary_search_by(|&(base, last, _)| {
            if target < base {
                std::cmp::Ordering::Greater
            } else if target > last {
                std::cmp::Ordering::Less
            } else {
                std::cmp::Ordering::Equal
            }
        }) {
            Ok(idx) => idx,
            Err(idx) => idx, // Use the next block if not found exactly
        };

        if block_idx >= self.list.skip_list.len() {
            self.exhausted = true;
            return;
        }

        self.load_block(block_idx);

        // SIMD-accelerated scan within block
        let remaining = &self.block_doc_ids[self.position_in_block..self.block_count];
        let offset = crate::structures::simd::find_first_ge_u32(remaining, target);
        self.position_in_block += offset;

        if self.position_in_block >= self.block_count {
            self.load_block(self.current_block + 1);
        }
    }
}
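
// Phrase-matching sketch (hypothetical caller code, not part of this module):
// advance two iterators in lockstep and compare positions when both land on
// the same document.
//
//   let (mut a, mut b) = (list_a.iter(), list_b.iter());
//   while a.doc() != u32::MAX && b.doc() != u32::MAX {
//       if a.doc() == b.doc() {
//           // a phrase match needs some position in a.positions() whose
//           // element ordinal matches one in b.positions() and whose token
//           // position precedes it by exactly 1 (see the decode_* helpers)
//           a.advance();
//       } else if a.doc() < b.doc() {
//           a.seek(b.doc());
//       } else {
//           b.seek(a.doc());
//       }
//   }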

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_position_encoding() {
        // Element 0, position 5
        let pos = encode_position(0, 5);
        assert_eq!(decode_element_ordinal(pos), 0);
        assert_eq!(decode_token_position(pos), 5);

        // Element 3, position 100
        let pos = encode_position(3, 100);
        assert_eq!(decode_element_ordinal(pos), 3);
        assert_eq!(decode_token_position(pos), 100);

        // Max values
        let pos = encode_position(MAX_ELEMENT_ORDINAL, MAX_TOKEN_POSITION);
        assert_eq!(decode_element_ordinal(pos), MAX_ELEMENT_ORDINAL);
        assert_eq!(decode_token_position(pos), MAX_TOKEN_POSITION);
    }

    #[test]
    fn test_position_posting_list_build() {
        // Build from postings
        let postings = vec![
            PostingWithPositions {
                doc_id: 1,
                term_freq: 2,
                positions: vec![encode_position(0, 0), encode_position(0, 2)],
            },
            PostingWithPositions {
                doc_id: 3,
                term_freq: 1,
                positions: vec![encode_position(1, 0)],
            },
        ];

        let list = PositionPostingList::from_postings(&postings).unwrap();
        assert_eq!(list.doc_count(), 2);

        // Test binary search
        let pos = list.get_positions(1).unwrap();
        assert_eq!(pos.len(), 2);

        let pos = list.get_positions(3).unwrap();
        assert_eq!(pos.len(), 1);

        // Not found
        assert!(list.get_positions(2).is_none());
        assert!(list.get_positions(99).is_none());
    }

    #[test]
    fn test_serialization_roundtrip() {
        let postings = vec![
            PostingWithPositions {
                doc_id: 1,
                term_freq: 2,
                positions: vec![encode_position(0, 0), encode_position(0, 5)],
            },
            PostingWithPositions {
                doc_id: 3,
                term_freq: 1,
                positions: vec![encode_position(1, 0)],
            },
            PostingWithPositions {
                doc_id: 5,
                term_freq: 1,
                positions: vec![encode_position(0, 10)],
            },
        ];

        let list = PositionPostingList::from_postings(&postings).unwrap();

        let mut bytes = Vec::new();
        list.serialize(&mut bytes).unwrap();

        let deserialized = PositionPostingList::deserialize(&bytes).unwrap();

        assert_eq!(list.doc_count(), deserialized.doc_count());

        // Verify binary search works on deserialized
        let pos = deserialized.get_positions(1).unwrap();
        assert_eq!(pos, vec![encode_position(0, 0), encode_position(0, 5)]);

        let pos = deserialized.get_positions(3).unwrap();
        assert_eq!(pos, vec![encode_position(1, 0)]);
    }

    #[test]
    fn test_binary_search_many_blocks() {
        // Create enough postings to span multiple blocks (128 per block)
        let mut postings = Vec::new();
        for i in 0..300 {
            postings.push(PostingWithPositions {
                doc_id: i * 2, // doc_ids: 0, 2, 4, 6, ...
                term_freq: 1,
                positions: vec![encode_position(0, i)],
            });
        }

        let list = PositionPostingList::from_postings(&postings).unwrap();
        assert_eq!(list.doc_count(), 300);

        // Should have 3 blocks (128 + 128 + 44)
        assert_eq!(list.skip_list.len(), 3);

        // Test binary search across blocks
        let pos = list.get_positions(0).unwrap();
        assert_eq!(pos, vec![encode_position(0, 0)]);

        let pos = list.get_positions(256).unwrap(); // doc 128 in block 1
        assert_eq!(pos, vec![encode_position(0, 128)]);

        let pos = list.get_positions(598).unwrap(); // last doc
        assert_eq!(pos, vec![encode_position(0, 299)]);

        // Not found (odd numbers don't exist)
        assert!(list.get_positions(1).is_none());
        assert!(list.get_positions(257).is_none());
    }

    #[test]
    fn test_concatenate_blocks_merge() {
        // Test efficient merge by concatenating blocks
        let postings1 = vec![
            PostingWithPositions {
                doc_id: 0,
                term_freq: 1,
                positions: vec![0],
            },
            PostingWithPositions {
                doc_id: 1,
                term_freq: 1,
                positions: vec![5],
            },
            PostingWithPositions {
                doc_id: 2,
                term_freq: 1,
                positions: vec![10],
            },
        ];
        let list1 = PositionPostingList::from_postings(&postings1).unwrap();

        let postings2 = vec![
            PostingWithPositions {
                doc_id: 0,
                term_freq: 1,
                positions: vec![100],
            },
            PostingWithPositions {
                doc_id: 1,
                term_freq: 1,
                positions: vec![105],
            },
        ];
        let list2 = PositionPostingList::from_postings(&postings2).unwrap();

        // Merge with doc_id offset
        let combined = PositionPostingList::concatenate_blocks(&[
            (list1, 0), // no offset
            (list2, 3), // offset by 3 (list1 has 3 docs)
        ])
        .unwrap();

        assert_eq!(combined.doc_count(), 5);

        // Check doc_ids are correctly remapped
        assert!(combined.get_positions(0).is_some());
        assert!(combined.get_positions(1).is_some());
        assert!(combined.get_positions(2).is_some());
        assert!(combined.get_positions(3).is_some()); // was 0 in list2
        assert!(combined.get_positions(4).is_some()); // was 1 in list2
    }

    #[test]
    fn test_iterator() {
        let postings = vec![
            PostingWithPositions {
                doc_id: 1,
                term_freq: 2,
                positions: vec![0, 5],
            },
            PostingWithPositions {
                doc_id: 3,
                term_freq: 1,
                positions: vec![10],
            },
            PostingWithPositions {
                doc_id: 5,
                term_freq: 1,
                positions: vec![15],
            },
        ];

        let list = PositionPostingList::from_postings(&postings).unwrap();
        let mut iter = list.iter();

        assert_eq!(iter.doc(), 1);
        assert_eq!(iter.positions(), &[0, 5]);

        iter.advance();
        assert_eq!(iter.doc(), 3);

        iter.seek(5);
        assert_eq!(iter.doc(), 5);
        assert_eq!(iter.positions(), &[15]);

        iter.advance();
        assert_eq!(iter.doc(), u32::MAX); // exhausted
    }

    /// Helper: build a PositionPostingList from (doc_id, positions) pairs
    fn build_ppl(entries: &[(u32, Vec<u32>)]) -> PositionPostingList {
        let postings: Vec<PostingWithPositions> = entries
            .iter()
            .map(|(doc_id, positions)| PostingWithPositions {
                doc_id: *doc_id,
                term_freq: positions.len() as u32,
                positions: positions.clone(),
            })
            .collect();
        PositionPostingList::from_postings(&postings).unwrap()
    }

    /// Helper: serialize a PositionPostingList to bytes
    fn serialize_ppl(ppl: &PositionPostingList) -> Vec<u8> {
        let mut buf = Vec::new();
        ppl.serialize(&mut buf).unwrap();
        buf
    }

    /// Helper: collect all (doc_id, positions) from a PositionPostingIterator
    fn collect_positions(ppl: &PositionPostingList) -> Vec<(u32, Vec<u32>)> {
        let mut result = Vec::new();
        let mut it = ppl.iter();
        while it.doc() != u32::MAX {
            result.push((it.doc(), it.positions().to_vec()));
            it.advance();
        }
        result
    }

    #[test]
    fn test_concatenate_streaming_matches_blocks() {
        // Build 3 segments with positions
        let seg_a: Vec<(u32, Vec<u32>)> = (0..150)
            .map(|i| (i * 2, vec![i * 10, i * 10 + 3]))
            .collect();
        let seg_b: Vec<(u32, Vec<u32>)> = (0..100).map(|i| (i * 5, vec![i * 7])).collect();
        let seg_c: Vec<(u32, Vec<u32>)> = (0..80).map(|i| (i * 3, vec![i, i + 1, i + 2])).collect();

        let ppl_a = build_ppl(&seg_a);
        let ppl_b = build_ppl(&seg_b);
        let ppl_c = build_ppl(&seg_c);

        let offset_b = 500u32;
        let offset_c = 1000u32;

        // Method 1: concatenate_blocks (reference)
        let ref_merged = PositionPostingList::concatenate_blocks(&[
            (ppl_a.clone(), 0),
            (ppl_b.clone(), offset_b),
            (ppl_c.clone(), offset_c),
        ])
        .unwrap();
        let mut ref_buf = Vec::new();
        ref_merged.serialize(&mut ref_buf).unwrap();

        // Method 2: concatenate_streaming
        let bytes_a = serialize_ppl(&ppl_a);
        let bytes_b = serialize_ppl(&ppl_b);
        let bytes_c = serialize_ppl(&ppl_c);

        let sources: Vec<(&[u8], u32)> =
            vec![(&bytes_a, 0), (&bytes_b, offset_b), (&bytes_c, offset_c)];
        let mut stream_buf = Vec::new();
        let (doc_count, bytes_written) =
            PositionPostingList::concatenate_streaming(&sources, &mut stream_buf).unwrap();

        assert_eq!(doc_count, 330);
        assert_eq!(bytes_written, stream_buf.len());

        // Deserialize both and compare all postings + positions
        let ref_posts = collect_positions(&PositionPostingList::deserialize(&ref_buf).unwrap());
        let stream_posts =
            collect_positions(&PositionPostingList::deserialize(&stream_buf).unwrap());

        assert_eq!(ref_posts.len(), stream_posts.len());
        for (i, (r, s)) in ref_posts.iter().zip(stream_posts.iter()).enumerate() {
            assert_eq!(r.0, s.0, "doc_id mismatch at {}", i);
            assert_eq!(r.1, s.1, "positions mismatch at doc {}", r.0);
        }
    }

    #[test]
    fn test_positions_multi_round_merge() {
        // Round 0: 4 segments with distinct positions
        let segments: Vec<Vec<(u32, Vec<u32>)>> = (0..4)
            .map(|seg| {
                (0..200)
                    .map(|i| {
                        let pos_count = (i % 3) + 1;
                        let positions: Vec<u32> = (0..pos_count)
                            .map(|p| (seg * 1000 + i * 10 + p) as u32)
                            .collect();
                        (i as u32 * 3, positions)
                    })
                    .collect()
            })
            .collect();

        let ppls: Vec<PositionPostingList> = segments.iter().map(|s| build_ppl(s)).collect();
        let serialized: Vec<Vec<u8>> = ppls.iter().map(serialize_ppl).collect();

        // Round 1: merge pairs
        let mut merged_01 = Vec::new();
        let sources_01: Vec<(&[u8], u32)> = vec![(&serialized[0], 0), (&serialized[1], 600)];
        let (dc_01, _) =
            PositionPostingList::concatenate_streaming(&sources_01, &mut merged_01).unwrap();
        assert_eq!(dc_01, 400);

        let mut merged_23 = Vec::new();
        let sources_23: Vec<(&[u8], u32)> = vec![(&serialized[2], 0), (&serialized[3], 600)];
        let (dc_23, _) =
            PositionPostingList::concatenate_streaming(&sources_23, &mut merged_23).unwrap();
        assert_eq!(dc_23, 400);

        // Round 2: merge intermediate results
        let mut final_merged = Vec::new();
        let sources_final: Vec<(&[u8], u32)> = vec![(&merged_01, 0), (&merged_23, 1200)];
        let (dc_final, _) =
            PositionPostingList::concatenate_streaming(&sources_final, &mut final_merged).unwrap();
        assert_eq!(dc_final, 800);

        // Verify all positions survived two rounds of merging
        let final_ppl = PositionPostingList::deserialize(&final_merged).unwrap();
        let all = collect_positions(&final_ppl);
        assert_eq!(all.len(), 800);

        // Spot-check: first doc of each original segment
        // Seg0: doc_id=0, positions from seg=0
        assert_eq!(all[0].0, 0);
        assert_eq!(all[0].1, vec![0]); // seg=0, i=0, pos_count=1, pos=0*1000+0*10+0=0

        // Seg1: doc_id=600, positions from seg=1
        assert_eq!(all[200].0, 600);
        assert_eq!(all[200].1, vec![1000]); // seg=1, i=0, pos_count=1, pos=1*1000+0*10+0=1000

        // Seg2: doc_id=1200, positions from seg=2
        assert_eq!(all[400].0, 1200);
        assert_eq!(all[400].1, vec![2000]); // seg=2, i=0, pos_count=1, pos=2*1000+0*10+0=2000

        // Verify seek + positions on the final merged list
        let mut it = final_ppl.iter();
        it.seek(1200);
        assert_eq!(it.doc(), 1200);
        assert_eq!(it.positions(), &[2000]);

        // Verify get_positions (binary search path)
        let pos = final_ppl.get_positions(600).unwrap();
        assert_eq!(pos, vec![1000]);
    }

    #[test]
    fn test_positions_large_scale_merge() {
        // 5 segments × 500 docs, each doc has 1-4 positions
        let num_segments = 5usize;
        let docs_per_segment = 500usize;

        let segments: Vec<Vec<(u32, Vec<u32>)>> = (0..num_segments)
            .map(|seg| {
                (0..docs_per_segment)
                    .map(|i| {
                        let n_pos = (i % 4) + 1;
                        let positions: Vec<u32> =
                            (0..n_pos).map(|p| (p * 5 + seg) as u32).collect();
                        (i as u32 * 2, positions)
                    })
                    .collect()
            })
            .collect();

        let ppls: Vec<PositionPostingList> = segments.iter().map(|s| build_ppl(s)).collect();
        let serialized: Vec<Vec<u8>> = ppls.iter().map(serialize_ppl).collect();

        let max_doc = (docs_per_segment as u32 - 1) * 2;
        let offsets: Vec<u32> = (0..num_segments)
            .map(|i| i as u32 * (max_doc + 1))
            .collect();

        let sources: Vec<(&[u8], u32)> = serialized
            .iter()
            .zip(offsets.iter())
            .map(|(b, o)| (b.as_slice(), *o))
            .collect();

        let mut merged = Vec::new();
        let (doc_count, _) =
            PositionPostingList::concatenate_streaming(&sources, &mut merged).unwrap();
        assert_eq!(doc_count, (num_segments * docs_per_segment) as u32);

        // Deserialize and verify all positions
        let merged_ppl = PositionPostingList::deserialize(&merged).unwrap();
        let all = collect_positions(&merged_ppl);
        assert_eq!(all.len(), num_segments * docs_per_segment);

        // Verify positions for each segment
        for (seg, &offset) in offsets.iter().enumerate() {
            for i in 0..docs_per_segment {
                let idx = seg * docs_per_segment + i;
                let expected_doc = i as u32 * 2 + offset;
                assert_eq!(all[idx].0, expected_doc, "seg={} i={}", seg, i);

                let n_pos = (i % 4) + 1;
                let expected_positions: Vec<u32> =
                    (0..n_pos).map(|p| (p * 5 + seg) as u32).collect();
                assert_eq!(
                    all[idx].1, expected_positions,
                    "positions mismatch seg={} i={}",
                    seg, i
                );
            }
        }
    }

    #[test]
    fn test_positions_streaming_single_source() {
        let entries: Vec<(u32, Vec<u32>)> =
            (0..300).map(|i| (i * 4, vec![i * 2, i * 2 + 1])).collect();
        let ppl = build_ppl(&entries);
        let direct = serialize_ppl(&ppl);

        let sources: Vec<(&[u8], u32)> = vec![(&direct, 0)];
        let mut streamed = Vec::new();
        PositionPostingList::concatenate_streaming(&sources, &mut streamed).unwrap();

        let p1 = collect_positions(&PositionPostingList::deserialize(&direct).unwrap());
        let p2 = collect_positions(&PositionPostingList::deserialize(&streamed).unwrap());
        assert_eq!(p1, p2);
    }

    #[test]
    fn test_positions_edge_single_doc() {
        let ppl_a = build_ppl(&[(0, vec![42, 43, 44])]);
        let ppl_b = build_ppl(&[(0, vec![100])]);

        let merged = PositionPostingList::concatenate_blocks(&[(ppl_a, 0), (ppl_b, 1)]).unwrap();

        assert_eq!(merged.doc_count(), 2);
        assert_eq!(merged.get_positions(0).unwrap(), vec![42, 43, 44]);
        assert_eq!(merged.get_positions(1).unwrap(), vec![100]);
    }
}