Skip to main content

hermes_core/structures/postings/
elias_fano.rs

1//! Elias-Fano encoding for compressed sorted integer sequences
2//!
3//! Elias-Fano is a quasi-succinct encoding that achieves near-optimal space
4//! while supporting O(1) access and fast NextGEQ operations.
5//!
6//! Space: 2n + n⌈log(m/n)⌉ bits where n = count, m = universe size
7//!
8//! Used by Google, Facebook, and Lucene for posting list compression.
9
10use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
11use std::io::{self, Read, Write};
12
13/// Elias-Fano encoded monotone sequence
14#[derive(Debug, Clone)]
15pub struct EliasFano {
16    /// Lower bits array (dense, l bits per element)
17    lower_bits: Vec<u64>,
18    /// Upper bits array (sparse, unary encoded)
19    upper_bits: Vec<u64>,
20    /// Number of elements
21    len: u32,
22    /// Universe size (max value + 1)
23    universe: u64,
24    /// Number of lower bits per element
25    lower_bit_width: u8,
26}
27
28impl EliasFano {
29    /// Create Elias-Fano encoding from a sorted sequence
30    pub fn from_sorted_slice(values: &[u32]) -> Self {
31        if values.is_empty() {
32            return Self {
33                lower_bits: Vec::new(),
34                upper_bits: Vec::new(),
35                len: 0,
36                universe: 0,
37                lower_bit_width: 0,
38            };
39        }
40
41        let n = values.len() as u64;
42        let max_val = *values.last().unwrap() as u64;
43        let universe = max_val + 1;
44
45        // Calculate lower bit width: l = max(0, ⌊log2(m/n)⌋)
46        let lower_bit_width = if n == 0 {
47            0
48        } else {
49            let ratio = universe.max(1) / n.max(1);
50            if ratio <= 1 {
51                0
52            } else {
53                (64 - ratio.leading_zeros() - 1) as u8
54            }
55        };
56
57        // Allocate lower bits array
58        let lower_bits_total = (n as usize) * (lower_bit_width as usize);
59        let lower_words = lower_bits_total.div_ceil(64);
60        let mut lower_bits = vec![0u64; lower_words];
61
62        // Allocate upper bits array
63        // Upper bits use unary encoding: n ones + (max_val >> l) zeros
64        let upper_bound = n + (max_val >> lower_bit_width) + 1;
65        let upper_words = (upper_bound as usize).div_ceil(64);
66        let mut upper_bits = vec![0u64; upper_words];
67
68        // Encode each value
69        let lower_mask = if lower_bit_width == 0 {
70            0
71        } else {
72            (1u64 << lower_bit_width) - 1
73        };
74
75        for (i, &val) in values.iter().enumerate() {
76            let val = val as u64;
77
78            // Store lower bits
79            if lower_bit_width > 0 {
80                let lower = val & lower_mask;
81                let bit_pos = i * (lower_bit_width as usize);
82                let word_idx = bit_pos / 64;
83                let bit_offset = bit_pos % 64;
84
85                lower_bits[word_idx] |= lower << bit_offset;
86                if bit_offset + (lower_bit_width as usize) > 64 && word_idx + 1 < lower_bits.len() {
87                    lower_bits[word_idx + 1] |= lower >> (64 - bit_offset);
88                }
89            }
90
91            // Store upper bits (unary: position = i + (val >> l))
92            let upper = val >> lower_bit_width;
93            let upper_pos = (i as u64) + upper;
94            let word_idx = (upper_pos / 64) as usize;
95            let bit_offset = upper_pos % 64;
96            if word_idx < upper_bits.len() {
97                upper_bits[word_idx] |= 1u64 << bit_offset;
98            }
99        }
100
101        Self {
102            lower_bits,
103            upper_bits,
104            len: values.len() as u32,
105            universe,
106            lower_bit_width,
107        }
108    }
109
110    /// Number of elements
111    #[inline]
112    pub fn len(&self) -> u32 {
113        self.len
114    }
115
116    /// Check if empty
117    #[inline]
118    pub fn is_empty(&self) -> bool {
119        self.len == 0
120    }
121
122    /// Access element at position i (0-indexed)
123    #[inline]
124    pub fn get(&self, i: u32) -> Option<u32> {
125        if i >= self.len {
126            return None;
127        }
128
129        let i = i as usize;
130
131        // Get lower bits
132        let lower = if self.lower_bit_width == 0 {
133            0u64
134        } else {
135            let bit_pos = i * (self.lower_bit_width as usize);
136            let word_idx = bit_pos / 64;
137            let bit_offset = bit_pos % 64;
138            let lower_mask = (1u64 << self.lower_bit_width) - 1;
139
140            let mut val = (self.lower_bits[word_idx] >> bit_offset) & lower_mask;
141            if bit_offset + (self.lower_bit_width as usize) > 64
142                && word_idx + 1 < self.lower_bits.len()
143            {
144                val |= (self.lower_bits[word_idx + 1] << (64 - bit_offset)) & lower_mask;
145            }
146            val
147        };
148
149        // Get upper bits via select1(i) - i
150        let select_pos = self.select1(i as u32)?;
151        let upper = (select_pos as u64) - (i as u64);
152
153        Some(((upper << self.lower_bit_width) | lower) as u32)
154    }
155
156    /// Find position of i-th set bit (0-indexed)
157    /// Uses POPCNT on x86_64 and efficient intrinsics on aarch64
158    fn select1(&self, i: u32) -> Option<u32> {
159        if i >= self.len {
160            return None;
161        }
162
163        let mut remaining = i + 1;
164        let mut pos = 0u32;
165
166        // Process words using architecture-specific popcount
167        for &word in self.upper_bits.iter() {
168            let popcount = Self::popcount64(word);
169            if popcount >= remaining {
170                // Target is in this word - find the remaining-th set bit
171                let bit_pos = Self::select_in_word(word, remaining);
172                return Some(pos + bit_pos);
173            }
174            remaining -= popcount;
175            pos += 64;
176        }
177
178        None
179    }
180
181    /// Fast popcount using architecture-specific intrinsics
182    #[inline]
183    fn popcount64(word: u64) -> u32 {
184        // count_ones() compiles to POPCNT on x86_64 and efficient code on aarch64
185        word.count_ones()
186    }
187
188    /// Find position of k-th set bit within a word (1-indexed k)
189    #[inline]
190    fn select_in_word(word: u64, k: u32) -> u32 {
191        #[cfg(all(target_arch = "x86_64", target_feature = "bmi2"))]
192        {
193            // Use PDEP instruction for O(1) select on BMI2-capable CPUs
194            use std::arch::x86_64::_pdep_u64;
195            let mask = 1u64 << (k - 1);
196            let selected = unsafe { _pdep_u64(mask, word) };
197            selected.trailing_zeros()
198        }
199
200        #[cfg(not(all(target_arch = "x86_64", target_feature = "bmi2")))]
201        {
202            // Fallback: clear lowest set bits one by one
203            let mut w = word;
204            for _ in 0..k - 1 {
205                w &= w - 1; // Clear lowest set bit
206            }
207            w.trailing_zeros()
208        }
209    }
210
211    /// Find first element >= target (NextGEQ operation)
212    /// Returns (position, value) or None if no such element exists
213    pub fn next_geq(&self, target: u32) -> Option<(u32, u32)> {
214        if self.len == 0 {
215            return None;
216        }
217
218        let target = target as u64;
219
220        // Get the bucket (upper bits of target)
221        let target_upper = target >> self.lower_bit_width;
222        let _target_lower = if self.lower_bit_width == 0 {
223            0
224        } else {
225            target & ((1u64 << self.lower_bit_width) - 1)
226        };
227
228        // Find position via select0(target_upper)
229        let bucket_start = self.select0(target_upper as u32);
230
231        // Scan from bucket_start
232        for pos in bucket_start..self.len {
233            if let Some(val) = self.get(pos)
234                && val as u64 >= target
235            {
236                return Some((pos, val));
237            }
238        }
239
240        None
241    }
242
243    /// Find position after i-th zero bit (0-indexed)
244    /// Returns the index in the original sequence where the i-th bucket starts
245    fn select0(&self, i: u32) -> u32 {
246        if i == 0 {
247            return 0;
248        }
249
250        let mut zeros_seen = 0u32;
251        let mut ones_seen = 0u32;
252
253        for &word in &self.upper_bits {
254            let zeros_in_word = 64 - word.count_ones();
255            let ones_in_word = word.count_ones();
256
257            if zeros_seen + zeros_in_word >= i {
258                // Target zero is in this word
259                let mut w = word;
260                let mut bit_idx = 0u32;
261                while bit_idx < 64 {
262                    if (w & 1) == 0 {
263                        zeros_seen += 1;
264                        if zeros_seen == i {
265                            // Found the i-th zero, return count of 1s seen so far
266                            return ones_seen;
267                        }
268                    } else {
269                        ones_seen += 1;
270                    }
271                    w >>= 1;
272                    bit_idx += 1;
273                }
274            }
275            zeros_seen += zeros_in_word;
276            ones_seen += ones_in_word;
277        }
278
279        self.len
280    }
281
282    /// Serialize to bytes
283    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
284        writer.write_u32::<LittleEndian>(self.len)?;
285        writer.write_u64::<LittleEndian>(self.universe)?;
286        writer.write_u8(self.lower_bit_width)?;
287
288        // Write lower bits
289        writer.write_u32::<LittleEndian>(self.lower_bits.len() as u32)?;
290        for &word in &self.lower_bits {
291            writer.write_u64::<LittleEndian>(word)?;
292        }
293
294        // Write upper bits
295        writer.write_u32::<LittleEndian>(self.upper_bits.len() as u32)?;
296        for &word in &self.upper_bits {
297            writer.write_u64::<LittleEndian>(word)?;
298        }
299
300        Ok(())
301    }
302
303    /// Deserialize from bytes
304    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
305        let len = reader.read_u32::<LittleEndian>()?;
306        let universe = reader.read_u64::<LittleEndian>()?;
307        let lower_bit_width = reader.read_u8()?;
308
309        let lower_len = reader.read_u32::<LittleEndian>()? as usize;
310        let mut lower_bits = Vec::with_capacity(lower_len);
311        for _ in 0..lower_len {
312            lower_bits.push(reader.read_u64::<LittleEndian>()?);
313        }
314
315        let upper_len = reader.read_u32::<LittleEndian>()? as usize;
316        let mut upper_bits = Vec::with_capacity(upper_len);
317        for _ in 0..upper_len {
318            upper_bits.push(reader.read_u64::<LittleEndian>()?);
319        }
320
321        Ok(Self {
322            lower_bits,
323            upper_bits,
324            len,
325            universe,
326            lower_bit_width,
327        })
328    }
329
330    /// Get approximate size in bytes
331    pub fn size_bytes(&self) -> usize {
332        self.lower_bits.len() * 8 + self.upper_bits.len() * 8 + 16
333    }
334
335    /// Create an iterator
336    pub fn iter(&self) -> EliasFanoIterator<'_> {
337        EliasFanoIterator { ef: self, pos: 0 }
338    }
339}
340
341/// Iterator over Elias-Fano encoded sequence
342pub struct EliasFanoIterator<'a> {
343    ef: &'a EliasFano,
344    pos: u32,
345}
346
347impl<'a> Iterator for EliasFanoIterator<'a> {
348    type Item = u32;
349
350    fn next(&mut self) -> Option<Self::Item> {
351        if self.pos >= self.ef.len {
352            return None;
353        }
354
355        // Fast path: use cached upper position
356        let val = self.ef.get(self.pos)?;
357        self.pos += 1;
358        Some(val)
359    }
360
361    fn size_hint(&self) -> (usize, Option<usize>) {
362        let remaining = (self.ef.len - self.pos) as usize;
363        (remaining, Some(remaining))
364    }
365}
366
367impl<'a> ExactSizeIterator for EliasFanoIterator<'a> {}
368
369/// Block size for Elias-Fano BlockMax (matches bitpacking for consistency)
370pub const EF_BLOCK_SIZE: usize = 128;
371
372/// Block metadata for block-max pruning optimization
373#[derive(Debug, Clone)]
374pub struct EFBlockInfo {
375    /// First document ID in this block
376    pub first_doc_id: u32,
377    /// Last document ID in this block
378    pub last_doc_id: u32,
379    /// Maximum term frequency in this block
380    pub max_tf: u32,
381    /// Maximum BM25 score upper bound for this block
382    pub max_block_score: f32,
383    /// Starting position (index) in the posting list
384    pub start_pos: u32,
385    /// Number of documents in this block
386    pub num_docs: u16,
387}
388
389impl EFBlockInfo {
390    /// Serialize block info
391    pub fn serialize<W: std::io::Write>(&self, writer: &mut W) -> std::io::Result<()> {
392        use byteorder::WriteBytesExt;
393        writer.write_u32::<LittleEndian>(self.first_doc_id)?;
394        writer.write_u32::<LittleEndian>(self.last_doc_id)?;
395        writer.write_u32::<LittleEndian>(self.max_tf)?;
396        writer.write_f32::<LittleEndian>(self.max_block_score)?;
397        writer.write_u32::<LittleEndian>(self.start_pos)?;
398        writer.write_u16::<LittleEndian>(self.num_docs)?;
399        Ok(())
400    }
401
402    /// Deserialize block info
403    pub fn deserialize<R: std::io::Read>(reader: &mut R) -> std::io::Result<Self> {
404        use byteorder::ReadBytesExt;
405        Ok(Self {
406            first_doc_id: reader.read_u32::<LittleEndian>()?,
407            last_doc_id: reader.read_u32::<LittleEndian>()?,
408            max_tf: reader.read_u32::<LittleEndian>()?,
409            max_block_score: reader.read_f32::<LittleEndian>()?,
410            start_pos: reader.read_u32::<LittleEndian>()?,
411            num_docs: reader.read_u16::<LittleEndian>()?,
412        })
413    }
414}
415
416/// Elias-Fano encoded posting list with term frequencies and BlockMax support
417#[derive(Debug, Clone)]
418pub struct EliasFanoPostingList {
419    /// Document IDs (Elias-Fano encoded)
420    pub doc_ids: EliasFano,
421    /// Term frequencies (packed, using minimal bits)
422    pub term_freqs: Vec<u8>,
423    /// Bits per term frequency
424    pub tf_bits: u8,
425    /// Maximum term frequency (for BM25 upper bound)
426    pub max_tf: u32,
427    /// Block metadata for block-max pruning
428    pub blocks: Vec<EFBlockInfo>,
429    /// Global maximum score across all blocks
430    pub max_score: f32,
431}
432
433impl EliasFanoPostingList {
434    /// Create from doc_ids and term frequencies (without IDF - for backwards compatibility)
435    pub fn from_postings(doc_ids: &[u32], term_freqs: &[u32]) -> Self {
436        Self::from_postings_with_idf(doc_ids, term_freqs, 1.0)
437    }
438
439    /// Create from doc_ids and term frequencies with IDF for block-max scores
440    pub fn from_postings_with_idf(doc_ids: &[u32], term_freqs: &[u32], idf: f32) -> Self {
441        assert_eq!(doc_ids.len(), term_freqs.len());
442
443        if doc_ids.is_empty() {
444            return Self {
445                doc_ids: EliasFano::from_sorted_slice(&[]),
446                term_freqs: Vec::new(),
447                tf_bits: 0,
448                max_tf: 0,
449                blocks: Vec::new(),
450                max_score: 0.0,
451            };
452        }
453
454        let ef_doc_ids = EliasFano::from_sorted_slice(doc_ids);
455
456        // Find max TF to determine bit width
457        let max_tf = *term_freqs.iter().max().unwrap();
458        let tf_bits = if max_tf == 0 {
459            0
460        } else {
461            (32 - max_tf.leading_zeros()) as u8
462        };
463
464        // Pack term frequencies
465        let total_bits = doc_ids.len() * (tf_bits as usize);
466        let total_bytes = total_bits.div_ceil(8);
467        let mut packed_tfs = vec![0u8; total_bytes];
468
469        if tf_bits > 0 {
470            for (i, &tf) in term_freqs.iter().enumerate() {
471                let bit_pos = i * (tf_bits as usize);
472                let byte_idx = bit_pos / 8;
473                let bit_offset = bit_pos % 8;
474
475                // Store tf - 1 to save one bit (tf is always >= 1)
476                let val = tf.saturating_sub(1);
477
478                packed_tfs[byte_idx] |= (val as u8) << bit_offset;
479                if bit_offset + (tf_bits as usize) > 8 && byte_idx + 1 < packed_tfs.len() {
480                    packed_tfs[byte_idx + 1] |= (val >> (8 - bit_offset)) as u8;
481                }
482                if bit_offset + (tf_bits as usize) > 16 && byte_idx + 2 < packed_tfs.len() {
483                    packed_tfs[byte_idx + 2] |= (val >> (16 - bit_offset)) as u8;
484                }
485            }
486        }
487
488        // Build block metadata for block-max pruning
489        let mut blocks = Vec::new();
490        let mut max_score = 0.0f32;
491        let mut i = 0;
492
493        while i < doc_ids.len() {
494            let block_end = (i + EF_BLOCK_SIZE).min(doc_ids.len());
495            let block_doc_ids = &doc_ids[i..block_end];
496            let block_tfs = &term_freqs[i..block_end];
497
498            let block_max_tf = *block_tfs.iter().max().unwrap_or(&1);
499            let block_score = crate::query::bm25_upper_bound(block_max_tf as f32, idf);
500            max_score = max_score.max(block_score);
501
502            blocks.push(EFBlockInfo {
503                first_doc_id: block_doc_ids[0],
504                last_doc_id: *block_doc_ids.last().unwrap(),
505                max_tf: block_max_tf,
506                max_block_score: block_score,
507                start_pos: i as u32,
508                num_docs: (block_end - i) as u16,
509            });
510
511            i = block_end;
512        }
513
514        Self {
515            doc_ids: ef_doc_ids,
516            term_freqs: packed_tfs,
517            tf_bits,
518            max_tf,
519            blocks,
520            max_score,
521        }
522    }
523
524    /// Get term frequency at position
525    pub fn get_tf(&self, pos: u32) -> u32 {
526        if self.tf_bits == 0 || pos >= self.doc_ids.len() {
527            return 1;
528        }
529
530        let bit_pos = (pos as usize) * (self.tf_bits as usize);
531        let byte_idx = bit_pos / 8;
532        let bit_offset = bit_pos % 8;
533        let mask = (1u32 << self.tf_bits) - 1;
534
535        let mut val = (self.term_freqs[byte_idx] >> bit_offset) as u32;
536        if bit_offset + (self.tf_bits as usize) > 8 && byte_idx + 1 < self.term_freqs.len() {
537            val |= (self.term_freqs[byte_idx + 1] as u32) << (8 - bit_offset);
538        }
539        if bit_offset + (self.tf_bits as usize) > 16 && byte_idx + 2 < self.term_freqs.len() {
540            val |= (self.term_freqs[byte_idx + 2] as u32) << (16 - bit_offset);
541        }
542
543        (val & mask) + 1 // Add 1 back since we stored tf-1
544    }
545
546    /// Number of documents
547    pub fn len(&self) -> u32 {
548        self.doc_ids.len()
549    }
550
551    /// Check if empty
552    pub fn is_empty(&self) -> bool {
553        self.doc_ids.is_empty()
554    }
555
556    /// Serialize
557    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
558        self.doc_ids.serialize(writer)?;
559        writer.write_u8(self.tf_bits)?;
560        writer.write_u32::<LittleEndian>(self.max_tf)?;
561        writer.write_f32::<LittleEndian>(self.max_score)?;
562        writer.write_u32::<LittleEndian>(self.term_freqs.len() as u32)?;
563        writer.write_all(&self.term_freqs)?;
564
565        // Write block metadata
566        writer.write_u32::<LittleEndian>(self.blocks.len() as u32)?;
567        for block in &self.blocks {
568            block.serialize(writer)?;
569        }
570
571        Ok(())
572    }
573
574    /// Deserialize
575    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
576        let doc_ids = EliasFano::deserialize(reader)?;
577        let tf_bits = reader.read_u8()?;
578        let max_tf = reader.read_u32::<LittleEndian>()?;
579        let max_score = reader.read_f32::<LittleEndian>()?;
580        let tf_len = reader.read_u32::<LittleEndian>()? as usize;
581        let mut term_freqs = vec![0u8; tf_len];
582        reader.read_exact(&mut term_freqs)?;
583
584        // Read block metadata
585        let num_blocks = reader.read_u32::<LittleEndian>()? as usize;
586        let mut blocks = Vec::with_capacity(num_blocks);
587        for _ in 0..num_blocks {
588            blocks.push(EFBlockInfo::deserialize(reader)?);
589        }
590
591        Ok(Self {
592            doc_ids,
593            term_freqs,
594            tf_bits,
595            max_tf,
596            blocks,
597            max_score,
598        })
599    }
600
601    /// Get number of blocks
602    pub fn num_blocks(&self) -> usize {
603        self.blocks.len()
604    }
605
606    /// Get block containing position
607    pub fn block_for_pos(&self, pos: u32) -> usize {
608        (pos as usize) / EF_BLOCK_SIZE
609    }
610
611    /// Create iterator
612    pub fn iterator(&self) -> EliasFanoPostingIterator<'_> {
613        EliasFanoPostingIterator {
614            list: self,
615            pos: 0,
616            current_block: 0,
617        }
618    }
619}
620
621/// Iterator over Elias-Fano posting list with BlockMax support
622pub struct EliasFanoPostingIterator<'a> {
623    list: &'a EliasFanoPostingList,
624    pos: u32,
625    current_block: usize,
626}
627
628impl<'a> EliasFanoPostingIterator<'a> {
629    /// Current document ID
630    pub fn doc(&self) -> u32 {
631        self.list.doc_ids.get(self.pos).unwrap_or(u32::MAX)
632    }
633
634    /// Current term frequency
635    pub fn term_freq(&self) -> u32 {
636        self.list.get_tf(self.pos)
637    }
638
639    /// Advance to next document
640    pub fn advance(&mut self) -> u32 {
641        self.pos += 1;
642        // Update current block if we've moved past it
643        if !self.list.blocks.is_empty() {
644            let new_block = self.list.block_for_pos(self.pos);
645            if new_block < self.list.blocks.len() {
646                self.current_block = new_block;
647            }
648        }
649        self.doc()
650    }
651
652    /// Seek to first doc >= target
653    pub fn seek(&mut self, target: u32) -> u32 {
654        // Use block skip list for faster seeking
655        if !self.list.blocks.is_empty() {
656            // Binary search to find the right block
657            let block_idx = self.list.blocks[self.current_block..].binary_search_by(|block| {
658                if block.last_doc_id < target {
659                    std::cmp::Ordering::Less
660                } else if block.first_doc_id > target {
661                    std::cmp::Ordering::Greater
662                } else {
663                    std::cmp::Ordering::Equal
664                }
665            });
666
667            let target_block = match block_idx {
668                Ok(idx) => self.current_block + idx,
669                Err(idx) => {
670                    let abs_idx = self.current_block + idx;
671                    if abs_idx >= self.list.blocks.len() {
672                        self.pos = self.list.len();
673                        return u32::MAX;
674                    }
675                    abs_idx
676                }
677            };
678
679            // Jump to block start if it's ahead
680            if target_block > self.current_block {
681                self.current_block = target_block;
682                self.pos = self.list.blocks[target_block].start_pos;
683            }
684        }
685
686        // Use Elias-Fano's next_geq for efficient seeking within block
687        if let Some((pos, val)) = self.list.doc_ids.next_geq(target)
688            && pos >= self.pos
689        {
690            self.pos = pos;
691            if !self.list.blocks.is_empty() {
692                self.current_block = self.list.block_for_pos(pos);
693            }
694            return val;
695        }
696
697        // Linear scan from current position (fallback)
698        while self.pos < self.list.len() {
699            let doc = self.doc();
700            if doc >= target {
701                return doc;
702            }
703            self.pos += 1;
704        }
705
706        u32::MAX
707    }
708
709    /// Check if exhausted
710    pub fn is_exhausted(&self) -> bool {
711        self.pos >= self.list.len()
712    }
713
714    /// Get current block's max score (for block-max pruning)
715    pub fn current_block_max_score(&self) -> f32 {
716        if self.is_exhausted() || self.list.blocks.is_empty() {
717            return 0.0;
718        }
719        if self.current_block < self.list.blocks.len() {
720            self.list.blocks[self.current_block].max_block_score
721        } else {
722            0.0
723        }
724    }
725
726    /// Get current block's max term frequency (for BM25F recalculation)
727    pub fn current_block_max_tf(&self) -> u32 {
728        if self.is_exhausted() || self.list.blocks.is_empty() {
729            return 0;
730        }
731        if self.current_block < self.list.blocks.len() {
732            self.list.blocks[self.current_block].max_tf
733        } else {
734            0
735        }
736    }
737
738    /// Get max score for remaining blocks (for MaxScore optimization)
739    pub fn max_remaining_score(&self) -> f32 {
740        if self.is_exhausted() || self.list.blocks.is_empty() {
741            return 0.0;
742        }
743        self.list.blocks[self.current_block..]
744            .iter()
745            .map(|b| b.max_block_score)
746            .fold(0.0f32, |a, b| a.max(b))
747    }
748
749    /// Skip to next block containing doc >= target (for block-max pruning)
750    /// Returns (first_doc_in_block, block_max_score) or None if exhausted
751    pub fn skip_to_block_with_doc(&mut self, target: u32) -> Option<(u32, f32)> {
752        if self.list.blocks.is_empty() {
753            return None;
754        }
755
756        while self.current_block < self.list.blocks.len() {
757            let block = &self.list.blocks[self.current_block];
758            if block.last_doc_id >= target {
759                self.pos = block.start_pos;
760                return Some((block.first_doc_id, block.max_block_score));
761            }
762            self.current_block += 1;
763        }
764
765        self.pos = self.list.len();
766        None
767    }
768}
769
770#[cfg(test)]
771mod tests {
772    use super::*;
773
774    #[test]
775    fn test_elias_fano_basic() {
776        let values = vec![2, 3, 5, 7, 11, 13, 24];
777        let ef = EliasFano::from_sorted_slice(&values);
778
779        assert_eq!(ef.len(), 7);
780
781        for (i, &expected) in values.iter().enumerate() {
782            assert_eq!(ef.get(i as u32), Some(expected), "Mismatch at index {}", i);
783        }
784    }
785
786    #[test]
787    fn test_elias_fano_next_geq() {
788        let values = vec![10, 20, 30, 100, 200, 300, 1000];
789        let ef = EliasFano::from_sorted_slice(&values);
790
791        assert_eq!(ef.next_geq(5), Some((0, 10)));
792        assert_eq!(ef.next_geq(10), Some((0, 10)));
793        assert_eq!(ef.next_geq(15), Some((1, 20)));
794        assert_eq!(ef.next_geq(100), Some((3, 100)));
795        assert_eq!(ef.next_geq(500), Some((6, 1000)));
796        assert_eq!(ef.next_geq(2000), None);
797    }
798
799    #[test]
800    fn test_elias_fano_serialization() {
801        let values: Vec<u32> = (0..1000).map(|i| i * 3).collect();
802        let ef = EliasFano::from_sorted_slice(&values);
803
804        let mut buffer = Vec::new();
805        ef.serialize(&mut buffer).unwrap();
806
807        let restored = EliasFano::deserialize(&mut &buffer[..]).unwrap();
808
809        assert_eq!(restored.len(), ef.len());
810        for i in 0..ef.len() {
811            assert_eq!(restored.get(i), ef.get(i));
812        }
813    }
814
815    #[test]
816    fn test_elias_fano_posting_list() {
817        let doc_ids: Vec<u32> = vec![1, 5, 10, 50, 100, 500, 1000];
818        let term_freqs: Vec<u32> = vec![1, 2, 3, 4, 5, 6, 7];
819
820        let list = EliasFanoPostingList::from_postings(&doc_ids, &term_freqs);
821
822        assert_eq!(list.len(), 7);
823        assert_eq!(list.max_tf, 7);
824
825        let mut iter = list.iterator();
826        for (i, (&expected_doc, &expected_tf)) in doc_ids.iter().zip(term_freqs.iter()).enumerate()
827        {
828            assert_eq!(iter.doc(), expected_doc, "Doc mismatch at {}", i);
829            assert_eq!(iter.term_freq(), expected_tf, "TF mismatch at {}", i);
830            iter.advance();
831        }
832    }
833
834    #[test]
835    fn test_elias_fano_iterator_seek() {
836        let doc_ids: Vec<u32> = (0..100).map(|i| i * 10).collect();
837        let term_freqs: Vec<u32> = vec![1; 100];
838
839        let list = EliasFanoPostingList::from_postings(&doc_ids, &term_freqs);
840        let mut iter = list.iterator();
841
842        assert_eq!(iter.seek(55), 60);
843        assert_eq!(iter.seek(100), 100);
844        assert_eq!(iter.seek(999), u32::MAX);
845    }
846
847    #[test]
848    fn test_elias_fano_block_max() {
849        // Create a large posting list that spans multiple blocks
850        let doc_ids: Vec<u32> = (0..500).map(|i| i * 2).collect();
851        // Vary term frequencies so different blocks have different max_tf
852        let term_freqs: Vec<u32> = (0..500)
853            .map(|i| {
854                if i < 128 {
855                    1 // Block 0: max_tf = 1
856                } else if i < 256 {
857                    5 // Block 1: max_tf = 5
858                } else if i < 384 {
859                    10 // Block 2: max_tf = 10
860                } else {
861                    3 // Block 3: max_tf = 3
862                }
863            })
864            .collect();
865
866        let list = EliasFanoPostingList::from_postings_with_idf(&doc_ids, &term_freqs, 2.0);
867
868        // Should have 4 blocks (500 docs / 128 per block)
869        assert_eq!(list.num_blocks(), 4);
870        assert_eq!(list.blocks[0].max_tf, 1);
871        assert_eq!(list.blocks[1].max_tf, 5);
872        assert_eq!(list.blocks[2].max_tf, 10);
873        assert_eq!(list.blocks[3].max_tf, 3);
874
875        // Block 2 should have highest score (max_tf = 10)
876        assert!(list.blocks[2].max_block_score > list.blocks[0].max_block_score);
877        assert!(list.blocks[2].max_block_score > list.blocks[1].max_block_score);
878        assert!(list.blocks[2].max_block_score > list.blocks[3].max_block_score);
879
880        // Global max_score should equal block 2's score
881        assert_eq!(list.max_score, list.blocks[2].max_block_score);
882
883        // Test raw Elias-Fano next_geq first
884        let (pos, val) = list.doc_ids.next_geq(256).unwrap();
885        assert_eq!(val, 256, "next_geq(256) should return 256, got {}", val);
886        assert_eq!(pos, 128, "position of 256 should be 128, got {}", pos);
887
888        // Test iterator block-max methods
889        let mut iter = list.iterator();
890        assert_eq!(iter.current_block_max_tf(), 1); // Block 0
891
892        // Verify block boundaries
893        // Block 0: positions 0-127, doc_ids 0-254 (i*2 where i=0..127)
894        // Block 1: positions 128-255, doc_ids 256-510 (i*2 where i=128..255)
895        // Block 2: positions 256-383, doc_ids 512-766 (i*2 where i=256..383)
896        // Block 3: positions 384-499, doc_ids 768-998 (i*2 where i=384..499)
897        assert_eq!(list.blocks[0].first_doc_id, 0);
898        assert_eq!(list.blocks[0].last_doc_id, 254);
899        assert_eq!(list.blocks[1].first_doc_id, 256);
900        assert_eq!(list.blocks[1].last_doc_id, 510);
901
902        // Seek to block 1 - use a doc_id clearly in block 1's range
903        let doc = iter.seek(256); // first doc in block 1
904        assert_eq!(doc, 256, "seek(256) should return 256, got {}", doc);
905        // After seek, current_block should be updated
906        let block_tf = iter.current_block_max_tf();
907        assert_eq!(block_tf, 5, "block 1 max_tf should be 5, got {}", block_tf);
908
909        // Seek to block 2
910        let doc = iter.seek(512); // first doc in block 2
911        assert_eq!(doc, 512, "seek(512) should return 512, got {}", doc);
912        assert_eq!(iter.current_block_max_tf(), 10);
913    }
914
915    #[test]
916    fn test_elias_fano_block_max_serialization() {
917        let doc_ids: Vec<u32> = (0..300).map(|i| i * 3).collect();
918        let term_freqs: Vec<u32> = (0..300).map(|i| (i % 10) + 1).collect();
919
920        let list = EliasFanoPostingList::from_postings_with_idf(&doc_ids, &term_freqs, 1.5);
921
922        let mut buffer = Vec::new();
923        list.serialize(&mut buffer).unwrap();
924
925        let restored = EliasFanoPostingList::deserialize(&mut &buffer[..]).unwrap();
926
927        assert_eq!(restored.len(), list.len());
928        assert_eq!(restored.max_tf, list.max_tf);
929        assert_eq!(restored.max_score, list.max_score);
930        assert_eq!(restored.num_blocks(), list.num_blocks());
931
932        // Verify block metadata
933        for (orig, rest) in list.blocks.iter().zip(restored.blocks.iter()) {
934            assert_eq!(orig.first_doc_id, rest.first_doc_id);
935            assert_eq!(orig.last_doc_id, rest.last_doc_id);
936            assert_eq!(orig.max_tf, rest.max_tf);
937            assert_eq!(orig.max_block_score, rest.max_block_score);
938        }
939
940        // Verify iteration produces same results
941        let mut iter1 = list.iterator();
942        let mut iter2 = restored.iterator();
943
944        while !iter1.is_exhausted() {
945            assert_eq!(iter1.doc(), iter2.doc());
946            assert_eq!(iter1.term_freq(), iter2.term_freq());
947            iter1.advance();
948            iter2.advance();
949        }
950        assert!(iter2.is_exhausted());
951    }
952}