hermes_core/structures/
elias_fano.rs

1//! Elias-Fano encoding for compressed sorted integer sequences
2//!
3//! Elias-Fano is a quasi-succinct encoding that achieves near-optimal space
4//! while supporting O(1) access and fast NextGEQ operations.
5//!
6//! Space: 2n + n⌈log(m/n)⌉ bits where n = count, m = universe size
7//!
8//! Used by Google, Facebook, and Lucene for posting list compression.
9
10use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
11use std::io::{self, Read, Write};
12
13/// Elias-Fano encoded monotone sequence
14#[derive(Debug, Clone)]
15pub struct EliasFano {
16    /// Lower bits array (dense, l bits per element)
17    lower_bits: Vec<u64>,
18    /// Upper bits array (sparse, unary encoded)
19    upper_bits: Vec<u64>,
20    /// Number of elements
21    len: u32,
22    /// Universe size (max value + 1)
23    universe: u64,
24    /// Number of lower bits per element
25    lower_bit_width: u8,
26}
27
28impl EliasFano {
29    /// Create Elias-Fano encoding from a sorted sequence
30    pub fn from_sorted_slice(values: &[u32]) -> Self {
31        if values.is_empty() {
32            return Self {
33                lower_bits: Vec::new(),
34                upper_bits: Vec::new(),
35                len: 0,
36                universe: 0,
37                lower_bit_width: 0,
38            };
39        }
40
41        let n = values.len() as u64;
42        let max_val = *values.last().unwrap() as u64;
43        let universe = max_val + 1;
44
45        // Calculate lower bit width: l = max(0, ⌊log2(m/n)⌋)
46        let lower_bit_width = if n == 0 {
47            0
48        } else {
49            let ratio = universe.max(1) / n.max(1);
50            if ratio <= 1 {
51                0
52            } else {
53                (64 - ratio.leading_zeros() - 1) as u8
54            }
55        };
56
57        // Allocate lower bits array
58        let lower_bits_total = (n as usize) * (lower_bit_width as usize);
59        let lower_words = lower_bits_total.div_ceil(64);
60        let mut lower_bits = vec![0u64; lower_words];
61
62        // Allocate upper bits array
63        // Upper bits use unary encoding: n ones + (max_val >> l) zeros
64        let upper_bound = n + (max_val >> lower_bit_width) + 1;
65        let upper_words = (upper_bound as usize).div_ceil(64);
66        let mut upper_bits = vec![0u64; upper_words];
67
68        // Encode each value
69        let lower_mask = if lower_bit_width == 0 {
70            0
71        } else {
72            (1u64 << lower_bit_width) - 1
73        };
74
75        for (i, &val) in values.iter().enumerate() {
76            let val = val as u64;
77
78            // Store lower bits
79            if lower_bit_width > 0 {
80                let lower = val & lower_mask;
81                let bit_pos = i * (lower_bit_width as usize);
82                let word_idx = bit_pos / 64;
83                let bit_offset = bit_pos % 64;
84
85                lower_bits[word_idx] |= lower << bit_offset;
86                if bit_offset + (lower_bit_width as usize) > 64 && word_idx + 1 < lower_bits.len() {
87                    lower_bits[word_idx + 1] |= lower >> (64 - bit_offset);
88                }
89            }
90
91            // Store upper bits (unary: position = i + (val >> l))
92            let upper = val >> lower_bit_width;
93            let upper_pos = (i as u64) + upper;
94            let word_idx = (upper_pos / 64) as usize;
95            let bit_offset = upper_pos % 64;
96            if word_idx < upper_bits.len() {
97                upper_bits[word_idx] |= 1u64 << bit_offset;
98            }
99        }
100
101        Self {
102            lower_bits,
103            upper_bits,
104            len: values.len() as u32,
105            universe,
106            lower_bit_width,
107        }
108    }
109
110    /// Number of elements
111    #[inline]
112    pub fn len(&self) -> u32 {
113        self.len
114    }
115
116    /// Check if empty
117    #[inline]
118    pub fn is_empty(&self) -> bool {
119        self.len == 0
120    }
121
122    /// Access element at position i (0-indexed)
123    #[inline]
124    pub fn get(&self, i: u32) -> Option<u32> {
125        if i >= self.len {
126            return None;
127        }
128
129        let i = i as usize;
130
131        // Get lower bits
132        let lower = if self.lower_bit_width == 0 {
133            0u64
134        } else {
135            let bit_pos = i * (self.lower_bit_width as usize);
136            let word_idx = bit_pos / 64;
137            let bit_offset = bit_pos % 64;
138            let lower_mask = (1u64 << self.lower_bit_width) - 1;
139
140            let mut val = (self.lower_bits[word_idx] >> bit_offset) & lower_mask;
141            if bit_offset + (self.lower_bit_width as usize) > 64
142                && word_idx + 1 < self.lower_bits.len()
143            {
144                val |= (self.lower_bits[word_idx + 1] << (64 - bit_offset)) & lower_mask;
145            }
146            val
147        };
148
149        // Get upper bits via select1(i) - i
150        let select_pos = self.select1(i as u32)?;
151        let upper = (select_pos as u64) - (i as u64);
152
153        Some(((upper << self.lower_bit_width) | lower) as u32)
154    }
155
156    /// Find position of i-th set bit (0-indexed)
157    /// Uses POPCNT on x86_64 and efficient intrinsics on aarch64
158    fn select1(&self, i: u32) -> Option<u32> {
159        if i >= self.len {
160            return None;
161        }
162
163        let mut remaining = i + 1;
164        let mut pos = 0u32;
165
166        // Process words using architecture-specific popcount
167        for &word in self.upper_bits.iter() {
168            let popcount = Self::popcount64(word);
169            if popcount >= remaining {
170                // Target is in this word - find the remaining-th set bit
171                let bit_pos = Self::select_in_word(word, remaining);
172                return Some(pos + bit_pos);
173            }
174            remaining -= popcount;
175            pos += 64;
176        }
177
178        None
179    }
180
181    /// Fast popcount using architecture-specific intrinsics
182    #[inline]
183    fn popcount64(word: u64) -> u32 {
184        // count_ones() compiles to POPCNT on x86_64 and efficient code on aarch64
185        word.count_ones()
186    }
187
188    /// Find position of k-th set bit within a word (1-indexed k)
189    #[inline]
190    fn select_in_word(word: u64, k: u32) -> u32 {
191        #[cfg(all(target_arch = "x86_64", target_feature = "bmi2"))]
192        {
193            // Use PDEP instruction for O(1) select on BMI2-capable CPUs
194            use std::arch::x86_64::_pdep_u64;
195            let mask = 1u64 << (k - 1);
196            let selected = unsafe { _pdep_u64(mask, word) };
197            selected.trailing_zeros()
198        }
199
200        #[cfg(not(all(target_arch = "x86_64", target_feature = "bmi2")))]
201        {
202            // Fallback: clear lowest set bits one by one
203            let mut w = word;
204            for _ in 0..k - 1 {
205                w &= w - 1; // Clear lowest set bit
206            }
207            w.trailing_zeros()
208        }
209    }
210
211    /// Find first element >= target (NextGEQ operation)
212    /// Returns (position, value) or None if no such element exists
213    pub fn next_geq(&self, target: u32) -> Option<(u32, u32)> {
214        if self.len == 0 {
215            return None;
216        }
217
218        let target = target as u64;
219
220        // Get the bucket (upper bits of target)
221        let target_upper = target >> self.lower_bit_width;
222        let _target_lower = if self.lower_bit_width == 0 {
223            0
224        } else {
225            target & ((1u64 << self.lower_bit_width) - 1)
226        };
227
228        // Find position via select0(target_upper)
229        let bucket_start = self.select0(target_upper as u32);
230
231        // Scan from bucket_start
232        for pos in bucket_start..self.len {
233            if let Some(val) = self.get(pos)
234                && val as u64 >= target
235            {
236                return Some((pos, val));
237            }
238        }
239
240        None
241    }
242
243    /// Find position after i-th zero bit (0-indexed)
244    /// Returns the index in the original sequence where the i-th bucket starts
245    fn select0(&self, i: u32) -> u32 {
246        if i == 0 {
247            return 0;
248        }
249
250        let mut zeros_seen = 0u32;
251        let mut ones_seen = 0u32;
252
253        for &word in &self.upper_bits {
254            let zeros_in_word = 64 - word.count_ones();
255            let ones_in_word = word.count_ones();
256
257            if zeros_seen + zeros_in_word >= i {
258                // Target zero is in this word
259                let mut w = word;
260                let mut bit_idx = 0u32;
261                while bit_idx < 64 {
262                    if (w & 1) == 0 {
263                        zeros_seen += 1;
264                        if zeros_seen == i {
265                            // Found the i-th zero, return count of 1s seen so far
266                            return ones_seen;
267                        }
268                    } else {
269                        ones_seen += 1;
270                    }
271                    w >>= 1;
272                    bit_idx += 1;
273                }
274            }
275            zeros_seen += zeros_in_word;
276            ones_seen += ones_in_word;
277        }
278
279        self.len
280    }
281
282    /// Serialize to bytes
283    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
284        writer.write_u32::<LittleEndian>(self.len)?;
285        writer.write_u64::<LittleEndian>(self.universe)?;
286        writer.write_u8(self.lower_bit_width)?;
287
288        // Write lower bits
289        writer.write_u32::<LittleEndian>(self.lower_bits.len() as u32)?;
290        for &word in &self.lower_bits {
291            writer.write_u64::<LittleEndian>(word)?;
292        }
293
294        // Write upper bits
295        writer.write_u32::<LittleEndian>(self.upper_bits.len() as u32)?;
296        for &word in &self.upper_bits {
297            writer.write_u64::<LittleEndian>(word)?;
298        }
299
300        Ok(())
301    }
302
303    /// Deserialize from bytes
304    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
305        let len = reader.read_u32::<LittleEndian>()?;
306        let universe = reader.read_u64::<LittleEndian>()?;
307        let lower_bit_width = reader.read_u8()?;
308
309        let lower_len = reader.read_u32::<LittleEndian>()? as usize;
310        let mut lower_bits = Vec::with_capacity(lower_len);
311        for _ in 0..lower_len {
312            lower_bits.push(reader.read_u64::<LittleEndian>()?);
313        }
314
315        let upper_len = reader.read_u32::<LittleEndian>()? as usize;
316        let mut upper_bits = Vec::with_capacity(upper_len);
317        for _ in 0..upper_len {
318            upper_bits.push(reader.read_u64::<LittleEndian>()?);
319        }
320
321        Ok(Self {
322            lower_bits,
323            upper_bits,
324            len,
325            universe,
326            lower_bit_width,
327        })
328    }
329
330    /// Get approximate size in bytes
331    pub fn size_bytes(&self) -> usize {
332        self.lower_bits.len() * 8 + self.upper_bits.len() * 8 + 16
333    }
334
335    /// Create an iterator
336    pub fn iter(&self) -> EliasFanoIterator<'_> {
337        EliasFanoIterator { ef: self, pos: 0 }
338    }
339}
340
341/// Iterator over Elias-Fano encoded sequence
342pub struct EliasFanoIterator<'a> {
343    ef: &'a EliasFano,
344    pos: u32,
345}
346
347impl<'a> Iterator for EliasFanoIterator<'a> {
348    type Item = u32;
349
350    fn next(&mut self) -> Option<Self::Item> {
351        if self.pos >= self.ef.len {
352            return None;
353        }
354
355        // Fast path: use cached upper position
356        let val = self.ef.get(self.pos)?;
357        self.pos += 1;
358        Some(val)
359    }
360
361    fn size_hint(&self) -> (usize, Option<usize>) {
362        let remaining = (self.ef.len - self.pos) as usize;
363        (remaining, Some(remaining))
364    }
365}
366
367impl<'a> ExactSizeIterator for EliasFanoIterator<'a> {}
368
369/// Block size for Elias-Fano BlockMax (matches bitpacking for consistency)
370pub const EF_BLOCK_SIZE: usize = 128;
371
372/// Block metadata for BlockMax WAND optimization
373#[derive(Debug, Clone)]
374pub struct EFBlockInfo {
375    /// First document ID in this block
376    pub first_doc_id: u32,
377    /// Last document ID in this block
378    pub last_doc_id: u32,
379    /// Maximum term frequency in this block
380    pub max_tf: u32,
381    /// Maximum BM25 score upper bound for this block
382    pub max_block_score: f32,
383    /// Starting position (index) in the posting list
384    pub start_pos: u32,
385    /// Number of documents in this block
386    pub num_docs: u16,
387}
388
389impl EFBlockInfo {
390    /// Serialize block info
391    pub fn serialize<W: std::io::Write>(&self, writer: &mut W) -> std::io::Result<()> {
392        use byteorder::WriteBytesExt;
393        writer.write_u32::<LittleEndian>(self.first_doc_id)?;
394        writer.write_u32::<LittleEndian>(self.last_doc_id)?;
395        writer.write_u32::<LittleEndian>(self.max_tf)?;
396        writer.write_f32::<LittleEndian>(self.max_block_score)?;
397        writer.write_u32::<LittleEndian>(self.start_pos)?;
398        writer.write_u16::<LittleEndian>(self.num_docs)?;
399        Ok(())
400    }
401
402    /// Deserialize block info
403    pub fn deserialize<R: std::io::Read>(reader: &mut R) -> std::io::Result<Self> {
404        use byteorder::ReadBytesExt;
405        Ok(Self {
406            first_doc_id: reader.read_u32::<LittleEndian>()?,
407            last_doc_id: reader.read_u32::<LittleEndian>()?,
408            max_tf: reader.read_u32::<LittleEndian>()?,
409            max_block_score: reader.read_f32::<LittleEndian>()?,
410            start_pos: reader.read_u32::<LittleEndian>()?,
411            num_docs: reader.read_u16::<LittleEndian>()?,
412        })
413    }
414}
415
416/// Elias-Fano encoded posting list with term frequencies and BlockMax support
417#[derive(Debug, Clone)]
418pub struct EliasFanoPostingList {
419    /// Document IDs (Elias-Fano encoded)
420    pub doc_ids: EliasFano,
421    /// Term frequencies (packed, using minimal bits)
422    pub term_freqs: Vec<u8>,
423    /// Bits per term frequency
424    pub tf_bits: u8,
425    /// Maximum term frequency (for BM25 upper bound)
426    pub max_tf: u32,
427    /// Block metadata for BlockMax WAND
428    pub blocks: Vec<EFBlockInfo>,
429    /// Global maximum score across all blocks
430    pub max_score: f32,
431}
432
433impl EliasFanoPostingList {
434    /// BM25 parameters for block-max score calculation
435    const K1: f32 = 1.2;
436    const B: f32 = 0.75;
437
438    /// Compute BM25 upper bound score for a given max_tf and IDF
439    #[inline]
440    pub fn compute_bm25_upper_bound(max_tf: u32, idf: f32) -> f32 {
441        let tf = max_tf as f32;
442        // Conservative upper bound: assume dl=0, so length_norm = 1 - b = 0.25
443        let min_length_norm = 1.0 - Self::B;
444        let tf_norm = (tf * (Self::K1 + 1.0)) / (tf + Self::K1 * min_length_norm);
445        idf * tf_norm
446    }
447
448    /// Create from doc_ids and term frequencies (without IDF - for backwards compatibility)
449    pub fn from_postings(doc_ids: &[u32], term_freqs: &[u32]) -> Self {
450        Self::from_postings_with_idf(doc_ids, term_freqs, 1.0)
451    }
452
453    /// Create from doc_ids and term frequencies with IDF for block-max scores
454    pub fn from_postings_with_idf(doc_ids: &[u32], term_freqs: &[u32], idf: f32) -> Self {
455        assert_eq!(doc_ids.len(), term_freqs.len());
456
457        if doc_ids.is_empty() {
458            return Self {
459                doc_ids: EliasFano::from_sorted_slice(&[]),
460                term_freqs: Vec::new(),
461                tf_bits: 0,
462                max_tf: 0,
463                blocks: Vec::new(),
464                max_score: 0.0,
465            };
466        }
467
468        let ef_doc_ids = EliasFano::from_sorted_slice(doc_ids);
469
470        // Find max TF to determine bit width
471        let max_tf = *term_freqs.iter().max().unwrap();
472        let tf_bits = if max_tf == 0 {
473            0
474        } else {
475            (32 - max_tf.leading_zeros()) as u8
476        };
477
478        // Pack term frequencies
479        let total_bits = doc_ids.len() * (tf_bits as usize);
480        let total_bytes = total_bits.div_ceil(8);
481        let mut packed_tfs = vec![0u8; total_bytes];
482
483        if tf_bits > 0 {
484            for (i, &tf) in term_freqs.iter().enumerate() {
485                let bit_pos = i * (tf_bits as usize);
486                let byte_idx = bit_pos / 8;
487                let bit_offset = bit_pos % 8;
488
489                // Store tf - 1 to save one bit (tf is always >= 1)
490                let val = tf.saturating_sub(1);
491
492                packed_tfs[byte_idx] |= (val as u8) << bit_offset;
493                if bit_offset + (tf_bits as usize) > 8 && byte_idx + 1 < packed_tfs.len() {
494                    packed_tfs[byte_idx + 1] |= (val >> (8 - bit_offset)) as u8;
495                }
496                if bit_offset + (tf_bits as usize) > 16 && byte_idx + 2 < packed_tfs.len() {
497                    packed_tfs[byte_idx + 2] |= (val >> (16 - bit_offset)) as u8;
498                }
499            }
500        }
501
502        // Build block metadata for BlockMax WAND
503        let mut blocks = Vec::new();
504        let mut max_score = 0.0f32;
505        let mut i = 0;
506
507        while i < doc_ids.len() {
508            let block_end = (i + EF_BLOCK_SIZE).min(doc_ids.len());
509            let block_doc_ids = &doc_ids[i..block_end];
510            let block_tfs = &term_freqs[i..block_end];
511
512            let block_max_tf = *block_tfs.iter().max().unwrap_or(&1);
513            let block_score = Self::compute_bm25_upper_bound(block_max_tf, idf);
514            max_score = max_score.max(block_score);
515
516            blocks.push(EFBlockInfo {
517                first_doc_id: block_doc_ids[0],
518                last_doc_id: *block_doc_ids.last().unwrap(),
519                max_tf: block_max_tf,
520                max_block_score: block_score,
521                start_pos: i as u32,
522                num_docs: (block_end - i) as u16,
523            });
524
525            i = block_end;
526        }
527
528        Self {
529            doc_ids: ef_doc_ids,
530            term_freqs: packed_tfs,
531            tf_bits,
532            max_tf,
533            blocks,
534            max_score,
535        }
536    }
537
538    /// Get term frequency at position
539    pub fn get_tf(&self, pos: u32) -> u32 {
540        if self.tf_bits == 0 || pos >= self.doc_ids.len() {
541            return 1;
542        }
543
544        let bit_pos = (pos as usize) * (self.tf_bits as usize);
545        let byte_idx = bit_pos / 8;
546        let bit_offset = bit_pos % 8;
547        let mask = (1u32 << self.tf_bits) - 1;
548
549        let mut val = (self.term_freqs[byte_idx] >> bit_offset) as u32;
550        if bit_offset + (self.tf_bits as usize) > 8 && byte_idx + 1 < self.term_freqs.len() {
551            val |= (self.term_freqs[byte_idx + 1] as u32) << (8 - bit_offset);
552        }
553        if bit_offset + (self.tf_bits as usize) > 16 && byte_idx + 2 < self.term_freqs.len() {
554            val |= (self.term_freqs[byte_idx + 2] as u32) << (16 - bit_offset);
555        }
556
557        (val & mask) + 1 // Add 1 back since we stored tf-1
558    }
559
560    /// Number of documents
561    pub fn len(&self) -> u32 {
562        self.doc_ids.len()
563    }
564
565    /// Check if empty
566    pub fn is_empty(&self) -> bool {
567        self.doc_ids.is_empty()
568    }
569
570    /// Serialize
571    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
572        self.doc_ids.serialize(writer)?;
573        writer.write_u8(self.tf_bits)?;
574        writer.write_u32::<LittleEndian>(self.max_tf)?;
575        writer.write_f32::<LittleEndian>(self.max_score)?;
576        writer.write_u32::<LittleEndian>(self.term_freqs.len() as u32)?;
577        writer.write_all(&self.term_freqs)?;
578
579        // Write block metadata
580        writer.write_u32::<LittleEndian>(self.blocks.len() as u32)?;
581        for block in &self.blocks {
582            block.serialize(writer)?;
583        }
584
585        Ok(())
586    }
587
588    /// Deserialize
589    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
590        let doc_ids = EliasFano::deserialize(reader)?;
591        let tf_bits = reader.read_u8()?;
592        let max_tf = reader.read_u32::<LittleEndian>()?;
593        let max_score = reader.read_f32::<LittleEndian>()?;
594        let tf_len = reader.read_u32::<LittleEndian>()? as usize;
595        let mut term_freqs = vec![0u8; tf_len];
596        reader.read_exact(&mut term_freqs)?;
597
598        // Read block metadata
599        let num_blocks = reader.read_u32::<LittleEndian>()? as usize;
600        let mut blocks = Vec::with_capacity(num_blocks);
601        for _ in 0..num_blocks {
602            blocks.push(EFBlockInfo::deserialize(reader)?);
603        }
604
605        Ok(Self {
606            doc_ids,
607            term_freqs,
608            tf_bits,
609            max_tf,
610            blocks,
611            max_score,
612        })
613    }
614
615    /// Get number of blocks
616    pub fn num_blocks(&self) -> usize {
617        self.blocks.len()
618    }
619
620    /// Get block containing position
621    pub fn block_for_pos(&self, pos: u32) -> usize {
622        (pos as usize) / EF_BLOCK_SIZE
623    }
624
625    /// Create iterator
626    pub fn iterator(&self) -> EliasFanoPostingIterator<'_> {
627        EliasFanoPostingIterator {
628            list: self,
629            pos: 0,
630            current_block: 0,
631        }
632    }
633}
634
635/// Iterator over Elias-Fano posting list with BlockMax support
636pub struct EliasFanoPostingIterator<'a> {
637    list: &'a EliasFanoPostingList,
638    pos: u32,
639    current_block: usize,
640}
641
642impl<'a> EliasFanoPostingIterator<'a> {
643    /// Current document ID
644    pub fn doc(&self) -> u32 {
645        self.list.doc_ids.get(self.pos).unwrap_or(u32::MAX)
646    }
647
648    /// Current term frequency
649    pub fn term_freq(&self) -> u32 {
650        self.list.get_tf(self.pos)
651    }
652
653    /// Advance to next document
654    pub fn advance(&mut self) -> u32 {
655        self.pos += 1;
656        // Update current block if we've moved past it
657        if !self.list.blocks.is_empty() {
658            let new_block = self.list.block_for_pos(self.pos);
659            if new_block < self.list.blocks.len() {
660                self.current_block = new_block;
661            }
662        }
663        self.doc()
664    }
665
666    /// Seek to first doc >= target
667    pub fn seek(&mut self, target: u32) -> u32 {
668        // Use block skip list for faster seeking
669        if !self.list.blocks.is_empty() {
670            // Binary search to find the right block
671            let block_idx = self.list.blocks[self.current_block..].binary_search_by(|block| {
672                if block.last_doc_id < target {
673                    std::cmp::Ordering::Less
674                } else if block.first_doc_id > target {
675                    std::cmp::Ordering::Greater
676                } else {
677                    std::cmp::Ordering::Equal
678                }
679            });
680
681            let target_block = match block_idx {
682                Ok(idx) => self.current_block + idx,
683                Err(idx) => {
684                    let abs_idx = self.current_block + idx;
685                    if abs_idx >= self.list.blocks.len() {
686                        self.pos = self.list.len();
687                        return u32::MAX;
688                    }
689                    abs_idx
690                }
691            };
692
693            // Jump to block start if it's ahead
694            if target_block > self.current_block {
695                self.current_block = target_block;
696                self.pos = self.list.blocks[target_block].start_pos;
697            }
698        }
699
700        // Use Elias-Fano's next_geq for efficient seeking within block
701        if let Some((pos, val)) = self.list.doc_ids.next_geq(target)
702            && pos >= self.pos
703        {
704            self.pos = pos;
705            if !self.list.blocks.is_empty() {
706                self.current_block = self.list.block_for_pos(pos);
707            }
708            return val;
709        }
710
711        // Linear scan from current position (fallback)
712        while self.pos < self.list.len() {
713            let doc = self.doc();
714            if doc >= target {
715                return doc;
716            }
717            self.pos += 1;
718        }
719
720        u32::MAX
721    }
722
723    /// Check if exhausted
724    pub fn is_exhausted(&self) -> bool {
725        self.pos >= self.list.len()
726    }
727
728    /// Get current block's max score (for BlockMax WAND)
729    pub fn current_block_max_score(&self) -> f32 {
730        if self.is_exhausted() || self.list.blocks.is_empty() {
731            return 0.0;
732        }
733        if self.current_block < self.list.blocks.len() {
734            self.list.blocks[self.current_block].max_block_score
735        } else {
736            0.0
737        }
738    }
739
740    /// Get current block's max term frequency (for BM25F recalculation)
741    pub fn current_block_max_tf(&self) -> u32 {
742        if self.is_exhausted() || self.list.blocks.is_empty() {
743            return 0;
744        }
745        if self.current_block < self.list.blocks.len() {
746            self.list.blocks[self.current_block].max_tf
747        } else {
748            0
749        }
750    }
751
752    /// Get max score for remaining blocks (for MaxScore optimization)
753    pub fn max_remaining_score(&self) -> f32 {
754        if self.is_exhausted() || self.list.blocks.is_empty() {
755            return 0.0;
756        }
757        self.list.blocks[self.current_block..]
758            .iter()
759            .map(|b| b.max_block_score)
760            .fold(0.0f32, |a, b| a.max(b))
761    }
762
763    /// Skip to next block containing doc >= target (for BlockWAND)
764    /// Returns (first_doc_in_block, block_max_score) or None if exhausted
765    pub fn skip_to_block_with_doc(&mut self, target: u32) -> Option<(u32, f32)> {
766        if self.list.blocks.is_empty() {
767            return None;
768        }
769
770        while self.current_block < self.list.blocks.len() {
771            let block = &self.list.blocks[self.current_block];
772            if block.last_doc_id >= target {
773                self.pos = block.start_pos;
774                return Some((block.first_doc_id, block.max_block_score));
775            }
776            self.current_block += 1;
777        }
778
779        self.pos = self.list.len();
780        None
781    }
782}
783
784#[cfg(test)]
785mod tests {
786    use super::*;
787
788    #[test]
789    fn test_elias_fano_basic() {
790        let values = vec![2, 3, 5, 7, 11, 13, 24];
791        let ef = EliasFano::from_sorted_slice(&values);
792
793        assert_eq!(ef.len(), 7);
794
795        for (i, &expected) in values.iter().enumerate() {
796            assert_eq!(ef.get(i as u32), Some(expected), "Mismatch at index {}", i);
797        }
798    }
799
800    #[test]
801    fn test_elias_fano_next_geq() {
802        let values = vec![10, 20, 30, 100, 200, 300, 1000];
803        let ef = EliasFano::from_sorted_slice(&values);
804
805        assert_eq!(ef.next_geq(5), Some((0, 10)));
806        assert_eq!(ef.next_geq(10), Some((0, 10)));
807        assert_eq!(ef.next_geq(15), Some((1, 20)));
808        assert_eq!(ef.next_geq(100), Some((3, 100)));
809        assert_eq!(ef.next_geq(500), Some((6, 1000)));
810        assert_eq!(ef.next_geq(2000), None);
811    }
812
813    #[test]
814    fn test_elias_fano_serialization() {
815        let values: Vec<u32> = (0..1000).map(|i| i * 3).collect();
816        let ef = EliasFano::from_sorted_slice(&values);
817
818        let mut buffer = Vec::new();
819        ef.serialize(&mut buffer).unwrap();
820
821        let restored = EliasFano::deserialize(&mut &buffer[..]).unwrap();
822
823        assert_eq!(restored.len(), ef.len());
824        for i in 0..ef.len() {
825            assert_eq!(restored.get(i), ef.get(i));
826        }
827    }
828
829    #[test]
830    fn test_elias_fano_posting_list() {
831        let doc_ids: Vec<u32> = vec![1, 5, 10, 50, 100, 500, 1000];
832        let term_freqs: Vec<u32> = vec![1, 2, 3, 4, 5, 6, 7];
833
834        let list = EliasFanoPostingList::from_postings(&doc_ids, &term_freqs);
835
836        assert_eq!(list.len(), 7);
837        assert_eq!(list.max_tf, 7);
838
839        let mut iter = list.iterator();
840        for (i, (&expected_doc, &expected_tf)) in doc_ids.iter().zip(term_freqs.iter()).enumerate()
841        {
842            assert_eq!(iter.doc(), expected_doc, "Doc mismatch at {}", i);
843            assert_eq!(iter.term_freq(), expected_tf, "TF mismatch at {}", i);
844            iter.advance();
845        }
846    }
847
848    #[test]
849    fn test_elias_fano_iterator_seek() {
850        let doc_ids: Vec<u32> = (0..100).map(|i| i * 10).collect();
851        let term_freqs: Vec<u32> = vec![1; 100];
852
853        let list = EliasFanoPostingList::from_postings(&doc_ids, &term_freqs);
854        let mut iter = list.iterator();
855
856        assert_eq!(iter.seek(55), 60);
857        assert_eq!(iter.seek(100), 100);
858        assert_eq!(iter.seek(999), u32::MAX);
859    }
860
861    #[test]
862    fn test_elias_fano_block_max() {
863        // Create a large posting list that spans multiple blocks
864        let doc_ids: Vec<u32> = (0..500).map(|i| i * 2).collect();
865        // Vary term frequencies so different blocks have different max_tf
866        let term_freqs: Vec<u32> = (0..500)
867            .map(|i| {
868                if i < 128 {
869                    1 // Block 0: max_tf = 1
870                } else if i < 256 {
871                    5 // Block 1: max_tf = 5
872                } else if i < 384 {
873                    10 // Block 2: max_tf = 10
874                } else {
875                    3 // Block 3: max_tf = 3
876                }
877            })
878            .collect();
879
880        let list = EliasFanoPostingList::from_postings_with_idf(&doc_ids, &term_freqs, 2.0);
881
882        // Should have 4 blocks (500 docs / 128 per block)
883        assert_eq!(list.num_blocks(), 4);
884        assert_eq!(list.blocks[0].max_tf, 1);
885        assert_eq!(list.blocks[1].max_tf, 5);
886        assert_eq!(list.blocks[2].max_tf, 10);
887        assert_eq!(list.blocks[3].max_tf, 3);
888
889        // Block 2 should have highest score (max_tf = 10)
890        assert!(list.blocks[2].max_block_score > list.blocks[0].max_block_score);
891        assert!(list.blocks[2].max_block_score > list.blocks[1].max_block_score);
892        assert!(list.blocks[2].max_block_score > list.blocks[3].max_block_score);
893
894        // Global max_score should equal block 2's score
895        assert_eq!(list.max_score, list.blocks[2].max_block_score);
896
897        // Test raw Elias-Fano next_geq first
898        let (pos, val) = list.doc_ids.next_geq(256).unwrap();
899        assert_eq!(val, 256, "next_geq(256) should return 256, got {}", val);
900        assert_eq!(pos, 128, "position of 256 should be 128, got {}", pos);
901
902        // Test iterator block-max methods
903        let mut iter = list.iterator();
904        assert_eq!(iter.current_block_max_tf(), 1); // Block 0
905
906        // Verify block boundaries
907        // Block 0: positions 0-127, doc_ids 0-254 (i*2 where i=0..127)
908        // Block 1: positions 128-255, doc_ids 256-510 (i*2 where i=128..255)
909        // Block 2: positions 256-383, doc_ids 512-766 (i*2 where i=256..383)
910        // Block 3: positions 384-499, doc_ids 768-998 (i*2 where i=384..499)
911        assert_eq!(list.blocks[0].first_doc_id, 0);
912        assert_eq!(list.blocks[0].last_doc_id, 254);
913        assert_eq!(list.blocks[1].first_doc_id, 256);
914        assert_eq!(list.blocks[1].last_doc_id, 510);
915
916        // Seek to block 1 - use a doc_id clearly in block 1's range
917        let doc = iter.seek(256); // first doc in block 1
918        assert_eq!(doc, 256, "seek(256) should return 256, got {}", doc);
919        // After seek, current_block should be updated
920        let block_tf = iter.current_block_max_tf();
921        assert_eq!(block_tf, 5, "block 1 max_tf should be 5, got {}", block_tf);
922
923        // Seek to block 2
924        let doc = iter.seek(512); // first doc in block 2
925        assert_eq!(doc, 512, "seek(512) should return 512, got {}", doc);
926        assert_eq!(iter.current_block_max_tf(), 10);
927    }
928
929    #[test]
930    fn test_elias_fano_block_max_serialization() {
931        let doc_ids: Vec<u32> = (0..300).map(|i| i * 3).collect();
932        let term_freqs: Vec<u32> = (0..300).map(|i| (i % 10) + 1).collect();
933
934        let list = EliasFanoPostingList::from_postings_with_idf(&doc_ids, &term_freqs, 1.5);
935
936        let mut buffer = Vec::new();
937        list.serialize(&mut buffer).unwrap();
938
939        let restored = EliasFanoPostingList::deserialize(&mut &buffer[..]).unwrap();
940
941        assert_eq!(restored.len(), list.len());
942        assert_eq!(restored.max_tf, list.max_tf);
943        assert_eq!(restored.max_score, list.max_score);
944        assert_eq!(restored.num_blocks(), list.num_blocks());
945
946        // Verify block metadata
947        for (orig, rest) in list.blocks.iter().zip(restored.blocks.iter()) {
948            assert_eq!(orig.first_doc_id, rest.first_doc_id);
949            assert_eq!(orig.last_doc_id, rest.last_doc_id);
950            assert_eq!(orig.max_tf, rest.max_tf);
951            assert_eq!(orig.max_block_score, rest.max_block_score);
952        }
953
954        // Verify iteration produces same results
955        let mut iter1 = list.iterator();
956        let mut iter2 = restored.iterator();
957
958        while !iter1.is_exhausted() {
959            assert_eq!(iter1.doc(), iter2.doc());
960            assert_eq!(iter1.term_freq(), iter2.term_freq());
961            iter1.advance();
962            iter2.advance();
963        }
964        assert!(iter2.is_exhausted());
965    }
966}