hermes_core/structures/
elias_fano.rs

1//! Elias-Fano encoding for compressed sorted integer sequences
2//!
3//! Elias-Fano is a quasi-succinct encoding that achieves near-optimal space
4//! while supporting O(1) access and fast NextGEQ operations.
5//!
6//! Space: 2n + n⌈log(m/n)⌉ bits where n = count, m = universe size
7//!
8//! Used by Google, Facebook, and Lucene for posting list compression.
9
10use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
11use std::io::{self, Read, Write};
12
13/// Elias-Fano encoded monotone sequence
14#[derive(Debug, Clone)]
15pub struct EliasFano {
16    /// Lower bits array (dense, l bits per element)
17    lower_bits: Vec<u64>,
18    /// Upper bits array (sparse, unary encoded)
19    upper_bits: Vec<u64>,
20    /// Number of elements
21    len: u32,
22    /// Universe size (max value + 1)
23    universe: u64,
24    /// Number of lower bits per element
25    lower_bit_width: u8,
26}
27
28impl EliasFano {
29    /// Create Elias-Fano encoding from a sorted sequence
30    pub fn from_sorted_slice(values: &[u32]) -> Self {
31        if values.is_empty() {
32            return Self {
33                lower_bits: Vec::new(),
34                upper_bits: Vec::new(),
35                len: 0,
36                universe: 0,
37                lower_bit_width: 0,
38            };
39        }
40
41        let n = values.len() as u64;
42        let max_val = *values.last().unwrap() as u64;
43        let universe = max_val + 1;
44
45        // Calculate lower bit width: l = max(0, ⌊log2(m/n)⌋)
46        let lower_bit_width = if n == 0 {
47            0
48        } else {
49            let ratio = universe.max(1) / n.max(1);
50            if ratio <= 1 {
51                0
52            } else {
53                (64 - ratio.leading_zeros() - 1) as u8
54            }
55        };
56
57        // Allocate lower bits array
58        let lower_bits_total = (n as usize) * (lower_bit_width as usize);
59        let lower_words = lower_bits_total.div_ceil(64);
60        let mut lower_bits = vec![0u64; lower_words];
61
62        // Allocate upper bits array
63        // Upper bits use unary encoding: n ones + (max_val >> l) zeros
64        let upper_bound = n + (max_val >> lower_bit_width) + 1;
65        let upper_words = (upper_bound as usize).div_ceil(64);
66        let mut upper_bits = vec![0u64; upper_words];
67
68        // Encode each value
69        let lower_mask = if lower_bit_width == 0 {
70            0
71        } else {
72            (1u64 << lower_bit_width) - 1
73        };
74
75        for (i, &val) in values.iter().enumerate() {
76            let val = val as u64;
77
78            // Store lower bits
79            if lower_bit_width > 0 {
80                let lower = val & lower_mask;
81                let bit_pos = i * (lower_bit_width as usize);
82                let word_idx = bit_pos / 64;
83                let bit_offset = bit_pos % 64;
84
85                lower_bits[word_idx] |= lower << bit_offset;
86                if bit_offset + (lower_bit_width as usize) > 64 && word_idx + 1 < lower_bits.len() {
87                    lower_bits[word_idx + 1] |= lower >> (64 - bit_offset);
88                }
89            }
90
91            // Store upper bits (unary: position = i + (val >> l))
92            let upper = val >> lower_bit_width;
93            let upper_pos = (i as u64) + upper;
94            let word_idx = (upper_pos / 64) as usize;
95            let bit_offset = upper_pos % 64;
96            if word_idx < upper_bits.len() {
97                upper_bits[word_idx] |= 1u64 << bit_offset;
98            }
99        }
100
101        Self {
102            lower_bits,
103            upper_bits,
104            len: values.len() as u32,
105            universe,
106            lower_bit_width,
107        }
108    }
109
110    /// Number of elements
111    #[inline]
112    pub fn len(&self) -> u32 {
113        self.len
114    }
115
116    /// Check if empty
117    #[inline]
118    pub fn is_empty(&self) -> bool {
119        self.len == 0
120    }
121
122    /// Access element at position i (0-indexed)
123    #[inline]
124    pub fn get(&self, i: u32) -> Option<u32> {
125        if i >= self.len {
126            return None;
127        }
128
129        let i = i as usize;
130
131        // Get lower bits
132        let lower = if self.lower_bit_width == 0 {
133            0u64
134        } else {
135            let bit_pos = i * (self.lower_bit_width as usize);
136            let word_idx = bit_pos / 64;
137            let bit_offset = bit_pos % 64;
138            let lower_mask = (1u64 << self.lower_bit_width) - 1;
139
140            let mut val = (self.lower_bits[word_idx] >> bit_offset) & lower_mask;
141            if bit_offset + (self.lower_bit_width as usize) > 64
142                && word_idx + 1 < self.lower_bits.len()
143            {
144                val |= (self.lower_bits[word_idx + 1] << (64 - bit_offset)) & lower_mask;
145            }
146            val
147        };
148
149        // Get upper bits via select1(i) - i
150        let select_pos = self.select1(i as u32)?;
151        let upper = (select_pos as u64) - (i as u64);
152
153        Some(((upper << self.lower_bit_width) | lower) as u32)
154    }
155
156    /// Find position of i-th set bit (0-indexed)
157    fn select1(&self, i: u32) -> Option<u32> {
158        if i >= self.len {
159            return None;
160        }
161
162        let mut remaining = i + 1;
163        let mut pos = 0u32;
164
165        for &word in self.upper_bits.iter() {
166            let popcount = word.count_ones();
167            if popcount >= remaining {
168                // Target is in this word
169                let mut w = word;
170                for _ in 0..remaining - 1 {
171                    w &= w - 1; // Clear lowest set bit
172                }
173                let bit_pos = w.trailing_zeros();
174                return Some(pos + bit_pos);
175            }
176            remaining -= popcount;
177            pos += 64;
178        }
179
180        None
181    }
182
183    /// Find first element >= target (NextGEQ operation)
184    /// Returns (position, value) or None if no such element exists
185    pub fn next_geq(&self, target: u32) -> Option<(u32, u32)> {
186        if self.len == 0 {
187            return None;
188        }
189
190        let target = target as u64;
191
192        // Get the bucket (upper bits of target)
193        let target_upper = target >> self.lower_bit_width;
194        let _target_lower = if self.lower_bit_width == 0 {
195            0
196        } else {
197            target & ((1u64 << self.lower_bit_width) - 1)
198        };
199
200        // Find position via select0(target_upper)
201        let bucket_start = self.select0(target_upper as u32);
202
203        // Scan from bucket_start
204        for pos in bucket_start..self.len {
205            if let Some(val) = self.get(pos)
206                && val as u64 >= target
207            {
208                return Some((pos, val));
209            }
210        }
211
212        None
213    }
214
215    /// Find position after i-th zero bit (0-indexed)
216    /// Returns the index in the original sequence where the i-th bucket starts
217    fn select0(&self, i: u32) -> u32 {
218        if i == 0 {
219            return 0;
220        }
221
222        let mut zeros_seen = 0u32;
223        let mut ones_seen = 0u32;
224
225        for &word in &self.upper_bits {
226            let zeros_in_word = 64 - word.count_ones();
227            let ones_in_word = word.count_ones();
228
229            if zeros_seen + zeros_in_word >= i {
230                // Target zero is in this word
231                let mut w = word;
232                let mut bit_idx = 0u32;
233                while bit_idx < 64 {
234                    if (w & 1) == 0 {
235                        zeros_seen += 1;
236                        if zeros_seen == i {
237                            // Found the i-th zero, return count of 1s seen so far
238                            return ones_seen;
239                        }
240                    } else {
241                        ones_seen += 1;
242                    }
243                    w >>= 1;
244                    bit_idx += 1;
245                }
246            }
247            zeros_seen += zeros_in_word;
248            ones_seen += ones_in_word;
249        }
250
251        self.len
252    }
253
254    /// Serialize to bytes
255    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
256        writer.write_u32::<LittleEndian>(self.len)?;
257        writer.write_u64::<LittleEndian>(self.universe)?;
258        writer.write_u8(self.lower_bit_width)?;
259
260        // Write lower bits
261        writer.write_u32::<LittleEndian>(self.lower_bits.len() as u32)?;
262        for &word in &self.lower_bits {
263            writer.write_u64::<LittleEndian>(word)?;
264        }
265
266        // Write upper bits
267        writer.write_u32::<LittleEndian>(self.upper_bits.len() as u32)?;
268        for &word in &self.upper_bits {
269            writer.write_u64::<LittleEndian>(word)?;
270        }
271
272        Ok(())
273    }
274
275    /// Deserialize from bytes
276    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
277        let len = reader.read_u32::<LittleEndian>()?;
278        let universe = reader.read_u64::<LittleEndian>()?;
279        let lower_bit_width = reader.read_u8()?;
280
281        let lower_len = reader.read_u32::<LittleEndian>()? as usize;
282        let mut lower_bits = Vec::with_capacity(lower_len);
283        for _ in 0..lower_len {
284            lower_bits.push(reader.read_u64::<LittleEndian>()?);
285        }
286
287        let upper_len = reader.read_u32::<LittleEndian>()? as usize;
288        let mut upper_bits = Vec::with_capacity(upper_len);
289        for _ in 0..upper_len {
290            upper_bits.push(reader.read_u64::<LittleEndian>()?);
291        }
292
293        Ok(Self {
294            lower_bits,
295            upper_bits,
296            len,
297            universe,
298            lower_bit_width,
299        })
300    }
301
302    /// Get approximate size in bytes
303    pub fn size_bytes(&self) -> usize {
304        self.lower_bits.len() * 8 + self.upper_bits.len() * 8 + 16
305    }
306
307    /// Create an iterator
308    pub fn iter(&self) -> EliasFanoIterator<'_> {
309        EliasFanoIterator { ef: self, pos: 0 }
310    }
311}
312
313/// Iterator over Elias-Fano encoded sequence
314pub struct EliasFanoIterator<'a> {
315    ef: &'a EliasFano,
316    pos: u32,
317}
318
319impl<'a> Iterator for EliasFanoIterator<'a> {
320    type Item = u32;
321
322    fn next(&mut self) -> Option<Self::Item> {
323        if self.pos >= self.ef.len {
324            return None;
325        }
326
327        // Fast path: use cached upper position
328        let val = self.ef.get(self.pos)?;
329        self.pos += 1;
330        Some(val)
331    }
332
333    fn size_hint(&self) -> (usize, Option<usize>) {
334        let remaining = (self.ef.len - self.pos) as usize;
335        (remaining, Some(remaining))
336    }
337}
338
339impl<'a> ExactSizeIterator for EliasFanoIterator<'a> {}
340
341/// Block size for Elias-Fano BlockMax (matches bitpacking for consistency)
342pub const EF_BLOCK_SIZE: usize = 128;
343
344/// Block metadata for BlockMax WAND optimization
345#[derive(Debug, Clone)]
346pub struct EFBlockInfo {
347    /// First document ID in this block
348    pub first_doc_id: u32,
349    /// Last document ID in this block
350    pub last_doc_id: u32,
351    /// Maximum term frequency in this block
352    pub max_tf: u32,
353    /// Maximum BM25 score upper bound for this block
354    pub max_block_score: f32,
355    /// Starting position (index) in the posting list
356    pub start_pos: u32,
357    /// Number of documents in this block
358    pub num_docs: u16,
359}
360
361impl EFBlockInfo {
362    /// Serialize block info
363    pub fn serialize<W: std::io::Write>(&self, writer: &mut W) -> std::io::Result<()> {
364        use byteorder::WriteBytesExt;
365        writer.write_u32::<LittleEndian>(self.first_doc_id)?;
366        writer.write_u32::<LittleEndian>(self.last_doc_id)?;
367        writer.write_u32::<LittleEndian>(self.max_tf)?;
368        writer.write_f32::<LittleEndian>(self.max_block_score)?;
369        writer.write_u32::<LittleEndian>(self.start_pos)?;
370        writer.write_u16::<LittleEndian>(self.num_docs)?;
371        Ok(())
372    }
373
374    /// Deserialize block info
375    pub fn deserialize<R: std::io::Read>(reader: &mut R) -> std::io::Result<Self> {
376        use byteorder::ReadBytesExt;
377        Ok(Self {
378            first_doc_id: reader.read_u32::<LittleEndian>()?,
379            last_doc_id: reader.read_u32::<LittleEndian>()?,
380            max_tf: reader.read_u32::<LittleEndian>()?,
381            max_block_score: reader.read_f32::<LittleEndian>()?,
382            start_pos: reader.read_u32::<LittleEndian>()?,
383            num_docs: reader.read_u16::<LittleEndian>()?,
384        })
385    }
386}
387
388/// Elias-Fano encoded posting list with term frequencies and BlockMax support
389#[derive(Debug, Clone)]
390pub struct EliasFanoPostingList {
391    /// Document IDs (Elias-Fano encoded)
392    pub doc_ids: EliasFano,
393    /// Term frequencies (packed, using minimal bits)
394    pub term_freqs: Vec<u8>,
395    /// Bits per term frequency
396    pub tf_bits: u8,
397    /// Maximum term frequency (for BM25 upper bound)
398    pub max_tf: u32,
399    /// Block metadata for BlockMax WAND
400    pub blocks: Vec<EFBlockInfo>,
401    /// Global maximum score across all blocks
402    pub max_score: f32,
403}
404
405impl EliasFanoPostingList {
406    /// BM25 parameters for block-max score calculation
407    const K1: f32 = 1.2;
408    const B: f32 = 0.75;
409
410    /// Compute BM25 upper bound score for a given max_tf and IDF
411    #[inline]
412    pub fn compute_bm25_upper_bound(max_tf: u32, idf: f32) -> f32 {
413        let tf = max_tf as f32;
414        // Conservative upper bound: assume dl=0, so length_norm = 1 - b = 0.25
415        let min_length_norm = 1.0 - Self::B;
416        let tf_norm = (tf * (Self::K1 + 1.0)) / (tf + Self::K1 * min_length_norm);
417        idf * tf_norm
418    }
419
420    /// Create from doc_ids and term frequencies (without IDF - for backwards compatibility)
421    pub fn from_postings(doc_ids: &[u32], term_freqs: &[u32]) -> Self {
422        Self::from_postings_with_idf(doc_ids, term_freqs, 1.0)
423    }
424
425    /// Create from doc_ids and term frequencies with IDF for block-max scores
426    pub fn from_postings_with_idf(doc_ids: &[u32], term_freqs: &[u32], idf: f32) -> Self {
427        assert_eq!(doc_ids.len(), term_freqs.len());
428
429        if doc_ids.is_empty() {
430            return Self {
431                doc_ids: EliasFano::from_sorted_slice(&[]),
432                term_freqs: Vec::new(),
433                tf_bits: 0,
434                max_tf: 0,
435                blocks: Vec::new(),
436                max_score: 0.0,
437            };
438        }
439
440        let ef_doc_ids = EliasFano::from_sorted_slice(doc_ids);
441
442        // Find max TF to determine bit width
443        let max_tf = *term_freqs.iter().max().unwrap();
444        let tf_bits = if max_tf == 0 {
445            0
446        } else {
447            (32 - max_tf.leading_zeros()) as u8
448        };
449
450        // Pack term frequencies
451        let total_bits = doc_ids.len() * (tf_bits as usize);
452        let total_bytes = total_bits.div_ceil(8);
453        let mut packed_tfs = vec![0u8; total_bytes];
454
455        if tf_bits > 0 {
456            for (i, &tf) in term_freqs.iter().enumerate() {
457                let bit_pos = i * (tf_bits as usize);
458                let byte_idx = bit_pos / 8;
459                let bit_offset = bit_pos % 8;
460
461                // Store tf - 1 to save one bit (tf is always >= 1)
462                let val = tf.saturating_sub(1);
463
464                packed_tfs[byte_idx] |= (val as u8) << bit_offset;
465                if bit_offset + (tf_bits as usize) > 8 && byte_idx + 1 < packed_tfs.len() {
466                    packed_tfs[byte_idx + 1] |= (val >> (8 - bit_offset)) as u8;
467                }
468                if bit_offset + (tf_bits as usize) > 16 && byte_idx + 2 < packed_tfs.len() {
469                    packed_tfs[byte_idx + 2] |= (val >> (16 - bit_offset)) as u8;
470                }
471            }
472        }
473
474        // Build block metadata for BlockMax WAND
475        let mut blocks = Vec::new();
476        let mut max_score = 0.0f32;
477        let mut i = 0;
478
479        while i < doc_ids.len() {
480            let block_end = (i + EF_BLOCK_SIZE).min(doc_ids.len());
481            let block_doc_ids = &doc_ids[i..block_end];
482            let block_tfs = &term_freqs[i..block_end];
483
484            let block_max_tf = *block_tfs.iter().max().unwrap_or(&1);
485            let block_score = Self::compute_bm25_upper_bound(block_max_tf, idf);
486            max_score = max_score.max(block_score);
487
488            blocks.push(EFBlockInfo {
489                first_doc_id: block_doc_ids[0],
490                last_doc_id: *block_doc_ids.last().unwrap(),
491                max_tf: block_max_tf,
492                max_block_score: block_score,
493                start_pos: i as u32,
494                num_docs: (block_end - i) as u16,
495            });
496
497            i = block_end;
498        }
499
500        Self {
501            doc_ids: ef_doc_ids,
502            term_freqs: packed_tfs,
503            tf_bits,
504            max_tf,
505            blocks,
506            max_score,
507        }
508    }
509
510    /// Get term frequency at position
511    pub fn get_tf(&self, pos: u32) -> u32 {
512        if self.tf_bits == 0 || pos >= self.doc_ids.len() {
513            return 1;
514        }
515
516        let bit_pos = (pos as usize) * (self.tf_bits as usize);
517        let byte_idx = bit_pos / 8;
518        let bit_offset = bit_pos % 8;
519        let mask = (1u32 << self.tf_bits) - 1;
520
521        let mut val = (self.term_freqs[byte_idx] >> bit_offset) as u32;
522        if bit_offset + (self.tf_bits as usize) > 8 && byte_idx + 1 < self.term_freqs.len() {
523            val |= (self.term_freqs[byte_idx + 1] as u32) << (8 - bit_offset);
524        }
525        if bit_offset + (self.tf_bits as usize) > 16 && byte_idx + 2 < self.term_freqs.len() {
526            val |= (self.term_freqs[byte_idx + 2] as u32) << (16 - bit_offset);
527        }
528
529        (val & mask) + 1 // Add 1 back since we stored tf-1
530    }
531
532    /// Number of documents
533    pub fn len(&self) -> u32 {
534        self.doc_ids.len()
535    }
536
537    /// Check if empty
538    pub fn is_empty(&self) -> bool {
539        self.doc_ids.is_empty()
540    }
541
542    /// Serialize
543    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
544        self.doc_ids.serialize(writer)?;
545        writer.write_u8(self.tf_bits)?;
546        writer.write_u32::<LittleEndian>(self.max_tf)?;
547        writer.write_f32::<LittleEndian>(self.max_score)?;
548        writer.write_u32::<LittleEndian>(self.term_freqs.len() as u32)?;
549        writer.write_all(&self.term_freqs)?;
550
551        // Write block metadata
552        writer.write_u32::<LittleEndian>(self.blocks.len() as u32)?;
553        for block in &self.blocks {
554            block.serialize(writer)?;
555        }
556
557        Ok(())
558    }
559
560    /// Deserialize
561    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
562        let doc_ids = EliasFano::deserialize(reader)?;
563        let tf_bits = reader.read_u8()?;
564        let max_tf = reader.read_u32::<LittleEndian>()?;
565        let max_score = reader.read_f32::<LittleEndian>()?;
566        let tf_len = reader.read_u32::<LittleEndian>()? as usize;
567        let mut term_freqs = vec![0u8; tf_len];
568        reader.read_exact(&mut term_freqs)?;
569
570        // Read block metadata
571        let num_blocks = reader.read_u32::<LittleEndian>()? as usize;
572        let mut blocks = Vec::with_capacity(num_blocks);
573        for _ in 0..num_blocks {
574            blocks.push(EFBlockInfo::deserialize(reader)?);
575        }
576
577        Ok(Self {
578            doc_ids,
579            term_freqs,
580            tf_bits,
581            max_tf,
582            blocks,
583            max_score,
584        })
585    }
586
587    /// Get number of blocks
588    pub fn num_blocks(&self) -> usize {
589        self.blocks.len()
590    }
591
592    /// Get block containing position
593    pub fn block_for_pos(&self, pos: u32) -> usize {
594        (pos as usize) / EF_BLOCK_SIZE
595    }
596
597    /// Create iterator
598    pub fn iterator(&self) -> EliasFanoPostingIterator<'_> {
599        EliasFanoPostingIterator {
600            list: self,
601            pos: 0,
602            current_block: 0,
603        }
604    }
605}
606
607/// Iterator over Elias-Fano posting list with BlockMax support
608pub struct EliasFanoPostingIterator<'a> {
609    list: &'a EliasFanoPostingList,
610    pos: u32,
611    current_block: usize,
612}
613
614impl<'a> EliasFanoPostingIterator<'a> {
615    /// Current document ID
616    pub fn doc(&self) -> u32 {
617        self.list.doc_ids.get(self.pos).unwrap_or(u32::MAX)
618    }
619
620    /// Current term frequency
621    pub fn term_freq(&self) -> u32 {
622        self.list.get_tf(self.pos)
623    }
624
625    /// Advance to next document
626    pub fn advance(&mut self) -> u32 {
627        self.pos += 1;
628        // Update current block if we've moved past it
629        if !self.list.blocks.is_empty() {
630            let new_block = self.list.block_for_pos(self.pos);
631            if new_block < self.list.blocks.len() {
632                self.current_block = new_block;
633            }
634        }
635        self.doc()
636    }
637
638    /// Seek to first doc >= target
639    pub fn seek(&mut self, target: u32) -> u32 {
640        // Use block skip list for faster seeking
641        if !self.list.blocks.is_empty() {
642            // Binary search to find the right block
643            let block_idx = self.list.blocks[self.current_block..].binary_search_by(|block| {
644                if block.last_doc_id < target {
645                    std::cmp::Ordering::Less
646                } else if block.first_doc_id > target {
647                    std::cmp::Ordering::Greater
648                } else {
649                    std::cmp::Ordering::Equal
650                }
651            });
652
653            let target_block = match block_idx {
654                Ok(idx) => self.current_block + idx,
655                Err(idx) => {
656                    let abs_idx = self.current_block + idx;
657                    if abs_idx >= self.list.blocks.len() {
658                        self.pos = self.list.len();
659                        return u32::MAX;
660                    }
661                    abs_idx
662                }
663            };
664
665            // Jump to block start if it's ahead
666            if target_block > self.current_block {
667                self.current_block = target_block;
668                self.pos = self.list.blocks[target_block].start_pos;
669            }
670        }
671
672        // Use Elias-Fano's next_geq for efficient seeking within block
673        if let Some((pos, val)) = self.list.doc_ids.next_geq(target)
674            && pos >= self.pos
675        {
676            self.pos = pos;
677            if !self.list.blocks.is_empty() {
678                self.current_block = self.list.block_for_pos(pos);
679            }
680            return val;
681        }
682
683        // Linear scan from current position (fallback)
684        while self.pos < self.list.len() {
685            let doc = self.doc();
686            if doc >= target {
687                return doc;
688            }
689            self.pos += 1;
690        }
691
692        u32::MAX
693    }
694
695    /// Check if exhausted
696    pub fn is_exhausted(&self) -> bool {
697        self.pos >= self.list.len()
698    }
699
700    /// Get current block's max score (for BlockMax WAND)
701    pub fn current_block_max_score(&self) -> f32 {
702        if self.is_exhausted() || self.list.blocks.is_empty() {
703            return 0.0;
704        }
705        if self.current_block < self.list.blocks.len() {
706            self.list.blocks[self.current_block].max_block_score
707        } else {
708            0.0
709        }
710    }
711
712    /// Get current block's max term frequency (for BM25F recalculation)
713    pub fn current_block_max_tf(&self) -> u32 {
714        if self.is_exhausted() || self.list.blocks.is_empty() {
715            return 0;
716        }
717        if self.current_block < self.list.blocks.len() {
718            self.list.blocks[self.current_block].max_tf
719        } else {
720            0
721        }
722    }
723
724    /// Get max score for remaining blocks (for MaxScore optimization)
725    pub fn max_remaining_score(&self) -> f32 {
726        if self.is_exhausted() || self.list.blocks.is_empty() {
727            return 0.0;
728        }
729        self.list.blocks[self.current_block..]
730            .iter()
731            .map(|b| b.max_block_score)
732            .fold(0.0f32, |a, b| a.max(b))
733    }
734
735    /// Skip to next block containing doc >= target (for BlockWAND)
736    /// Returns (first_doc_in_block, block_max_score) or None if exhausted
737    pub fn skip_to_block_with_doc(&mut self, target: u32) -> Option<(u32, f32)> {
738        if self.list.blocks.is_empty() {
739            return None;
740        }
741
742        while self.current_block < self.list.blocks.len() {
743            let block = &self.list.blocks[self.current_block];
744            if block.last_doc_id >= target {
745                self.pos = block.start_pos;
746                return Some((block.first_doc_id, block.max_block_score));
747            }
748            self.current_block += 1;
749        }
750
751        self.pos = self.list.len();
752        None
753    }
754}
755
756#[cfg(test)]
757mod tests {
758    use super::*;
759
760    #[test]
761    fn test_elias_fano_basic() {
762        let values = vec![2, 3, 5, 7, 11, 13, 24];
763        let ef = EliasFano::from_sorted_slice(&values);
764
765        assert_eq!(ef.len(), 7);
766
767        for (i, &expected) in values.iter().enumerate() {
768            assert_eq!(ef.get(i as u32), Some(expected), "Mismatch at index {}", i);
769        }
770    }
771
772    #[test]
773    fn test_elias_fano_next_geq() {
774        let values = vec![10, 20, 30, 100, 200, 300, 1000];
775        let ef = EliasFano::from_sorted_slice(&values);
776
777        assert_eq!(ef.next_geq(5), Some((0, 10)));
778        assert_eq!(ef.next_geq(10), Some((0, 10)));
779        assert_eq!(ef.next_geq(15), Some((1, 20)));
780        assert_eq!(ef.next_geq(100), Some((3, 100)));
781        assert_eq!(ef.next_geq(500), Some((6, 1000)));
782        assert_eq!(ef.next_geq(2000), None);
783    }
784
785    #[test]
786    fn test_elias_fano_serialization() {
787        let values: Vec<u32> = (0..1000).map(|i| i * 3).collect();
788        let ef = EliasFano::from_sorted_slice(&values);
789
790        let mut buffer = Vec::new();
791        ef.serialize(&mut buffer).unwrap();
792
793        let restored = EliasFano::deserialize(&mut &buffer[..]).unwrap();
794
795        assert_eq!(restored.len(), ef.len());
796        for i in 0..ef.len() {
797            assert_eq!(restored.get(i), ef.get(i));
798        }
799    }
800
801    #[test]
802    fn test_elias_fano_posting_list() {
803        let doc_ids: Vec<u32> = vec![1, 5, 10, 50, 100, 500, 1000];
804        let term_freqs: Vec<u32> = vec![1, 2, 3, 4, 5, 6, 7];
805
806        let list = EliasFanoPostingList::from_postings(&doc_ids, &term_freqs);
807
808        assert_eq!(list.len(), 7);
809        assert_eq!(list.max_tf, 7);
810
811        let mut iter = list.iterator();
812        for (i, (&expected_doc, &expected_tf)) in doc_ids.iter().zip(term_freqs.iter()).enumerate()
813        {
814            assert_eq!(iter.doc(), expected_doc, "Doc mismatch at {}", i);
815            assert_eq!(iter.term_freq(), expected_tf, "TF mismatch at {}", i);
816            iter.advance();
817        }
818    }
819
820    #[test]
821    fn test_elias_fano_iterator_seek() {
822        let doc_ids: Vec<u32> = (0..100).map(|i| i * 10).collect();
823        let term_freqs: Vec<u32> = vec![1; 100];
824
825        let list = EliasFanoPostingList::from_postings(&doc_ids, &term_freqs);
826        let mut iter = list.iterator();
827
828        assert_eq!(iter.seek(55), 60);
829        assert_eq!(iter.seek(100), 100);
830        assert_eq!(iter.seek(999), u32::MAX);
831    }
832
833    #[test]
834    fn test_elias_fano_block_max() {
835        // Create a large posting list that spans multiple blocks
836        let doc_ids: Vec<u32> = (0..500).map(|i| i * 2).collect();
837        // Vary term frequencies so different blocks have different max_tf
838        let term_freqs: Vec<u32> = (0..500)
839            .map(|i| {
840                if i < 128 {
841                    1 // Block 0: max_tf = 1
842                } else if i < 256 {
843                    5 // Block 1: max_tf = 5
844                } else if i < 384 {
845                    10 // Block 2: max_tf = 10
846                } else {
847                    3 // Block 3: max_tf = 3
848                }
849            })
850            .collect();
851
852        let list = EliasFanoPostingList::from_postings_with_idf(&doc_ids, &term_freqs, 2.0);
853
854        // Should have 4 blocks (500 docs / 128 per block)
855        assert_eq!(list.num_blocks(), 4);
856        assert_eq!(list.blocks[0].max_tf, 1);
857        assert_eq!(list.blocks[1].max_tf, 5);
858        assert_eq!(list.blocks[2].max_tf, 10);
859        assert_eq!(list.blocks[3].max_tf, 3);
860
861        // Block 2 should have highest score (max_tf = 10)
862        assert!(list.blocks[2].max_block_score > list.blocks[0].max_block_score);
863        assert!(list.blocks[2].max_block_score > list.blocks[1].max_block_score);
864        assert!(list.blocks[2].max_block_score > list.blocks[3].max_block_score);
865
866        // Global max_score should equal block 2's score
867        assert_eq!(list.max_score, list.blocks[2].max_block_score);
868
869        // Test raw Elias-Fano next_geq first
870        let (pos, val) = list.doc_ids.next_geq(256).unwrap();
871        assert_eq!(val, 256, "next_geq(256) should return 256, got {}", val);
872        assert_eq!(pos, 128, "position of 256 should be 128, got {}", pos);
873
874        // Test iterator block-max methods
875        let mut iter = list.iterator();
876        assert_eq!(iter.current_block_max_tf(), 1); // Block 0
877
878        // Verify block boundaries
879        // Block 0: positions 0-127, doc_ids 0-254 (i*2 where i=0..127)
880        // Block 1: positions 128-255, doc_ids 256-510 (i*2 where i=128..255)
881        // Block 2: positions 256-383, doc_ids 512-766 (i*2 where i=256..383)
882        // Block 3: positions 384-499, doc_ids 768-998 (i*2 where i=384..499)
883        assert_eq!(list.blocks[0].first_doc_id, 0);
884        assert_eq!(list.blocks[0].last_doc_id, 254);
885        assert_eq!(list.blocks[1].first_doc_id, 256);
886        assert_eq!(list.blocks[1].last_doc_id, 510);
887
888        // Seek to block 1 - use a doc_id clearly in block 1's range
889        let doc = iter.seek(256); // first doc in block 1
890        assert_eq!(doc, 256, "seek(256) should return 256, got {}", doc);
891        // After seek, current_block should be updated
892        let block_tf = iter.current_block_max_tf();
893        assert_eq!(block_tf, 5, "block 1 max_tf should be 5, got {}", block_tf);
894
895        // Seek to block 2
896        let doc = iter.seek(512); // first doc in block 2
897        assert_eq!(doc, 512, "seek(512) should return 512, got {}", doc);
898        assert_eq!(iter.current_block_max_tf(), 10);
899    }
900
901    #[test]
902    fn test_elias_fano_block_max_serialization() {
903        let doc_ids: Vec<u32> = (0..300).map(|i| i * 3).collect();
904        let term_freqs: Vec<u32> = (0..300).map(|i| (i % 10) + 1).collect();
905
906        let list = EliasFanoPostingList::from_postings_with_idf(&doc_ids, &term_freqs, 1.5);
907
908        let mut buffer = Vec::new();
909        list.serialize(&mut buffer).unwrap();
910
911        let restored = EliasFanoPostingList::deserialize(&mut &buffer[..]).unwrap();
912
913        assert_eq!(restored.len(), list.len());
914        assert_eq!(restored.max_tf, list.max_tf);
915        assert_eq!(restored.max_score, list.max_score);
916        assert_eq!(restored.num_blocks(), list.num_blocks());
917
918        // Verify block metadata
919        for (orig, rest) in list.blocks.iter().zip(restored.blocks.iter()) {
920            assert_eq!(orig.first_doc_id, rest.first_doc_id);
921            assert_eq!(orig.last_doc_id, rest.last_doc_id);
922            assert_eq!(orig.max_tf, rest.max_tf);
923            assert_eq!(orig.max_block_score, rest.max_block_score);
924        }
925
926        // Verify iteration produces same results
927        let mut iter1 = list.iterator();
928        let mut iter2 = restored.iterator();
929
930        while !iter1.is_exhausted() {
931            assert_eq!(iter1.doc(), iter2.doc());
932            assert_eq!(iter1.term_freq(), iter2.term_freq());
933            iter1.advance();
934            iter2.advance();
935        }
936        assert!(iter2.is_exhausted());
937    }
938}