hermes_core/structures/
partitioned_ef.rs

1//! Partitioned Elias-Fano (PEF) encoding
2//!
3//! Based on Ottaviano & Venturini (2014) "Partitioned Elias-Fano Indexes"
4//!
5//! Key improvements over standard Elias-Fano:
6//! - **Optimal partitioning**: Divides sequence into chunks with (1+ε)-optimal compression
7//! - **Better compression**: 30-40% smaller than standard EF on typical data
8//! - **Fast random access**: O(1) access within partitions
9//! - **Efficient NextGEQ**: Uses partition endpoints for fast seeking
10//!
11//! The algorithm finds optimal partition boundaries that minimize total encoding size.
12//! Each partition is encoded with its own Elias-Fano instance using local parameters.
13
14use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
15use std::io::{self, Read, Write};
16
17/// Minimum partition size (smaller partitions have too much overhead)
18const MIN_PARTITION_SIZE: usize = 64;
19
20/// Maximum partition size (larger partitions lose locality benefits)
21const MAX_PARTITION_SIZE: usize = 512;
22
23/// A single partition in the PEF structure
24#[derive(Debug, Clone)]
25pub struct EFPartition {
26    /// Lower bits array
27    lower_bits: Vec<u64>,
28    /// Upper bits array (unary encoded)
29    upper_bits: Vec<u64>,
30    /// Number of elements in this partition
31    len: u32,
32    /// First value in partition (absolute)
33    first_value: u32,
34    /// Last value in partition (absolute)
35    last_value: u32,
36    /// Number of lower bits per element
37    lower_bit_width: u8,
38}
39
40impl EFPartition {
41    /// Create a partition from a sorted slice
42    /// Values are stored relative to first_value for better compression
43    pub fn from_sorted_slice(values: &[u32]) -> Self {
44        if values.is_empty() {
45            return Self {
46                lower_bits: Vec::new(),
47                upper_bits: Vec::new(),
48                len: 0,
49                first_value: 0,
50                last_value: 0,
51                lower_bit_width: 0,
52            };
53        }
54
55        let first_value = values[0];
56        let last_value = *values.last().unwrap();
57        let n = values.len() as u64;
58
59        // Local universe: encode values relative to first_value
60        let local_universe = last_value - first_value + 1;
61
62        // Calculate lower bit width using local universe
63        let lower_bit_width = if n <= 1 {
64            0
65        } else {
66            let ratio = (local_universe as u64).max(1) / n.max(1);
67            if ratio <= 1 {
68                0
69            } else {
70                (64 - ratio.leading_zeros() - 1) as u8
71            }
72        };
73
74        // Allocate arrays
75        let lower_bits_total = (n as usize) * (lower_bit_width as usize);
76        let lower_words = lower_bits_total.div_ceil(64);
77        let mut lower_bits = vec![0u64; lower_words];
78
79        let max_relative = local_universe.saturating_sub(1) as u64;
80        let upper_bound = n + (max_relative >> lower_bit_width) + 1;
81        let upper_words = (upper_bound as usize).div_ceil(64);
82        let mut upper_bits = vec![0u64; upper_words];
83
84        let lower_mask = if lower_bit_width == 0 {
85            0
86        } else {
87            (1u64 << lower_bit_width) - 1
88        };
89
90        // Encode each value relative to first_value
91        for (i, &val) in values.iter().enumerate() {
92            let relative_val = (val - first_value) as u64;
93
94            // Store lower bits
95            if lower_bit_width > 0 {
96                let lower = relative_val & lower_mask;
97                let bit_pos = i * (lower_bit_width as usize);
98                let word_idx = bit_pos / 64;
99                let bit_offset = bit_pos % 64;
100
101                lower_bits[word_idx] |= lower << bit_offset;
102                if bit_offset + (lower_bit_width as usize) > 64 && word_idx + 1 < lower_bits.len() {
103                    lower_bits[word_idx + 1] |= lower >> (64 - bit_offset);
104                }
105            }
106
107            // Store upper bits
108            let upper = relative_val >> lower_bit_width;
109            let upper_pos = (i as u64) + upper;
110            let word_idx = (upper_pos / 64) as usize;
111            let bit_offset = upper_pos % 64;
112            if word_idx < upper_bits.len() {
113                upper_bits[word_idx] |= 1u64 << bit_offset;
114            }
115        }
116
117        Self {
118            lower_bits,
119            upper_bits,
120            len: values.len() as u32,
121            first_value,
122            last_value,
123            lower_bit_width,
124        }
125    }
126
127    /// Get element at position i (0-indexed)
128    #[inline]
129    pub fn get(&self, i: u32) -> Option<u32> {
130        if i >= self.len {
131            return None;
132        }
133
134        let i = i as usize;
135
136        // Get lower bits
137        let lower = if self.lower_bit_width == 0 {
138            0u64
139        } else {
140            let bit_pos = i * (self.lower_bit_width as usize);
141            let word_idx = bit_pos / 64;
142            let bit_offset = bit_pos % 64;
143            let lower_mask = (1u64 << self.lower_bit_width) - 1;
144
145            let mut val = (self.lower_bits[word_idx] >> bit_offset) & lower_mask;
146            if bit_offset + (self.lower_bit_width as usize) > 64
147                && word_idx + 1 < self.lower_bits.len()
148            {
149                val |= (self.lower_bits[word_idx + 1] << (64 - bit_offset)) & lower_mask;
150            }
151            val
152        };
153
154        // Get upper bits via select1(i) - i
155        let select_pos = self.select1(i as u32)?;
156        let upper = (select_pos as u64) - (i as u64);
157
158        // Reconstruct absolute value
159        let relative_val = (upper << self.lower_bit_width) | lower;
160        Some(self.first_value + relative_val as u32)
161    }
162
163    /// Find position of i-th set bit (optimized with NEON on aarch64)
164    fn select1(&self, i: u32) -> Option<u32> {
165        if i >= self.len {
166            return None;
167        }
168
169        let mut remaining = i + 1;
170        let mut pos = 0u32;
171
172        // Process 4 words at a time on aarch64 using NEON popcount
173        #[cfg(target_arch = "aarch64")]
174        {
175            use std::arch::aarch64::*;
176
177            let chunks = self.upper_bits.chunks_exact(4);
178            let remainder = chunks.remainder();
179
180            for chunk in chunks {
181                // Load 4 u64 words (32 bytes)
182                let words: [u64; 4] = [chunk[0], chunk[1], chunk[2], chunk[3]];
183
184                // Count bits in each word using NEON
185                // vcntq_u8 counts bits per byte, then sum horizontally
186                unsafe {
187                    let bytes = std::mem::transmute::<[u64; 4], [u8; 32]>(words);
188
189                    // Process first 16 bytes (words 0-1)
190                    let v0 = vld1q_u8(bytes.as_ptr());
191                    let cnt0 = vcntq_u8(v0);
192                    let sum0 = vaddlvq_u8(cnt0) as u32;
193
194                    // Process next 16 bytes (words 2-3)
195                    let v1 = vld1q_u8(bytes.as_ptr().add(16));
196                    let cnt1 = vcntq_u8(v1);
197                    let sum1 = vaddlvq_u8(cnt1) as u32;
198
199                    let total_popcount = sum0 + sum1;
200
201                    if total_popcount >= remaining {
202                        // Found the chunk, now find exact word
203                        for &word in chunk {
204                            let popcount = word.count_ones();
205                            if popcount >= remaining {
206                                let mut w = word;
207                                for _ in 0..remaining - 1 {
208                                    w &= w - 1;
209                                }
210                                return Some(pos + w.trailing_zeros());
211                            }
212                            remaining -= popcount;
213                            pos += 64;
214                        }
215                    }
216                    remaining -= total_popcount;
217                    pos += 256; // 4 words * 64 bits
218                }
219            }
220
221            // Handle remaining words
222            for &word in remainder {
223                let popcount = word.count_ones();
224                if popcount >= remaining {
225                    let mut w = word;
226                    for _ in 0..remaining - 1 {
227                        w &= w - 1;
228                    }
229                    return Some(pos + w.trailing_zeros());
230                }
231                remaining -= popcount;
232                pos += 64;
233            }
234        }
235
236        // Process 4 words at a time on x86_64 using POPCNT instruction
237        #[cfg(target_arch = "x86_64")]
238        {
239            #[cfg(target_feature = "popcnt")]
240            {
241                use std::arch::x86_64::*;
242
243                let chunks = self.upper_bits.chunks_exact(4);
244                let remainder = chunks.remainder();
245
246                for chunk in chunks {
247                    // Use hardware POPCNT for each word
248                    unsafe {
249                        let p0 = _popcnt64(chunk[0] as i64) as u32;
250                        let p1 = _popcnt64(chunk[1] as i64) as u32;
251                        let p2 = _popcnt64(chunk[2] as i64) as u32;
252                        let p3 = _popcnt64(chunk[3] as i64) as u32;
253
254                        let total_popcount = p0 + p1 + p2 + p3;
255
256                        if total_popcount >= remaining {
257                            // Found the chunk, now find exact word
258                            for &word in chunk {
259                                let popcount = word.count_ones();
260                                if popcount >= remaining {
261                                    let mut w = word;
262                                    for _ in 0..remaining - 1 {
263                                        w &= w - 1;
264                                    }
265                                    return Some(pos + w.trailing_zeros());
266                                }
267                                remaining -= popcount;
268                                pos += 64;
269                            }
270                        }
271                        remaining -= total_popcount;
272                        pos += 256; // 4 words * 64 bits
273                    }
274                }
275
276                // Handle remaining words
277                for &word in remainder {
278                    let popcount = word.count_ones();
279                    if popcount >= remaining {
280                        let mut w = word;
281                        for _ in 0..remaining - 1 {
282                            w &= w - 1;
283                        }
284                        return Some(pos + w.trailing_zeros());
285                    }
286                    remaining -= popcount;
287                    pos += 64;
288                }
289            }
290
291            // Scalar fallback when POPCNT not available
292            #[cfg(not(target_feature = "popcnt"))]
293            {
294                for &word in &self.upper_bits {
295                    let popcount = word.count_ones();
296                    if popcount >= remaining {
297                        let mut w = word;
298                        for _ in 0..remaining - 1 {
299                            w &= w - 1;
300                        }
301                        return Some(pos + w.trailing_zeros());
302                    }
303                    remaining -= popcount;
304                    pos += 64;
305                }
306            }
307        }
308
309        // Scalar fallback for other architectures
310        #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
311        {
312            for &word in &self.upper_bits {
313                let popcount = word.count_ones();
314                if popcount >= remaining {
315                    let mut w = word;
316                    for _ in 0..remaining - 1 {
317                        w &= w - 1;
318                    }
319                    return Some(pos + w.trailing_zeros());
320                }
321                remaining -= popcount;
322                pos += 64;
323            }
324        }
325
326        None
327    }
328
329    /// Find first element >= target within this partition
330    /// Returns (local_position, value) or None
331    pub fn next_geq(&self, target: u32) -> Option<(u32, u32)> {
332        if self.len == 0 || target > self.last_value {
333            return None;
334        }
335
336        if target <= self.first_value {
337            return Some((0, self.first_value));
338        }
339
340        // Binary search for target
341        let mut lo = 0u32;
342        let mut hi = self.len;
343
344        while lo < hi {
345            let mid = lo + (hi - lo) / 2;
346            if let Some(val) = self.get(mid) {
347                if val < target {
348                    lo = mid + 1;
349                } else {
350                    hi = mid;
351                }
352            } else {
353                break;
354            }
355        }
356
357        if lo < self.len {
358            self.get(lo).map(|v| (lo, v))
359        } else {
360            None
361        }
362    }
363
364    /// Serialize partition
365    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
366        writer.write_u32::<LittleEndian>(self.len)?;
367        writer.write_u32::<LittleEndian>(self.first_value)?;
368        writer.write_u32::<LittleEndian>(self.last_value)?;
369        // local_universe removed - computed as last_value - first_value + 1
370        writer.write_u8(self.lower_bit_width)?;
371
372        writer.write_u32::<LittleEndian>(self.lower_bits.len() as u32)?;
373        for &word in &self.lower_bits {
374            writer.write_u64::<LittleEndian>(word)?;
375        }
376
377        writer.write_u32::<LittleEndian>(self.upper_bits.len() as u32)?;
378        for &word in &self.upper_bits {
379            writer.write_u64::<LittleEndian>(word)?;
380        }
381
382        Ok(())
383    }
384
385    /// Deserialize partition
386    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
387        let len = reader.read_u32::<LittleEndian>()?;
388        let first_value = reader.read_u32::<LittleEndian>()?;
389        let last_value = reader.read_u32::<LittleEndian>()?;
390        // local_universe removed - computed as last_value - first_value + 1
391        let lower_bit_width = reader.read_u8()?;
392
393        let lower_len = reader.read_u32::<LittleEndian>()? as usize;
394        let mut lower_bits = Vec::with_capacity(lower_len);
395        for _ in 0..lower_len {
396            lower_bits.push(reader.read_u64::<LittleEndian>()?);
397        }
398
399        let upper_len = reader.read_u32::<LittleEndian>()? as usize;
400        let mut upper_bits = Vec::with_capacity(upper_len);
401        for _ in 0..upper_len {
402            upper_bits.push(reader.read_u64::<LittleEndian>()?);
403        }
404
405        Ok(Self {
406            lower_bits,
407            upper_bits,
408            len,
409            first_value,
410            last_value,
411            lower_bit_width,
412        })
413    }
414
415    /// Size in bytes (13 bytes header + arrays)
416    pub fn size_bytes(&self) -> usize {
417        13 + self.lower_bits.len() * 8 + self.upper_bits.len() * 8
418    }
419}
420
421/// Estimate encoding cost for a partition
422fn estimate_partition_cost(values: &[u32]) -> usize {
423    if values.is_empty() {
424        return 0;
425    }
426
427    let n = values.len();
428    let first = values[0];
429    let last = *values.last().unwrap();
430    let local_universe = (last - first + 1) as usize;
431
432    // Lower bits: n * log2(universe/n)
433    let lower_bits = if n <= 1 || local_universe <= n {
434        0
435    } else {
436        let ratio = local_universe / n;
437        let l = (usize::BITS - ratio.leading_zeros()) as usize;
438        n * l
439    };
440
441    // Upper bits: approximately 2n bits
442    let upper_bits = 2 * n;
443
444    // Overhead: partition header (13 bytes after removing local_universe)
445    let overhead = 13 * 8; // 13 bytes header
446
447    (lower_bits + upper_bits + overhead).div_ceil(8)
448}
449
450/// Find optimal partition boundaries using dynamic programming
451/// Returns partition endpoints (exclusive)
452fn find_optimal_partitions(values: &[u32]) -> Vec<usize> {
453    let n = values.len();
454    if n <= MIN_PARTITION_SIZE {
455        return vec![n];
456    }
457
458    // dp[i] = minimum cost to encode values[0..i]
459    // parent[i] = start of last partition ending at i
460    let mut dp = vec![usize::MAX; n + 1];
461    let mut parent = vec![0usize; n + 1];
462    dp[0] = 0;
463
464    for i in MIN_PARTITION_SIZE..=n {
465        // Try all valid partition sizes ending at i
466        let min_start = i.saturating_sub(MAX_PARTITION_SIZE);
467        let max_start = i.saturating_sub(MIN_PARTITION_SIZE);
468
469        for start in min_start..=max_start {
470            if dp[start] == usize::MAX {
471                continue;
472            }
473
474            let partition_cost = estimate_partition_cost(&values[start..i]);
475            let total_cost = dp[start].saturating_add(partition_cost);
476
477            if total_cost < dp[i] {
478                dp[i] = total_cost;
479                parent[i] = start;
480            }
481        }
482    }
483
484    // Handle case where n is not reachable (too small for partitioning)
485    if dp[n] == usize::MAX {
486        return vec![n];
487    }
488
489    // Reconstruct partition boundaries
490    let mut boundaries = Vec::new();
491    let mut pos = n;
492    while pos > 0 {
493        boundaries.push(pos);
494        pos = parent[pos];
495    }
496    boundaries.reverse();
497
498    boundaries
499}
500
501/// Partitioned Elias-Fano encoded sequence
502#[derive(Debug, Clone)]
503pub struct PartitionedEliasFano {
504    /// Individual partitions
505    partitions: Vec<EFPartition>,
506    /// Total number of elements
507    len: u32,
508    /// Cumulative element counts for each partition (for position lookup)
509    cumulative_counts: Vec<u32>,
510}
511
512impl PartitionedEliasFano {
513    /// Create from sorted slice with optimal partitioning
514    pub fn from_sorted_slice(values: &[u32]) -> Self {
515        if values.is_empty() {
516            return Self {
517                partitions: Vec::new(),
518                len: 0,
519                cumulative_counts: Vec::new(),
520            };
521        }
522
523        let boundaries = find_optimal_partitions(values);
524        let mut partitions = Vec::with_capacity(boundaries.len());
525        let mut cumulative_counts = Vec::with_capacity(boundaries.len());
526
527        let mut start = 0;
528        let mut cumulative = 0u32;
529
530        for &end in &boundaries {
531            let partition = EFPartition::from_sorted_slice(&values[start..end]);
532            cumulative += partition.len;
533            cumulative_counts.push(cumulative);
534            partitions.push(partition);
535            start = end;
536        }
537
538        Self {
539            partitions,
540            len: values.len() as u32,
541            cumulative_counts,
542        }
543    }
544
545    /// Number of elements
546    #[inline]
547    pub fn len(&self) -> u32 {
548        self.len
549    }
550
551    /// Check if empty
552    #[inline]
553    pub fn is_empty(&self) -> bool {
554        self.len == 0
555    }
556
557    /// Number of partitions
558    pub fn num_partitions(&self) -> usize {
559        self.partitions.len()
560    }
561
562    /// Get element at global position
563    pub fn get(&self, pos: u32) -> Option<u32> {
564        if pos >= self.len {
565            return None;
566        }
567
568        // Find partition containing this position
569        let partition_idx = self
570            .cumulative_counts
571            .binary_search(&(pos + 1))
572            .unwrap_or_else(|x| x);
573
574        if partition_idx >= self.partitions.len() {
575            return None;
576        }
577
578        let local_pos = if partition_idx == 0 {
579            pos
580        } else {
581            pos - self.cumulative_counts[partition_idx - 1]
582        };
583
584        self.partitions[partition_idx].get(local_pos)
585    }
586
587    /// Find first element >= target
588    /// Returns (global_position, value) or None
589    pub fn next_geq(&self, target: u32) -> Option<(u32, u32)> {
590        if self.partitions.is_empty() {
591            return None;
592        }
593
594        // Binary search for partition containing target
595        let partition_idx = self
596            .partitions
597            .binary_search_by(|p| {
598                if p.last_value < target {
599                    std::cmp::Ordering::Less
600                } else if p.first_value > target {
601                    std::cmp::Ordering::Greater
602                } else {
603                    std::cmp::Ordering::Equal
604                }
605            })
606            .unwrap_or_else(|x| x);
607
608        // Search from this partition onwards
609        for (i, partition) in self.partitions[partition_idx..].iter().enumerate() {
610            let actual_idx = partition_idx + i;
611
612            if let Some((local_pos, val)) = partition.next_geq(target) {
613                let global_pos = if actual_idx == 0 {
614                    local_pos
615                } else {
616                    self.cumulative_counts[actual_idx - 1] + local_pos
617                };
618                return Some((global_pos, val));
619            }
620        }
621
622        None
623    }
624
625    /// Serialize
626    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
627        writer.write_u32::<LittleEndian>(self.len)?;
628        writer.write_u32::<LittleEndian>(self.partitions.len() as u32)?;
629
630        for &count in &self.cumulative_counts {
631            writer.write_u32::<LittleEndian>(count)?;
632        }
633
634        for partition in &self.partitions {
635            partition.serialize(writer)?;
636        }
637
638        Ok(())
639    }
640
641    /// Deserialize
642    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
643        let len = reader.read_u32::<LittleEndian>()?;
644        let num_partitions = reader.read_u32::<LittleEndian>()? as usize;
645
646        let mut cumulative_counts = Vec::with_capacity(num_partitions);
647        for _ in 0..num_partitions {
648            cumulative_counts.push(reader.read_u32::<LittleEndian>()?);
649        }
650
651        let mut partitions = Vec::with_capacity(num_partitions);
652        for _ in 0..num_partitions {
653            partitions.push(EFPartition::deserialize(reader)?);
654        }
655
656        Ok(Self {
657            partitions,
658            len,
659            cumulative_counts,
660        })
661    }
662
663    /// Total size in bytes
664    pub fn size_bytes(&self) -> usize {
665        let mut size = 8 + self.cumulative_counts.len() * 4;
666        for p in &self.partitions {
667            size += p.size_bytes();
668        }
669        size
670    }
671
672    /// Create iterator
673    pub fn iter(&self) -> PartitionedEFIterator<'_> {
674        PartitionedEFIterator {
675            pef: self,
676            partition_idx: 0,
677            local_pos: 0,
678        }
679    }
680}
681
682/// Iterator over Partitioned Elias-Fano
683pub struct PartitionedEFIterator<'a> {
684    pef: &'a PartitionedEliasFano,
685    partition_idx: usize,
686    local_pos: u32,
687}
688
689impl<'a> Iterator for PartitionedEFIterator<'a> {
690    type Item = u32;
691
692    fn next(&mut self) -> Option<Self::Item> {
693        if self.partition_idx >= self.pef.partitions.len() {
694            return None;
695        }
696
697        let partition = &self.pef.partitions[self.partition_idx];
698        if let Some(val) = partition.get(self.local_pos) {
699            self.local_pos += 1;
700            if self.local_pos >= partition.len {
701                self.partition_idx += 1;
702                self.local_pos = 0;
703            }
704            Some(val)
705        } else {
706            None
707        }
708    }
709
710    fn size_hint(&self) -> (usize, Option<usize>) {
711        let current_global = if self.partition_idx == 0 {
712            self.local_pos
713        } else if self.partition_idx < self.pef.cumulative_counts.len() {
714            self.pef.cumulative_counts[self.partition_idx - 1] + self.local_pos
715        } else {
716            self.pef.len
717        };
718        let remaining = (self.pef.len - current_global) as usize;
719        (remaining, Some(remaining))
720    }
721}
722
723/// Block metadata for BlockMax WAND
724#[derive(Debug, Clone)]
725pub struct PEFBlockInfo {
726    /// First document ID in block
727    pub first_doc_id: u32,
728    /// Last document ID in block
729    pub last_doc_id: u32,
730    /// Maximum term frequency in block
731    pub max_tf: u32,
732    /// Maximum BM25 score upper bound
733    pub max_block_score: f32,
734    /// Starting partition index
735    pub partition_idx: u16,
736    /// Starting position within partition
737    pub local_start: u32,
738    /// Number of documents in block
739    pub num_docs: u16,
740}
741
742impl PEFBlockInfo {
743    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
744        writer.write_u32::<LittleEndian>(self.first_doc_id)?;
745        writer.write_u32::<LittleEndian>(self.last_doc_id)?;
746        writer.write_u32::<LittleEndian>(self.max_tf)?;
747        writer.write_f32::<LittleEndian>(self.max_block_score)?;
748        writer.write_u16::<LittleEndian>(self.partition_idx)?;
749        writer.write_u32::<LittleEndian>(self.local_start)?;
750        writer.write_u16::<LittleEndian>(self.num_docs)?;
751        Ok(())
752    }
753
754    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
755        Ok(Self {
756            first_doc_id: reader.read_u32::<LittleEndian>()?,
757            last_doc_id: reader.read_u32::<LittleEndian>()?,
758            max_tf: reader.read_u32::<LittleEndian>()?,
759            max_block_score: reader.read_f32::<LittleEndian>()?,
760            partition_idx: reader.read_u16::<LittleEndian>()?,
761            local_start: reader.read_u32::<LittleEndian>()?,
762            num_docs: reader.read_u16::<LittleEndian>()?,
763        })
764    }
765}
766
767/// Block size for BlockMax (matches other formats)
768pub const PEF_BLOCK_SIZE: usize = 128;
769
770/// Partitioned Elias-Fano posting list with term frequencies and BlockMax
771#[derive(Debug, Clone)]
772pub struct PartitionedEFPostingList {
773    /// Document IDs (Partitioned Elias-Fano encoded)
774    pub doc_ids: PartitionedEliasFano,
775    /// Term frequencies (packed)
776    pub term_freqs: Vec<u8>,
777    /// Bits per term frequency
778    pub tf_bits: u8,
779    /// Maximum term frequency
780    pub max_tf: u32,
781    /// Block metadata for BlockMax WAND
782    pub blocks: Vec<PEFBlockInfo>,
783    /// Global maximum score
784    pub max_score: f32,
785}
786
787impl PartitionedEFPostingList {
788    const K1: f32 = 1.2;
789    const B: f32 = 0.75;
790
791    #[inline]
792    pub fn compute_bm25_upper_bound(max_tf: u32, idf: f32) -> f32 {
793        let tf = max_tf as f32;
794        let min_length_norm = 1.0 - Self::B;
795        let tf_norm = (tf * (Self::K1 + 1.0)) / (tf + Self::K1 * min_length_norm);
796        idf * tf_norm
797    }
798
799    /// Create from postings
800    pub fn from_postings(doc_ids: &[u32], term_freqs: &[u32]) -> Self {
801        Self::from_postings_with_idf(doc_ids, term_freqs, 1.0)
802    }
803
804    /// Create from postings with IDF for block-max scores
805    pub fn from_postings_with_idf(doc_ids: &[u32], term_freqs: &[u32], idf: f32) -> Self {
806        assert_eq!(doc_ids.len(), term_freqs.len());
807
808        if doc_ids.is_empty() {
809            return Self {
810                doc_ids: PartitionedEliasFano::from_sorted_slice(&[]),
811                term_freqs: Vec::new(),
812                tf_bits: 0,
813                max_tf: 0,
814                blocks: Vec::new(),
815                max_score: 0.0,
816            };
817        }
818
819        let pef_doc_ids = PartitionedEliasFano::from_sorted_slice(doc_ids);
820
821        // Find max TF
822        let max_tf = *term_freqs.iter().max().unwrap();
823        let tf_bits = if max_tf == 0 {
824            0
825        } else {
826            (32 - max_tf.leading_zeros()) as u8
827        };
828
829        // Pack term frequencies
830        let total_bits = doc_ids.len() * (tf_bits as usize);
831        let total_bytes = total_bits.div_ceil(8);
832        let mut packed_tfs = vec![0u8; total_bytes];
833
834        if tf_bits > 0 {
835            for (i, &tf) in term_freqs.iter().enumerate() {
836                let bit_pos = i * (tf_bits as usize);
837                let byte_idx = bit_pos / 8;
838                let bit_offset = bit_pos % 8;
839
840                let val = tf.saturating_sub(1);
841
842                packed_tfs[byte_idx] |= (val as u8) << bit_offset;
843                if bit_offset + (tf_bits as usize) > 8 && byte_idx + 1 < packed_tfs.len() {
844                    packed_tfs[byte_idx + 1] |= (val >> (8 - bit_offset)) as u8;
845                }
846                if bit_offset + (tf_bits as usize) > 16 && byte_idx + 2 < packed_tfs.len() {
847                    packed_tfs[byte_idx + 2] |= (val >> (16 - bit_offset)) as u8;
848                }
849            }
850        }
851
852        // Build block metadata
853        let mut blocks = Vec::new();
854        let mut max_score = 0.0f32;
855        let mut i = 0;
856
857        while i < doc_ids.len() {
858            let block_end = (i + PEF_BLOCK_SIZE).min(doc_ids.len());
859            let block_doc_ids = &doc_ids[i..block_end];
860            let block_tfs = &term_freqs[i..block_end];
861
862            let block_max_tf = *block_tfs.iter().max().unwrap_or(&1);
863            let block_score = Self::compute_bm25_upper_bound(block_max_tf, idf);
864            max_score = max_score.max(block_score);
865
866            // Find partition info for this block start
867            let (partition_idx, local_start) =
868                Self::find_partition_position(&pef_doc_ids, i as u32);
869
870            blocks.push(PEFBlockInfo {
871                first_doc_id: block_doc_ids[0],
872                last_doc_id: *block_doc_ids.last().unwrap(),
873                max_tf: block_max_tf,
874                max_block_score: block_score,
875                partition_idx: partition_idx as u16,
876                local_start,
877                num_docs: (block_end - i) as u16,
878            });
879
880            i = block_end;
881        }
882
883        Self {
884            doc_ids: pef_doc_ids,
885            term_freqs: packed_tfs,
886            tf_bits,
887            max_tf,
888            blocks,
889            max_score,
890        }
891    }
892
893    fn find_partition_position(pef: &PartitionedEliasFano, global_pos: u32) -> (usize, u32) {
894        let partition_idx = pef
895            .cumulative_counts
896            .binary_search(&(global_pos + 1))
897            .unwrap_or_else(|x| x);
898
899        let local_pos = if partition_idx == 0 {
900            global_pos
901        } else {
902            global_pos - pef.cumulative_counts[partition_idx - 1]
903        };
904
905        (partition_idx, local_pos)
906    }
907
908    /// Get term frequency at position
909    pub fn get_tf(&self, pos: u32) -> u32 {
910        if self.tf_bits == 0 || pos >= self.doc_ids.len() {
911            return 1;
912        }
913
914        let bit_pos = (pos as usize) * (self.tf_bits as usize);
915        let byte_idx = bit_pos / 8;
916        let bit_offset = bit_pos % 8;
917        let mask = (1u32 << self.tf_bits) - 1;
918
919        let mut val = (self.term_freqs[byte_idx] >> bit_offset) as u32;
920        if bit_offset + (self.tf_bits as usize) > 8 && byte_idx + 1 < self.term_freqs.len() {
921            val |= (self.term_freqs[byte_idx + 1] as u32) << (8 - bit_offset);
922        }
923        if bit_offset + (self.tf_bits as usize) > 16 && byte_idx + 2 < self.term_freqs.len() {
924            val |= (self.term_freqs[byte_idx + 2] as u32) << (16 - bit_offset);
925        }
926
927        (val & mask) + 1
928    }
929
930    /// Number of documents
931    pub fn len(&self) -> u32 {
932        self.doc_ids.len()
933    }
934
935    /// Check if empty
936    pub fn is_empty(&self) -> bool {
937        self.doc_ids.is_empty()
938    }
939
940    /// Serialize
941    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
942        self.doc_ids.serialize(writer)?;
943        writer.write_u8(self.tf_bits)?;
944        writer.write_u32::<LittleEndian>(self.max_tf)?;
945        writer.write_f32::<LittleEndian>(self.max_score)?;
946        writer.write_u32::<LittleEndian>(self.term_freqs.len() as u32)?;
947        writer.write_all(&self.term_freqs)?;
948
949        writer.write_u32::<LittleEndian>(self.blocks.len() as u32)?;
950        for block in &self.blocks {
951            block.serialize(writer)?;
952        }
953
954        Ok(())
955    }
956
957    /// Deserialize
958    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
959        let doc_ids = PartitionedEliasFano::deserialize(reader)?;
960        let tf_bits = reader.read_u8()?;
961        let max_tf = reader.read_u32::<LittleEndian>()?;
962        let max_score = reader.read_f32::<LittleEndian>()?;
963        let tf_len = reader.read_u32::<LittleEndian>()? as usize;
964        let mut term_freqs = vec![0u8; tf_len];
965        reader.read_exact(&mut term_freqs)?;
966
967        let num_blocks = reader.read_u32::<LittleEndian>()? as usize;
968        let mut blocks = Vec::with_capacity(num_blocks);
969        for _ in 0..num_blocks {
970            blocks.push(PEFBlockInfo::deserialize(reader)?);
971        }
972
973        Ok(Self {
974            doc_ids,
975            term_freqs,
976            tf_bits,
977            max_tf,
978            blocks,
979            max_score,
980        })
981    }
982
983    /// Create iterator
984    pub fn iterator(&self) -> PartitionedEFPostingIterator<'_> {
985        PartitionedEFPostingIterator {
986            list: self,
987            pos: 0,
988            current_block: 0,
989        }
990    }
991
992    /// Get compression ratio compared to standard EF
993    pub fn compression_info(&self) -> (usize, usize) {
994        let pef_size = self.doc_ids.size_bytes();
995        // Estimate standard EF size
996        let n = self.len() as usize;
997        let max_val = if let Some(last_block) = self.blocks.last() {
998            last_block.last_doc_id
999        } else {
1000            0
1001        };
1002        let ef_size = if n > 0 {
1003            let l = if n <= 1 {
1004                0
1005            } else {
1006                let ratio = (max_val as usize + 1) / n;
1007                if ratio <= 1 {
1008                    0
1009                } else {
1010                    (usize::BITS - ratio.leading_zeros()) as usize
1011                }
1012            };
1013            (n * l + 2 * n).div_ceil(8) + 16
1014        } else {
1015            0
1016        };
1017        (pef_size, ef_size)
1018    }
1019}
1020
1021/// Iterator over Partitioned EF posting list
1022pub struct PartitionedEFPostingIterator<'a> {
1023    list: &'a PartitionedEFPostingList,
1024    pos: u32,
1025    current_block: usize,
1026}
1027
1028impl<'a> PartitionedEFPostingIterator<'a> {
1029    /// Current document ID
1030    pub fn doc(&self) -> u32 {
1031        self.list.doc_ids.get(self.pos).unwrap_or(u32::MAX)
1032    }
1033
1034    /// Current term frequency
1035    pub fn term_freq(&self) -> u32 {
1036        self.list.get_tf(self.pos)
1037    }
1038
1039    /// Advance to next document
1040    pub fn advance(&mut self) -> u32 {
1041        self.pos += 1;
1042        if !self.list.blocks.is_empty() {
1043            let new_block = (self.pos as usize) / PEF_BLOCK_SIZE;
1044            if new_block < self.list.blocks.len() {
1045                self.current_block = new_block;
1046            }
1047        }
1048        self.doc()
1049    }
1050
1051    /// Seek to first doc >= target
1052    pub fn seek(&mut self, target: u32) -> u32 {
1053        // Use block skip list
1054        if !self.list.blocks.is_empty() {
1055            let block_idx = self.list.blocks[self.current_block..].binary_search_by(|b| {
1056                if b.last_doc_id < target {
1057                    std::cmp::Ordering::Less
1058                } else if b.first_doc_id > target {
1059                    std::cmp::Ordering::Greater
1060                } else {
1061                    std::cmp::Ordering::Equal
1062                }
1063            });
1064
1065            let target_block = match block_idx {
1066                Ok(idx) => self.current_block + idx,
1067                Err(idx) => {
1068                    let abs_idx = self.current_block + idx;
1069                    if abs_idx >= self.list.blocks.len() {
1070                        self.pos = self.list.len();
1071                        return u32::MAX;
1072                    }
1073                    abs_idx
1074                }
1075            };
1076
1077            if target_block > self.current_block {
1078                self.current_block = target_block;
1079                self.pos = (target_block * PEF_BLOCK_SIZE) as u32;
1080            }
1081        }
1082
1083        // Use PEF's next_geq
1084        if let Some((pos, val)) = self.list.doc_ids.next_geq(target)
1085            && pos >= self.pos
1086        {
1087            self.pos = pos;
1088            if !self.list.blocks.is_empty() {
1089                self.current_block = (pos as usize) / PEF_BLOCK_SIZE;
1090            }
1091            return val;
1092        }
1093
1094        // Linear scan fallback
1095        while self.pos < self.list.len() {
1096            let doc = self.doc();
1097            if doc >= target {
1098                return doc;
1099            }
1100            self.pos += 1;
1101        }
1102
1103        u32::MAX
1104    }
1105
1106    /// Check if exhausted
1107    pub fn is_exhausted(&self) -> bool {
1108        self.pos >= self.list.len()
1109    }
1110
1111    /// Current block's max score
1112    pub fn current_block_max_score(&self) -> f32 {
1113        if self.is_exhausted() || self.list.blocks.is_empty() {
1114            return 0.0;
1115        }
1116        if self.current_block < self.list.blocks.len() {
1117            self.list.blocks[self.current_block].max_block_score
1118        } else {
1119            0.0
1120        }
1121    }
1122
1123    /// Current block's max TF
1124    pub fn current_block_max_tf(&self) -> u32 {
1125        if self.is_exhausted() || self.list.blocks.is_empty() {
1126            return 0;
1127        }
1128        if self.current_block < self.list.blocks.len() {
1129            self.list.blocks[self.current_block].max_tf
1130        } else {
1131            0
1132        }
1133    }
1134
1135    /// Max score for remaining blocks
1136    pub fn max_remaining_score(&self) -> f32 {
1137        if self.is_exhausted() || self.list.blocks.is_empty() {
1138            return 0.0;
1139        }
1140        self.list.blocks[self.current_block..]
1141            .iter()
1142            .map(|b| b.max_block_score)
1143            .fold(0.0f32, |a, b| a.max(b))
1144    }
1145
1146    /// Skip to next block containing doc >= target (for BlockWAND)
1147    /// Returns (first_doc_in_block, block_max_score) or None if exhausted
1148    pub fn skip_to_block_with_doc(&mut self, target: u32) -> Option<(u32, f32)> {
1149        if self.list.blocks.is_empty() {
1150            return None;
1151        }
1152
1153        while self.current_block < self.list.blocks.len() {
1154            let block = &self.list.blocks[self.current_block];
1155            if block.last_doc_id >= target {
1156                self.pos = (self.current_block * PEF_BLOCK_SIZE) as u32;
1157                return Some((block.first_doc_id, block.max_block_score));
1158            }
1159            self.current_block += 1;
1160        }
1161
1162        self.pos = self.list.len();
1163        None
1164    }
1165}
1166
1167#[cfg(test)]
1168mod tests {
1169    use super::*;
1170
1171    #[test]
1172    fn test_ef_partition_basic() {
1173        let values = vec![10, 20, 30, 40, 50];
1174        let partition = EFPartition::from_sorted_slice(&values);
1175
1176        assert_eq!(partition.len, 5);
1177        assert_eq!(partition.first_value, 10);
1178        assert_eq!(partition.last_value, 50);
1179
1180        for (i, &expected) in values.iter().enumerate() {
1181            assert_eq!(partition.get(i as u32), Some(expected));
1182        }
1183    }
1184
1185    #[test]
1186    fn test_ef_partition_next_geq() {
1187        let values = vec![10, 20, 30, 100, 200, 300];
1188        let partition = EFPartition::from_sorted_slice(&values);
1189
1190        assert_eq!(partition.next_geq(5), Some((0, 10)));
1191        assert_eq!(partition.next_geq(10), Some((0, 10)));
1192        assert_eq!(partition.next_geq(15), Some((1, 20)));
1193        assert_eq!(partition.next_geq(100), Some((3, 100)));
1194        assert_eq!(partition.next_geq(301), None);
1195    }
1196
1197    #[test]
1198    fn test_partitioned_ef_basic() {
1199        let values: Vec<u32> = (0..500).map(|i| i * 2).collect();
1200        let pef = PartitionedEliasFano::from_sorted_slice(&values);
1201
1202        assert_eq!(pef.len(), 500);
1203        assert!(pef.num_partitions() >= 1);
1204
1205        for (i, &expected) in values.iter().enumerate() {
1206            assert_eq!(pef.get(i as u32), Some(expected), "Mismatch at {}", i);
1207        }
1208    }
1209
1210    #[test]
1211    fn test_partitioned_ef_next_geq() {
1212        let values: Vec<u32> = (0..1000).map(|i| i * 3).collect();
1213        let pef = PartitionedEliasFano::from_sorted_slice(&values);
1214
1215        assert_eq!(pef.next_geq(0), Some((0, 0)));
1216        assert_eq!(pef.next_geq(100), Some((34, 102))); // 100/3 = 33.33, next is 34*3=102
1217        assert_eq!(pef.next_geq(1500), Some((500, 1500)));
1218        assert_eq!(pef.next_geq(3000), None);
1219    }
1220
1221    #[test]
1222    fn test_partitioned_ef_serialization() {
1223        let values: Vec<u32> = (0..500).map(|i| i * 5).collect();
1224        let pef = PartitionedEliasFano::from_sorted_slice(&values);
1225
1226        let mut buffer = Vec::new();
1227        pef.serialize(&mut buffer).unwrap();
1228
1229        let restored = PartitionedEliasFano::deserialize(&mut &buffer[..]).unwrap();
1230
1231        assert_eq!(restored.len(), pef.len());
1232        assert_eq!(restored.num_partitions(), pef.num_partitions());
1233
1234        for i in 0..pef.len() {
1235            assert_eq!(restored.get(i), pef.get(i));
1236        }
1237    }
1238
1239    #[test]
1240    fn test_partitioned_ef_posting_list() {
1241        let doc_ids: Vec<u32> = (0..300).map(|i| i * 2).collect();
1242        let term_freqs: Vec<u32> = (0..300).map(|i| (i % 10) + 1).collect();
1243
1244        let list = PartitionedEFPostingList::from_postings(&doc_ids, &term_freqs);
1245
1246        assert_eq!(list.len(), 300);
1247
1248        let mut iter = list.iterator();
1249        for (i, (&expected_doc, &expected_tf)) in doc_ids.iter().zip(term_freqs.iter()).enumerate()
1250        {
1251            assert_eq!(iter.doc(), expected_doc, "Doc mismatch at {}", i);
1252            assert_eq!(iter.term_freq(), expected_tf, "TF mismatch at {}", i);
1253            iter.advance();
1254        }
1255    }
1256
1257    #[test]
1258    fn test_partitioned_ef_seek() {
1259        let doc_ids: Vec<u32> = (0..500).map(|i| i * 3).collect();
1260        let term_freqs: Vec<u32> = vec![1; 500];
1261
1262        let list = PartitionedEFPostingList::from_postings(&doc_ids, &term_freqs);
1263        let mut iter = list.iterator();
1264
1265        assert_eq!(iter.seek(100), 102); // 100/3 = 33.33, next is 34*3=102
1266        assert_eq!(iter.seek(300), 300);
1267        assert_eq!(iter.seek(1500), u32::MAX);
1268    }
1269
1270    #[test]
1271    fn test_compression_improvement() {
1272        // Test that PEF achieves better compression than standard EF
1273        // on data with varying density
1274        let values: Vec<u32> = (0..10000)
1275            .map(|i| {
1276                if i < 5000 {
1277                    i * 2 // Dense region
1278                } else {
1279                    10000 + (i - 5000) * 100 // Sparse region
1280                }
1281            })
1282            .collect();
1283
1284        let pef = PartitionedEliasFano::from_sorted_slice(&values);
1285
1286        // PEF should use multiple partitions for this mixed-density data
1287        assert!(
1288            pef.num_partitions() > 1,
1289            "Expected multiple partitions, got {}",
1290            pef.num_partitions()
1291        );
1292
1293        // Verify correctness
1294        for (i, &expected) in values.iter().enumerate() {
1295            assert_eq!(pef.get(i as u32), Some(expected));
1296        }
1297    }
1298
1299    #[test]
1300    fn test_partitioned_ef_block_max() {
1301        // Create a large posting list that spans multiple blocks
1302        let doc_ids: Vec<u32> = (0..500).map(|i| i * 2).collect();
1303        // Vary term frequencies so different blocks have different max_tf
1304        let term_freqs: Vec<u32> = (0..500)
1305            .map(|i| {
1306                if i < 128 {
1307                    1 // Block 0: max_tf = 1
1308                } else if i < 256 {
1309                    5 // Block 1: max_tf = 5
1310                } else if i < 384 {
1311                    10 // Block 2: max_tf = 10
1312                } else {
1313                    3 // Block 3: max_tf = 3
1314                }
1315            })
1316            .collect();
1317
1318        let list = PartitionedEFPostingList::from_postings_with_idf(&doc_ids, &term_freqs, 2.0);
1319
1320        // Should have 4 blocks (500 docs / 128 per block)
1321        assert_eq!(list.blocks.len(), 4);
1322        assert_eq!(list.blocks[0].max_tf, 1);
1323        assert_eq!(list.blocks[1].max_tf, 5);
1324        assert_eq!(list.blocks[2].max_tf, 10);
1325        assert_eq!(list.blocks[3].max_tf, 3);
1326
1327        // Block 2 should have highest score (max_tf = 10)
1328        assert!(list.blocks[2].max_block_score > list.blocks[0].max_block_score);
1329        assert!(list.blocks[2].max_block_score > list.blocks[1].max_block_score);
1330        assert!(list.blocks[2].max_block_score > list.blocks[3].max_block_score);
1331
1332        // Global max_score should equal block 2's score
1333        assert_eq!(list.max_score, list.blocks[2].max_block_score);
1334
1335        // Test iterator block-max methods
1336        let mut iter = list.iterator();
1337        assert_eq!(iter.current_block_max_tf(), 1); // Block 0
1338
1339        // Seek to block 1
1340        iter.seek(256); // first doc in block 1
1341        assert_eq!(iter.current_block_max_tf(), 5);
1342
1343        // Seek to block 2
1344        iter.seek(512); // first doc in block 2
1345        assert_eq!(iter.current_block_max_tf(), 10);
1346
1347        // Test skip_to_block_with_doc
1348        let mut iter2 = list.iterator();
1349        let result = iter2.skip_to_block_with_doc(300);
1350        assert!(result.is_some());
1351        let (first_doc, score) = result.unwrap();
1352        assert!(first_doc <= 300);
1353        assert!(score > 0.0);
1354
1355        // Test max_remaining_score
1356        let mut iter3 = list.iterator();
1357        let max_score = iter3.max_remaining_score();
1358        assert_eq!(max_score, list.max_score);
1359
1360        // After seeking past block 2, max_remaining should be lower
1361        iter3.seek(768); // Block 3
1362        let remaining = iter3.max_remaining_score();
1363        assert!(remaining < max_score);
1364    }
1365}