hermes_core/structures/
roaring.rs

1//! Roaring Bitmap implementation for compressed integer sets
2//!
3//! Roaring bitmaps use a hybrid approach:
4//! - Sparse containers: sorted arrays for low-density chunks
5//! - Dense containers: bitmaps for high-density chunks
6//! - Run containers: RLE for consecutive ranges
7//!
8//! This provides excellent compression and fast set operations.
9//! Used by Apache Lucene, Spark, Druid, and many databases.
10
11use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
12use std::io::{self, Read, Write};
13
14/// Threshold for switching from array to bitmap container
15/// If a container has more than this many elements, use bitmap
16const ARRAY_TO_BITMAP_THRESHOLD: usize = 4096;
17
18/// Container types for different density patterns
19#[derive(Debug, Clone)]
20enum Container {
21    /// Sorted array of 16-bit values (for sparse containers)
22    Array(Vec<u16>),
23    /// Bitmap of 2^16 bits (for dense containers)
24    Bitmap(Box<[u64; 1024]>),
25    /// Run-length encoded (start, length) pairs
26    Runs(Vec<(u16, u16)>),
27}
28
29impl Container {
30    fn new_array() -> Self {
31        Container::Array(Vec::new())
32    }
33
34    #[allow(dead_code)]
35    fn new_bitmap() -> Self {
36        Container::Bitmap(Box::new([0u64; 1024]))
37    }
38
39    fn cardinality(&self) -> u32 {
40        match self {
41            Container::Array(arr) => arr.len() as u32,
42            Container::Bitmap(bm) => Self::bitmap_cardinality(bm),
43            Container::Runs(runs) => runs.iter().map(|(_, len)| *len as u32 + 1).sum(),
44        }
45    }
46
47    /// SIMD-accelerated bitmap cardinality (NEON on aarch64, POPCNT on x86_64)
48    #[inline]
49    fn bitmap_cardinality(bm: &[u64; 1024]) -> u32 {
50        #[cfg(target_arch = "aarch64")]
51        {
52            use std::arch::aarch64::*;
53
54            let mut total = 0u32;
55            let bytes = bm.as_ptr() as *const u8;
56
57            // Process 16 bytes (2 u64 words) at a time using NEON vcntq_u8
58            unsafe {
59                for i in (0..8192).step_by(16) {
60                    let v = vld1q_u8(bytes.add(i));
61                    let cnt = vcntq_u8(v);
62                    total += vaddlvq_u8(cnt) as u32;
63                }
64            }
65
66            total
67        }
68
69        #[cfg(target_arch = "x86_64")]
70        {
71            // Use POPCNT instruction on x86_64 (available on all modern CPUs)
72            let mut total = 0u32;
73            for &word in bm.iter() {
74                total += word.count_ones();
75            }
76            total
77        }
78
79        #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
80        {
81            bm.iter().map(|w| w.count_ones()).sum()
82        }
83    }
84
85    fn contains(&self, val: u16) -> bool {
86        match self {
87            Container::Array(arr) => arr.binary_search(&val).is_ok(),
88            Container::Bitmap(bm) => {
89                let word_idx = (val / 64) as usize;
90                let bit_idx = val % 64;
91                (bm[word_idx] >> bit_idx) & 1 == 1
92            }
93            Container::Runs(runs) => {
94                for &(start, len) in runs {
95                    if val >= start && val <= start + len {
96                        return true;
97                    }
98                    if val < start {
99                        return false;
100                    }
101                }
102                false
103            }
104        }
105    }
106
107    fn insert(&mut self, val: u16) -> bool {
108        match self {
109            Container::Array(arr) => {
110                match arr.binary_search(&val) {
111                    Ok(_) => false, // Already exists
112                    Err(pos) => {
113                        arr.insert(pos, val);
114                        true
115                    }
116                }
117            }
118            Container::Bitmap(bm) => {
119                let word_idx = (val / 64) as usize;
120                let bit_idx = val % 64;
121                let old = bm[word_idx];
122                bm[word_idx] |= 1u64 << bit_idx;
123                old != bm[word_idx]
124            }
125            Container::Runs(runs) => {
126                // For simplicity, convert to array, insert, then optimize
127                let mut arr: Vec<u16> = Vec::new();
128                for &(start, len) in runs.iter() {
129                    for i in 0..=len {
130                        arr.push(start + i);
131                    }
132                }
133                let inserted = match arr.binary_search(&val) {
134                    Ok(_) => false,
135                    Err(pos) => {
136                        arr.insert(pos, val);
137                        true
138                    }
139                };
140                *self = Container::Array(arr);
141                inserted
142            }
143        }
144    }
145
146    fn optimize(&mut self) {
147        let card = self.cardinality() as usize;
148
149        match self {
150            Container::Array(arr) if card > ARRAY_TO_BITMAP_THRESHOLD => {
151                // Convert to bitmap
152                let mut bm = Box::new([0u64; 1024]);
153                for &val in arr.iter() {
154                    let word_idx = (val / 64) as usize;
155                    let bit_idx = val % 64;
156                    bm[word_idx] |= 1u64 << bit_idx;
157                }
158                *self = Container::Bitmap(bm);
159            }
160            Container::Bitmap(bm) if card <= ARRAY_TO_BITMAP_THRESHOLD => {
161                // Convert to array
162                let mut arr = Vec::with_capacity(card);
163                for (word_idx, &word) in bm.iter().enumerate() {
164                    let mut w = word;
165                    while w != 0 {
166                        let bit_idx = w.trailing_zeros();
167                        arr.push((word_idx * 64 + bit_idx as usize) as u16);
168                        w &= w - 1;
169                    }
170                }
171                *self = Container::Array(arr);
172            }
173            _ => {}
174        }
175
176        // Try run-length encoding if beneficial
177        self.try_run_encode();
178    }
179
180    fn try_run_encode(&mut self) {
181        let arr = match self {
182            Container::Array(arr) if arr.len() >= 4 => arr,
183            _ => return,
184        };
185
186        let mut runs = Vec::new();
187        let mut i = 0;
188
189        while i < arr.len() {
190            let start = arr[i];
191            let mut end = start;
192
193            while i + 1 < arr.len() && arr[i + 1] == end + 1 {
194                end = arr[i + 1];
195                i += 1;
196            }
197
198            runs.push((start, end - start));
199            i += 1;
200        }
201
202        // Only use runs if it saves space
203        // Array: 2 bytes per element
204        // Runs: 4 bytes per run
205        if runs.len() * 4 < arr.len() * 2 {
206            *self = Container::Runs(runs);
207        }
208    }
209
210    fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
211        match self {
212            Container::Array(arr) => {
213                writer.write_u8(0)?; // Type tag
214                writer.write_u16::<LittleEndian>(arr.len() as u16)?;
215                for &val in arr {
216                    writer.write_u16::<LittleEndian>(val)?;
217                }
218            }
219            Container::Bitmap(bm) => {
220                writer.write_u8(1)?; // Type tag
221                for &word in bm.iter() {
222                    writer.write_u64::<LittleEndian>(word)?;
223                }
224            }
225            Container::Runs(runs) => {
226                writer.write_u8(2)?; // Type tag
227                writer.write_u16::<LittleEndian>(runs.len() as u16)?;
228                for &(start, len) in runs {
229                    writer.write_u16::<LittleEndian>(start)?;
230                    writer.write_u16::<LittleEndian>(len)?;
231                }
232            }
233        }
234        Ok(())
235    }
236
237    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
238        let tag = reader.read_u8()?;
239        match tag {
240            0 => {
241                let len = reader.read_u16::<LittleEndian>()? as usize;
242                let mut arr = Vec::with_capacity(len);
243                for _ in 0..len {
244                    arr.push(reader.read_u16::<LittleEndian>()?);
245                }
246                Ok(Container::Array(arr))
247            }
248            1 => {
249                let mut bm = Box::new([0u64; 1024]);
250                for word in bm.iter_mut() {
251                    *word = reader.read_u64::<LittleEndian>()?;
252                }
253                Ok(Container::Bitmap(bm))
254            }
255            2 => {
256                let len = reader.read_u16::<LittleEndian>()? as usize;
257                let mut runs = Vec::with_capacity(len);
258                for _ in 0..len {
259                    let start = reader.read_u16::<LittleEndian>()?;
260                    let run_len = reader.read_u16::<LittleEndian>()?;
261                    runs.push((start, run_len));
262                }
263                Ok(Container::Runs(runs))
264            }
265            _ => Err(io::Error::new(
266                io::ErrorKind::InvalidData,
267                "Invalid container type",
268            )),
269        }
270    }
271
272    fn size_bytes(&self) -> usize {
273        match self {
274            Container::Array(arr) => arr.len() * 2 + 4,
275            Container::Bitmap(_) => 8 * 1024 + 1,
276            Container::Runs(runs) => runs.len() * 4 + 4,
277        }
278    }
279}
280
281/// Roaring Bitmap for compressed integer sets
282#[derive(Debug, Clone)]
283pub struct RoaringBitmap {
284    /// High 16 bits -> container mapping
285    containers: Vec<(u16, Container)>,
286}
287
288impl RoaringBitmap {
289    /// Create empty bitmap
290    pub fn new() -> Self {
291        Self {
292            containers: Vec::new(),
293        }
294    }
295
296    /// Create from sorted slice
297    pub fn from_sorted_slice(values: &[u32]) -> Self {
298        let mut bitmap = Self::new();
299
300        if values.is_empty() {
301            return bitmap;
302        }
303
304        let mut current_high = (values[0] >> 16) as u16;
305        let mut current_container = Container::new_array();
306
307        for &val in values {
308            let high = (val >> 16) as u16;
309            let low = val as u16;
310
311            if high != current_high {
312                current_container.optimize();
313                bitmap.containers.push((current_high, current_container));
314                current_high = high;
315                current_container = Container::new_array();
316            }
317
318            current_container.insert(low);
319        }
320
321        current_container.optimize();
322        bitmap.containers.push((current_high, current_container));
323
324        bitmap
325    }
326
327    /// Insert a value
328    pub fn insert(&mut self, val: u32) -> bool {
329        let high = (val >> 16) as u16;
330        let low = val as u16;
331
332        // Find or create container
333        match self.containers.binary_search_by_key(&high, |&(h, _)| h) {
334            Ok(idx) => self.containers[idx].1.insert(low),
335            Err(idx) => {
336                let mut container = Container::new_array();
337                container.insert(low);
338                self.containers.insert(idx, (high, container));
339                true
340            }
341        }
342    }
343
344    /// Check if value exists
345    pub fn contains(&self, val: u32) -> bool {
346        let high = (val >> 16) as u16;
347        let low = val as u16;
348
349        match self.containers.binary_search_by_key(&high, |&(h, _)| h) {
350            Ok(idx) => self.containers[idx].1.contains(low),
351            Err(_) => false,
352        }
353    }
354
355    /// Number of elements
356    pub fn cardinality(&self) -> u32 {
357        self.containers.iter().map(|(_, c)| c.cardinality()).sum()
358    }
359
360    /// Check if empty
361    pub fn is_empty(&self) -> bool {
362        self.containers.is_empty()
363    }
364
365    /// Optimize all containers
366    pub fn optimize(&mut self) {
367        for (_, container) in &mut self.containers {
368            container.optimize();
369        }
370    }
371
372    /// Intersection with another bitmap
373    pub fn and(&self, other: &RoaringBitmap) -> RoaringBitmap {
374        let mut result = RoaringBitmap::new();
375
376        let mut i = 0;
377        let mut j = 0;
378
379        while i < self.containers.len() && j < other.containers.len() {
380            let (high1, c1) = &self.containers[i];
381            let (high2, c2) = &other.containers[j];
382
383            match high1.cmp(high2) {
384                std::cmp::Ordering::Less => i += 1,
385                std::cmp::Ordering::Greater => j += 1,
386                std::cmp::Ordering::Equal => {
387                    let intersected = Self::intersect_containers(c1, c2);
388                    if intersected.cardinality() > 0 {
389                        result.containers.push((*high1, intersected));
390                    }
391                    i += 1;
392                    j += 1;
393                }
394            }
395        }
396
397        result
398    }
399
400    fn intersect_containers(c1: &Container, c2: &Container) -> Container {
401        match (c1, c2) {
402            (Container::Array(a1), Container::Array(a2)) => {
403                let mut result = Vec::new();
404                let mut i = 0;
405                let mut j = 0;
406                while i < a1.len() && j < a2.len() {
407                    match a1[i].cmp(&a2[j]) {
408                        std::cmp::Ordering::Less => i += 1,
409                        std::cmp::Ordering::Greater => j += 1,
410                        std::cmp::Ordering::Equal => {
411                            result.push(a1[i]);
412                            i += 1;
413                            j += 1;
414                        }
415                    }
416                }
417                Container::Array(result)
418            }
419            (Container::Bitmap(b1), Container::Bitmap(b2)) => {
420                let mut result = Box::new([0u64; 1024]);
421                for i in 0..1024 {
422                    result[i] = b1[i] & b2[i];
423                }
424                let mut c = Container::Bitmap(result);
425                c.optimize();
426                c
427            }
428            (Container::Array(arr), Container::Bitmap(bm))
429            | (Container::Bitmap(bm), Container::Array(arr)) => {
430                let mut result = Vec::new();
431                for &val in arr {
432                    let word_idx = (val / 64) as usize;
433                    let bit_idx = val % 64;
434                    if (bm[word_idx] >> bit_idx) & 1 == 1 {
435                        result.push(val);
436                    }
437                }
438                Container::Array(result)
439            }
440            _ => {
441                // For runs, convert to array first
442                let arr1 = Self::container_to_array(c1);
443                let arr2 = Self::container_to_array(c2);
444                Self::intersect_containers(&Container::Array(arr1), &Container::Array(arr2))
445            }
446        }
447    }
448
449    fn container_to_array(c: &Container) -> Vec<u16> {
450        match c {
451            Container::Array(arr) => arr.clone(),
452            Container::Bitmap(bm) => {
453                let mut arr = Vec::new();
454                for (word_idx, &word) in bm.iter().enumerate() {
455                    let mut w = word;
456                    while w != 0 {
457                        let bit_idx = w.trailing_zeros();
458                        arr.push((word_idx * 64 + bit_idx as usize) as u16);
459                        w &= w - 1;
460                    }
461                }
462                arr
463            }
464            Container::Runs(runs) => {
465                let mut arr = Vec::new();
466                for &(start, len) in runs {
467                    for i in 0..=len {
468                        arr.push(start + i);
469                    }
470                }
471                arr
472            }
473        }
474    }
475
476    /// Union with another bitmap
477    pub fn or(&self, other: &RoaringBitmap) -> RoaringBitmap {
478        let mut result = RoaringBitmap::new();
479
480        let mut i = 0;
481        let mut j = 0;
482
483        while i < self.containers.len() || j < other.containers.len() {
484            if i >= self.containers.len() {
485                result.containers.push(other.containers[j].clone());
486                j += 1;
487            } else if j >= other.containers.len() {
488                result.containers.push(self.containers[i].clone());
489                i += 1;
490            } else {
491                let (high1, c1) = &self.containers[i];
492                let (high2, c2) = &other.containers[j];
493
494                match high1.cmp(high2) {
495                    std::cmp::Ordering::Less => {
496                        result.containers.push(self.containers[i].clone());
497                        i += 1;
498                    }
499                    std::cmp::Ordering::Greater => {
500                        result.containers.push(other.containers[j].clone());
501                        j += 1;
502                    }
503                    std::cmp::Ordering::Equal => {
504                        let united = Self::union_containers(c1, c2);
505                        result.containers.push((*high1, united));
506                        i += 1;
507                        j += 1;
508                    }
509                }
510            }
511        }
512
513        result
514    }
515
516    fn union_containers(c1: &Container, c2: &Container) -> Container {
517        match (c1, c2) {
518            (Container::Bitmap(b1), Container::Bitmap(b2)) => {
519                let mut result = Box::new([0u64; 1024]);
520                for i in 0..1024 {
521                    result[i] = b1[i] | b2[i];
522                }
523                Container::Bitmap(result)
524            }
525            _ => {
526                // Convert both to arrays and merge
527                let arr1 = Self::container_to_array(c1);
528                let arr2 = Self::container_to_array(c2);
529
530                let mut result = Vec::with_capacity(arr1.len() + arr2.len());
531                let mut i = 0;
532                let mut j = 0;
533
534                while i < arr1.len() && j < arr2.len() {
535                    match arr1[i].cmp(&arr2[j]) {
536                        std::cmp::Ordering::Less => {
537                            result.push(arr1[i]);
538                            i += 1;
539                        }
540                        std::cmp::Ordering::Greater => {
541                            result.push(arr2[j]);
542                            j += 1;
543                        }
544                        std::cmp::Ordering::Equal => {
545                            result.push(arr1[i]);
546                            i += 1;
547                            j += 1;
548                        }
549                    }
550                }
551
552                result.extend_from_slice(&arr1[i..]);
553                result.extend_from_slice(&arr2[j..]);
554
555                let mut c = Container::Array(result);
556                c.optimize();
557                c
558            }
559        }
560    }
561
562    /// Serialize
563    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
564        writer.write_u32::<LittleEndian>(self.containers.len() as u32)?;
565        for (high, container) in &self.containers {
566            writer.write_u16::<LittleEndian>(*high)?;
567            container.serialize(writer)?;
568        }
569        Ok(())
570    }
571
572    /// Deserialize
573    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
574        let num_containers = reader.read_u32::<LittleEndian>()? as usize;
575        let mut containers = Vec::with_capacity(num_containers);
576
577        for _ in 0..num_containers {
578            let high = reader.read_u16::<LittleEndian>()?;
579            let container = Container::deserialize(reader)?;
580            containers.push((high, container));
581        }
582
583        Ok(Self { containers })
584    }
585
586    /// Approximate size in bytes
587    pub fn size_bytes(&self) -> usize {
588        4 + self
589            .containers
590            .iter()
591            .map(|(_, c)| 2 + c.size_bytes())
592            .sum::<usize>()
593    }
594
595    /// Create an iterator
596    pub fn iter(&self) -> RoaringIterator<'_> {
597        RoaringIterator {
598            bitmap: self,
599            container_idx: 0,
600            value_idx: 0,
601            current_values: Vec::new(),
602        }
603    }
604}
605
606impl Default for RoaringBitmap {
607    fn default() -> Self {
608        Self::new()
609    }
610}
611
612/// Iterator over Roaring Bitmap
613pub struct RoaringIterator<'a> {
614    bitmap: &'a RoaringBitmap,
615    container_idx: usize,
616    value_idx: usize,
617    current_values: Vec<u16>,
618}
619
620impl<'a> Iterator for RoaringIterator<'a> {
621    type Item = u32;
622
623    fn next(&mut self) -> Option<Self::Item> {
624        loop {
625            if self.value_idx < self.current_values.len() {
626                let high = self.bitmap.containers[self.container_idx - 1].0 as u32;
627                let low = self.current_values[self.value_idx] as u32;
628                self.value_idx += 1;
629                return Some((high << 16) | low);
630            }
631
632            if self.container_idx >= self.bitmap.containers.len() {
633                return None;
634            }
635
636            let (_, container) = &self.bitmap.containers[self.container_idx];
637            self.current_values = RoaringBitmap::container_to_array(container);
638            self.value_idx = 0;
639            self.container_idx += 1;
640        }
641    }
642}
643
644/// Block size for Roaring BlockMax (matches container size = 65536 doc_ids)
645pub const ROARING_BLOCK_SIZE: usize = 65536;
646
647/// Block metadata for BlockMax WAND optimization in Roaring
648#[derive(Debug, Clone)]
649pub struct RoaringBlockInfo {
650    /// High 16 bits (container key)
651    pub container_key: u16,
652    /// First doc_id in this container
653    pub first_doc_id: u32,
654    /// Last doc_id in this container
655    pub last_doc_id: u32,
656    /// Maximum term frequency in this block
657    pub max_tf: u32,
658    /// Upper bound BM25 score for this block
659    pub max_block_score: f32,
660    /// Number of documents in this container
661    pub num_docs: u32,
662}
663
664impl RoaringBlockInfo {
665    /// Serialize block info
666    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
667        writer.write_u16::<LittleEndian>(self.container_key)?;
668        writer.write_u32::<LittleEndian>(self.first_doc_id)?;
669        writer.write_u32::<LittleEndian>(self.last_doc_id)?;
670        writer.write_u32::<LittleEndian>(self.max_tf)?;
671        writer.write_f32::<LittleEndian>(self.max_block_score)?;
672        writer.write_u32::<LittleEndian>(self.num_docs)?;
673        Ok(())
674    }
675
676    /// Deserialize block info
677    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
678        Ok(Self {
679            container_key: reader.read_u16::<LittleEndian>()?,
680            first_doc_id: reader.read_u32::<LittleEndian>()?,
681            last_doc_id: reader.read_u32::<LittleEndian>()?,
682            max_tf: reader.read_u32::<LittleEndian>()?,
683            max_block_score: reader.read_f32::<LittleEndian>()?,
684            num_docs: reader.read_u32::<LittleEndian>()?,
685        })
686    }
687}
688
689/// Roaring bitmap with term frequencies for posting lists
690#[derive(Debug, Clone)]
691pub struct RoaringPostingList {
692    /// Document IDs as roaring bitmap
693    pub doc_ids: RoaringBitmap,
694    /// Term frequencies (sparse map for non-1 frequencies)
695    /// Most terms have tf=1, so we only store exceptions
696    pub term_freqs: Vec<(u32, u32)>,
697    /// Default term frequency (usually 1)
698    pub default_tf: u32,
699    /// Maximum term frequency
700    pub max_tf: u32,
701    /// Block metadata for BlockMax WAND (one per container)
702    pub blocks: Vec<RoaringBlockInfo>,
703    /// Global maximum score across all blocks
704    pub max_score: f32,
705}
706
707impl RoaringPostingList {
708    /// BM25 parameters for block-max score calculation
709    const K1: f32 = 1.2;
710    const B: f32 = 0.75;
711
712    /// Compute BM25 upper bound score for a given max_tf and IDF
713    #[inline]
714    pub fn compute_bm25_upper_bound(max_tf: u32, idf: f32) -> f32 {
715        let tf = max_tf as f32;
716        // Conservative upper bound: assume dl=0, so length_norm = 1 - b = 0.25
717        let min_length_norm = 1.0 - Self::B;
718        let tf_norm = (tf * (Self::K1 + 1.0)) / (tf + Self::K1 * min_length_norm);
719        idf * tf_norm
720    }
721
722    /// Create from doc_ids and term frequencies (without IDF - for backwards compatibility)
723    pub fn from_postings(doc_ids: &[u32], term_freqs: &[u32]) -> Self {
724        Self::from_postings_with_idf(doc_ids, term_freqs, 1.0)
725    }
726
727    /// Create from doc_ids and term frequencies with IDF for block-max scores
728    pub fn from_postings_with_idf(doc_ids: &[u32], term_freqs: &[u32], idf: f32) -> Self {
729        assert_eq!(doc_ids.len(), term_freqs.len());
730
731        if doc_ids.is_empty() {
732            return Self {
733                doc_ids: RoaringBitmap::new(),
734                term_freqs: Vec::new(),
735                default_tf: 1,
736                max_tf: 0,
737                blocks: Vec::new(),
738                max_score: 0.0,
739            };
740        }
741
742        let bitmap = RoaringBitmap::from_sorted_slice(doc_ids);
743
744        // Find most common TF (usually 1)
745        let mut tf_counts = std::collections::HashMap::new();
746        for &tf in term_freqs {
747            *tf_counts.entry(tf).or_insert(0u32) += 1;
748        }
749        let default_tf = tf_counts
750            .iter()
751            .max_by_key(|&(_, count)| count)
752            .map(|(&tf, _)| tf)
753            .unwrap_or(1);
754
755        // Store only non-default TFs
756        let exceptions: Vec<(u32, u32)> = doc_ids
757            .iter()
758            .zip(term_freqs.iter())
759            .filter(|&(_, &tf)| tf != default_tf)
760            .map(|(&doc, &tf)| (doc, tf))
761            .collect();
762
763        let max_tf = *term_freqs.iter().max().unwrap_or(&1);
764
765        // Build block metadata (one block per container)
766        // Group doc_ids by container (high 16 bits)
767        let mut blocks = Vec::new();
768        let mut max_score = 0.0f32;
769        let mut i = 0;
770
771        while i < doc_ids.len() {
772            let container_key = (doc_ids[i] >> 16) as u16;
773            let block_start = i;
774
775            // Find all docs in this container
776            while i < doc_ids.len() && (doc_ids[i] >> 16) as u16 == container_key {
777                i += 1;
778            }
779
780            let block_doc_ids = &doc_ids[block_start..i];
781            let block_tfs = &term_freqs[block_start..i];
782            let block_max_tf = *block_tfs.iter().max().unwrap_or(&1);
783            let block_score = Self::compute_bm25_upper_bound(block_max_tf, idf);
784            max_score = max_score.max(block_score);
785
786            blocks.push(RoaringBlockInfo {
787                container_key,
788                first_doc_id: block_doc_ids[0],
789                last_doc_id: *block_doc_ids.last().unwrap(),
790                max_tf: block_max_tf,
791                max_block_score: block_score,
792                num_docs: block_doc_ids.len() as u32,
793            });
794        }
795
796        Self {
797            doc_ids: bitmap,
798            term_freqs: exceptions,
799            default_tf,
800            max_tf,
801            blocks,
802            max_score,
803        }
804    }
805
806    /// Get term frequency for a document
807    pub fn get_tf(&self, doc_id: u32) -> u32 {
808        // Binary search in exceptions
809        match self.term_freqs.binary_search_by_key(&doc_id, |&(d, _)| d) {
810            Ok(idx) => self.term_freqs[idx].1,
811            Err(_) => self.default_tf,
812        }
813    }
814
815    /// Number of documents
816    pub fn len(&self) -> u32 {
817        self.doc_ids.cardinality()
818    }
819
820    /// Check if empty
821    pub fn is_empty(&self) -> bool {
822        self.doc_ids.is_empty()
823    }
824
825    /// Get number of blocks (containers)
826    pub fn num_blocks(&self) -> usize {
827        self.blocks.len()
828    }
829
830    /// Get block index for a doc_id
831    pub fn block_for_doc(&self, doc_id: u32) -> Option<usize> {
832        let container_key = (doc_id >> 16) as u16;
833        self.blocks
834            .binary_search_by_key(&container_key, |b| b.container_key)
835            .ok()
836    }
837
838    /// Serialize
839    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
840        self.doc_ids.serialize(writer)?;
841        writer.write_u32::<LittleEndian>(self.default_tf)?;
842        writer.write_u32::<LittleEndian>(self.max_tf)?;
843        writer.write_f32::<LittleEndian>(self.max_score)?;
844        writer.write_u32::<LittleEndian>(self.term_freqs.len() as u32)?;
845        for &(doc, tf) in &self.term_freqs {
846            writer.write_u32::<LittleEndian>(doc)?;
847            writer.write_u32::<LittleEndian>(tf)?;
848        }
849
850        // Write block metadata
851        writer.write_u32::<LittleEndian>(self.blocks.len() as u32)?;
852        for block in &self.blocks {
853            block.serialize(writer)?;
854        }
855
856        Ok(())
857    }
858
859    /// Deserialize
860    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
861        let doc_ids = RoaringBitmap::deserialize(reader)?;
862        let default_tf = reader.read_u32::<LittleEndian>()?;
863        let max_tf = reader.read_u32::<LittleEndian>()?;
864        let max_score = reader.read_f32::<LittleEndian>()?;
865        let num_exceptions = reader.read_u32::<LittleEndian>()? as usize;
866        let mut term_freqs = Vec::with_capacity(num_exceptions);
867        for _ in 0..num_exceptions {
868            let doc = reader.read_u32::<LittleEndian>()?;
869            let tf = reader.read_u32::<LittleEndian>()?;
870            term_freqs.push((doc, tf));
871        }
872
873        // Read block metadata
874        let num_blocks = reader.read_u32::<LittleEndian>()? as usize;
875        let mut blocks = Vec::with_capacity(num_blocks);
876        for _ in 0..num_blocks {
877            blocks.push(RoaringBlockInfo::deserialize(reader)?);
878        }
879
880        Ok(Self {
881            doc_ids,
882            term_freqs,
883            default_tf,
884            max_tf,
885            blocks,
886            max_score,
887        })
888    }
889
890    /// Create iterator
891    pub fn iterator(&self) -> RoaringPostingIterator<'_> {
892        RoaringPostingIterator {
893            list: self,
894            doc_iter: self.doc_ids.iter(),
895            current_doc: None,
896            current_block: 0,
897        }
898    }
899}
900
901/// Iterator over Roaring posting list with BlockMax support
902pub struct RoaringPostingIterator<'a> {
903    list: &'a RoaringPostingList,
904    doc_iter: RoaringIterator<'a>,
905    current_doc: Option<u32>,
906    current_block: usize,
907}
908
909impl<'a> RoaringPostingIterator<'a> {
910    /// Current document ID
911    pub fn doc(&self) -> u32 {
912        self.current_doc.unwrap_or(u32::MAX)
913    }
914
915    /// Current term frequency
916    pub fn term_freq(&self) -> u32 {
917        match self.current_doc {
918            Some(doc) => self.list.get_tf(doc),
919            None => 0,
920        }
921    }
922
923    /// Advance to next document
924    pub fn advance(&mut self) -> u32 {
925        self.current_doc = self.doc_iter.next();
926        // Update current block if needed
927        if let Some(doc) = self.current_doc
928            && !self.list.blocks.is_empty()
929        {
930            let container_key = (doc >> 16) as u16;
931            // Move to next block if we've passed current one
932            while self.current_block < self.list.blocks.len()
933                && self.list.blocks[self.current_block].container_key < container_key
934            {
935                self.current_block += 1;
936            }
937        }
938        self.doc()
939    }
940
941    /// Initialize (must be called before first use)
942    pub fn init(&mut self) {
943        self.current_doc = self.doc_iter.next();
944        self.current_block = 0;
945    }
946
947    /// Seek to first doc >= target
948    pub fn seek(&mut self, target: u32) -> u32 {
949        // Use block skip list for faster seeking
950        if !self.list.blocks.is_empty() {
951            let target_container = (target >> 16) as u16;
952
953            // Skip blocks that are entirely before target
954            while self.current_block < self.list.blocks.len()
955                && self.list.blocks[self.current_block].last_doc_id < target
956            {
957                self.current_block += 1;
958            }
959
960            // If we've exhausted all blocks
961            if self.current_block >= self.list.blocks.len() {
962                self.current_doc = None;
963                return u32::MAX;
964            }
965
966            // Skip docs until we reach the target block's first doc
967            let block = &self.list.blocks[self.current_block];
968            if block.container_key > target_container
969                || (block.container_key == target_container && block.first_doc_id > self.doc())
970            {
971                // Need to advance iterator to this block
972                while let Some(doc) = self.current_doc {
973                    if doc >= block.first_doc_id {
974                        break;
975                    }
976                    self.current_doc = self.doc_iter.next();
977                }
978            }
979        }
980
981        // Linear scan within block
982        while let Some(doc) = self.current_doc {
983            if doc >= target {
984                return doc;
985            }
986            self.current_doc = self.doc_iter.next();
987        }
988        u32::MAX
989    }
990
991    /// Check if iterator is exhausted
992    pub fn is_exhausted(&self) -> bool {
993        self.current_doc.is_none()
994    }
995
996    /// Get current block's max score (for BlockMax WAND)
997    pub fn current_block_max_score(&self) -> f32 {
998        if self.current_doc.is_none() || self.list.blocks.is_empty() {
999            return 0.0;
1000        }
1001        if self.current_block < self.list.blocks.len() {
1002            self.list.blocks[self.current_block].max_block_score
1003        } else {
1004            0.0
1005        }
1006    }
1007
1008    /// Get current block's max term frequency
1009    pub fn current_block_max_tf(&self) -> u32 {
1010        if self.current_doc.is_none() || self.list.blocks.is_empty() {
1011            return 0;
1012        }
1013        if self.current_block < self.list.blocks.len() {
1014            self.list.blocks[self.current_block].max_tf
1015        } else {
1016            0
1017        }
1018    }
1019
1020    /// Get max score for remaining blocks (for MaxScore optimization)
1021    pub fn max_remaining_score(&self) -> f32 {
1022        if self.current_doc.is_none() || self.list.blocks.is_empty() {
1023            return 0.0;
1024        }
1025        self.list.blocks[self.current_block..]
1026            .iter()
1027            .map(|b| b.max_block_score)
1028            .fold(0.0f32, |a, b| a.max(b))
1029    }
1030
1031    /// Skip to next block containing doc >= target (for BlockWAND)
1032    /// Returns (first_doc_in_block, block_max_score) or None if exhausted
1033    pub fn skip_to_block_with_doc(&mut self, target: u32) -> Option<(u32, f32)> {
1034        if self.list.blocks.is_empty() {
1035            return None;
1036        }
1037
1038        while self.current_block < self.list.blocks.len() {
1039            let block = &self.list.blocks[self.current_block];
1040            if block.last_doc_id >= target {
1041                // Advance iterator to this block's first doc
1042                while let Some(doc) = self.current_doc {
1043                    if doc >= block.first_doc_id {
1044                        break;
1045                    }
1046                    self.current_doc = self.doc_iter.next();
1047                }
1048                return Some((block.first_doc_id, block.max_block_score));
1049            }
1050            self.current_block += 1;
1051        }
1052
1053        self.current_doc = None;
1054        None
1055    }
1056}
1057
1058#[cfg(test)]
1059mod tests {
1060    use super::*;
1061
1062    #[test]
1063    fn test_roaring_basic() {
1064        let mut bitmap = RoaringBitmap::new();
1065        bitmap.insert(1);
1066        bitmap.insert(100);
1067        bitmap.insert(1000);
1068        bitmap.insert(100000);
1069
1070        assert!(bitmap.contains(1));
1071        assert!(bitmap.contains(100));
1072        assert!(bitmap.contains(1000));
1073        assert!(bitmap.contains(100000));
1074        assert!(!bitmap.contains(2));
1075        assert!(!bitmap.contains(50000));
1076
1077        assert_eq!(bitmap.cardinality(), 4);
1078    }
1079
1080    #[test]
1081    fn test_roaring_from_sorted() {
1082        let values: Vec<u32> = (0..10000).map(|i| i * 3).collect();
1083        let bitmap = RoaringBitmap::from_sorted_slice(&values);
1084
1085        assert_eq!(bitmap.cardinality(), 10000);
1086
1087        for &val in &values {
1088            assert!(bitmap.contains(val), "Missing value {}", val);
1089        }
1090    }
1091
1092    #[test]
1093    fn test_roaring_intersection() {
1094        let a = RoaringBitmap::from_sorted_slice(&[1, 2, 3, 100, 200, 300]);
1095        let b = RoaringBitmap::from_sorted_slice(&[2, 3, 4, 200, 300, 400]);
1096
1097        let result = a.and(&b);
1098
1099        assert_eq!(result.cardinality(), 4);
1100        assert!(result.contains(2));
1101        assert!(result.contains(3));
1102        assert!(result.contains(200));
1103        assert!(result.contains(300));
1104    }
1105
1106    #[test]
1107    fn test_roaring_union() {
1108        let a = RoaringBitmap::from_sorted_slice(&[1, 2, 3]);
1109        let b = RoaringBitmap::from_sorted_slice(&[3, 4, 5]);
1110
1111        let result = a.or(&b);
1112
1113        assert_eq!(result.cardinality(), 5);
1114        for i in 1..=5 {
1115            assert!(result.contains(i));
1116        }
1117    }
1118
1119    #[test]
1120    fn test_roaring_serialization() {
1121        let values: Vec<u32> = (0..1000).map(|i| i * 7).collect();
1122        let bitmap = RoaringBitmap::from_sorted_slice(&values);
1123
1124        let mut buffer = Vec::new();
1125        bitmap.serialize(&mut buffer).unwrap();
1126
1127        let restored = RoaringBitmap::deserialize(&mut &buffer[..]).unwrap();
1128
1129        assert_eq!(restored.cardinality(), bitmap.cardinality());
1130        for &val in &values {
1131            assert!(restored.contains(val));
1132        }
1133    }
1134
1135    #[test]
1136    fn test_roaring_posting_list() {
1137        let doc_ids: Vec<u32> = vec![1, 5, 10, 50, 100, 500, 1000];
1138        let term_freqs: Vec<u32> = vec![1, 1, 2, 1, 3, 1, 1];
1139
1140        let list = RoaringPostingList::from_postings(&doc_ids, &term_freqs);
1141
1142        assert_eq!(list.len(), 7);
1143        assert_eq!(list.default_tf, 1);
1144        assert_eq!(list.max_tf, 3);
1145
1146        // Check TF lookups
1147        assert_eq!(list.get_tf(1), 1);
1148        assert_eq!(list.get_tf(10), 2);
1149        assert_eq!(list.get_tf(100), 3);
1150    }
1151
1152    #[test]
1153    fn test_roaring_iterator() {
1154        let values: Vec<u32> = vec![1, 10, 100, 1000, 10000];
1155        let bitmap = RoaringBitmap::from_sorted_slice(&values);
1156
1157        let collected: Vec<u32> = bitmap.iter().collect();
1158        assert_eq!(collected, values);
1159    }
1160
1161    #[test]
1162    fn test_roaring_block_max() {
1163        // Create posting list spanning multiple containers (high 16 bits)
1164        // Container 0: doc_ids 0-65535
1165        // Container 1: doc_ids 65536-131071
1166        // Container 2: doc_ids 131072-196607
1167        let mut doc_ids = Vec::new();
1168        let mut term_freqs = Vec::new();
1169
1170        // Container 0: 100 docs with max_tf = 2
1171        for i in 0..100 {
1172            doc_ids.push(i * 100);
1173            term_freqs.push(if i == 50 { 2 } else { 1 });
1174        }
1175
1176        // Container 1: 100 docs with max_tf = 5
1177        for i in 0..100 {
1178            doc_ids.push(65536 + i * 100);
1179            term_freqs.push(if i == 25 { 5 } else { 1 });
1180        }
1181
1182        // Container 2: 100 docs with max_tf = 3
1183        for i in 0..100 {
1184            doc_ids.push(131072 + i * 100);
1185            term_freqs.push(if i == 75 { 3 } else { 1 });
1186        }
1187
1188        let list = RoaringPostingList::from_postings_with_idf(&doc_ids, &term_freqs, 2.0);
1189
1190        // Should have 3 blocks (one per container)
1191        assert_eq!(list.num_blocks(), 3);
1192        assert_eq!(list.blocks[0].container_key, 0);
1193        assert_eq!(list.blocks[1].container_key, 1);
1194        assert_eq!(list.blocks[2].container_key, 2);
1195
1196        assert_eq!(list.blocks[0].max_tf, 2);
1197        assert_eq!(list.blocks[1].max_tf, 5);
1198        assert_eq!(list.blocks[2].max_tf, 3);
1199
1200        // Block 1 should have highest score (max_tf = 5)
1201        assert!(list.blocks[1].max_block_score > list.blocks[0].max_block_score);
1202        assert!(list.blocks[1].max_block_score > list.blocks[2].max_block_score);
1203
1204        // Global max_score should equal block 1's score
1205        assert_eq!(list.max_score, list.blocks[1].max_block_score);
1206
1207        // Test iterator block-max methods
1208        let mut iter = list.iterator();
1209        iter.init();
1210        assert_eq!(iter.current_block_max_tf(), 2); // Block 0
1211
1212        // Seek to block 1
1213        iter.seek(65536);
1214        assert_eq!(iter.current_block_max_tf(), 5);
1215
1216        // Seek to block 2
1217        iter.seek(131072);
1218        assert_eq!(iter.current_block_max_tf(), 3);
1219    }
1220
1221    #[test]
1222    fn test_roaring_block_max_serialization() {
1223        let mut doc_ids = Vec::new();
1224        let mut term_freqs = Vec::new();
1225
1226        // Two containers
1227        for i in 0..50 {
1228            doc_ids.push(i * 10);
1229            term_freqs.push((i % 5) + 1);
1230        }
1231        for i in 0..50 {
1232            doc_ids.push(65536 + i * 10);
1233            term_freqs.push((i % 3) + 1);
1234        }
1235
1236        let list = RoaringPostingList::from_postings_with_idf(&doc_ids, &term_freqs, 1.5);
1237
1238        let mut buffer = Vec::new();
1239        list.serialize(&mut buffer).unwrap();
1240
1241        let restored = RoaringPostingList::deserialize(&mut &buffer[..]).unwrap();
1242
1243        assert_eq!(restored.len(), list.len());
1244        assert_eq!(restored.max_tf, list.max_tf);
1245        assert_eq!(restored.max_score, list.max_score);
1246        assert_eq!(restored.num_blocks(), list.num_blocks());
1247
1248        // Verify block metadata
1249        for (orig, rest) in list.blocks.iter().zip(restored.blocks.iter()) {
1250            assert_eq!(orig.container_key, rest.container_key);
1251            assert_eq!(orig.first_doc_id, rest.first_doc_id);
1252            assert_eq!(orig.last_doc_id, rest.last_doc_id);
1253            assert_eq!(orig.max_tf, rest.max_tf);
1254            assert_eq!(orig.max_block_score, rest.max_block_score);
1255        }
1256
1257        // Verify iteration produces same results
1258        let mut iter1 = list.iterator();
1259        let mut iter2 = restored.iterator();
1260        iter1.init();
1261        iter2.init();
1262
1263        while !iter1.is_exhausted() {
1264            assert_eq!(iter1.doc(), iter2.doc());
1265            assert_eq!(iter1.term_freq(), iter2.term_freq());
1266            iter1.advance();
1267            iter2.advance();
1268        }
1269        assert!(iter2.is_exhausted());
1270    }
1271}