hermes_core/structures/
roaring.rs

1//! Roaring Bitmap implementation for compressed integer sets
2//!
3//! Roaring bitmaps use a hybrid approach:
4//! - Sparse containers: sorted arrays for low-density chunks
5//! - Dense containers: bitmaps for high-density chunks
6//! - Run containers: RLE for consecutive ranges
7//!
8//! This provides excellent compression and fast set operations.
9//! Used by Apache Lucene, Spark, Druid, and many databases.
10
11use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
12use std::io::{self, Read, Write};
13
14/// Threshold for switching from array to bitmap container
15/// If a container has more than this many elements, use bitmap
16const ARRAY_TO_BITMAP_THRESHOLD: usize = 4096;
17
18/// Container types for different density patterns
19#[derive(Debug, Clone)]
20enum Container {
21    /// Sorted array of 16-bit values (for sparse containers)
22    Array(Vec<u16>),
23    /// Bitmap of 2^16 bits (for dense containers)
24    Bitmap(Box<[u64; 1024]>),
25    /// Run-length encoded (start, length) pairs
26    Runs(Vec<(u16, u16)>),
27}
28
29impl Container {
30    fn new_array() -> Self {
31        Container::Array(Vec::new())
32    }
33
34    #[allow(dead_code)]
35    fn new_bitmap() -> Self {
36        Container::Bitmap(Box::new([0u64; 1024]))
37    }
38
39    fn cardinality(&self) -> u32 {
40        match self {
41            Container::Array(arr) => arr.len() as u32,
42            Container::Bitmap(bm) => bm.iter().map(|w| w.count_ones()).sum(),
43            Container::Runs(runs) => runs.iter().map(|(_, len)| *len as u32 + 1).sum(),
44        }
45    }
46
47    fn contains(&self, val: u16) -> bool {
48        match self {
49            Container::Array(arr) => arr.binary_search(&val).is_ok(),
50            Container::Bitmap(bm) => {
51                let word_idx = (val / 64) as usize;
52                let bit_idx = val % 64;
53                (bm[word_idx] >> bit_idx) & 1 == 1
54            }
55            Container::Runs(runs) => {
56                for &(start, len) in runs {
57                    if val >= start && val <= start + len {
58                        return true;
59                    }
60                    if val < start {
61                        return false;
62                    }
63                }
64                false
65            }
66        }
67    }
68
69    fn insert(&mut self, val: u16) -> bool {
70        match self {
71            Container::Array(arr) => {
72                match arr.binary_search(&val) {
73                    Ok(_) => false, // Already exists
74                    Err(pos) => {
75                        arr.insert(pos, val);
76                        true
77                    }
78                }
79            }
80            Container::Bitmap(bm) => {
81                let word_idx = (val / 64) as usize;
82                let bit_idx = val % 64;
83                let old = bm[word_idx];
84                bm[word_idx] |= 1u64 << bit_idx;
85                old != bm[word_idx]
86            }
87            Container::Runs(runs) => {
88                // For simplicity, convert to array, insert, then optimize
89                let mut arr: Vec<u16> = Vec::new();
90                for &(start, len) in runs.iter() {
91                    for i in 0..=len {
92                        arr.push(start + i);
93                    }
94                }
95                let inserted = match arr.binary_search(&val) {
96                    Ok(_) => false,
97                    Err(pos) => {
98                        arr.insert(pos, val);
99                        true
100                    }
101                };
102                *self = Container::Array(arr);
103                inserted
104            }
105        }
106    }
107
108    fn optimize(&mut self) {
109        let card = self.cardinality() as usize;
110
111        match self {
112            Container::Array(arr) if card > ARRAY_TO_BITMAP_THRESHOLD => {
113                // Convert to bitmap
114                let mut bm = Box::new([0u64; 1024]);
115                for &val in arr.iter() {
116                    let word_idx = (val / 64) as usize;
117                    let bit_idx = val % 64;
118                    bm[word_idx] |= 1u64 << bit_idx;
119                }
120                *self = Container::Bitmap(bm);
121            }
122            Container::Bitmap(bm) if card <= ARRAY_TO_BITMAP_THRESHOLD => {
123                // Convert to array
124                let mut arr = Vec::with_capacity(card);
125                for (word_idx, &word) in bm.iter().enumerate() {
126                    let mut w = word;
127                    while w != 0 {
128                        let bit_idx = w.trailing_zeros();
129                        arr.push((word_idx * 64 + bit_idx as usize) as u16);
130                        w &= w - 1;
131                    }
132                }
133                *self = Container::Array(arr);
134            }
135            _ => {}
136        }
137
138        // Try run-length encoding if beneficial
139        self.try_run_encode();
140    }
141
142    fn try_run_encode(&mut self) {
143        let arr = match self {
144            Container::Array(arr) if arr.len() >= 4 => arr,
145            _ => return,
146        };
147
148        let mut runs = Vec::new();
149        let mut i = 0;
150
151        while i < arr.len() {
152            let start = arr[i];
153            let mut end = start;
154
155            while i + 1 < arr.len() && arr[i + 1] == end + 1 {
156                end = arr[i + 1];
157                i += 1;
158            }
159
160            runs.push((start, end - start));
161            i += 1;
162        }
163
164        // Only use runs if it saves space
165        // Array: 2 bytes per element
166        // Runs: 4 bytes per run
167        if runs.len() * 4 < arr.len() * 2 {
168            *self = Container::Runs(runs);
169        }
170    }
171
172    fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
173        match self {
174            Container::Array(arr) => {
175                writer.write_u8(0)?; // Type tag
176                writer.write_u16::<LittleEndian>(arr.len() as u16)?;
177                for &val in arr {
178                    writer.write_u16::<LittleEndian>(val)?;
179                }
180            }
181            Container::Bitmap(bm) => {
182                writer.write_u8(1)?; // Type tag
183                for &word in bm.iter() {
184                    writer.write_u64::<LittleEndian>(word)?;
185                }
186            }
187            Container::Runs(runs) => {
188                writer.write_u8(2)?; // Type tag
189                writer.write_u16::<LittleEndian>(runs.len() as u16)?;
190                for &(start, len) in runs {
191                    writer.write_u16::<LittleEndian>(start)?;
192                    writer.write_u16::<LittleEndian>(len)?;
193                }
194            }
195        }
196        Ok(())
197    }
198
199    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
200        let tag = reader.read_u8()?;
201        match tag {
202            0 => {
203                let len = reader.read_u16::<LittleEndian>()? as usize;
204                let mut arr = Vec::with_capacity(len);
205                for _ in 0..len {
206                    arr.push(reader.read_u16::<LittleEndian>()?);
207                }
208                Ok(Container::Array(arr))
209            }
210            1 => {
211                let mut bm = Box::new([0u64; 1024]);
212                for word in bm.iter_mut() {
213                    *word = reader.read_u64::<LittleEndian>()?;
214                }
215                Ok(Container::Bitmap(bm))
216            }
217            2 => {
218                let len = reader.read_u16::<LittleEndian>()? as usize;
219                let mut runs = Vec::with_capacity(len);
220                for _ in 0..len {
221                    let start = reader.read_u16::<LittleEndian>()?;
222                    let run_len = reader.read_u16::<LittleEndian>()?;
223                    runs.push((start, run_len));
224                }
225                Ok(Container::Runs(runs))
226            }
227            _ => Err(io::Error::new(
228                io::ErrorKind::InvalidData,
229                "Invalid container type",
230            )),
231        }
232    }
233
234    fn size_bytes(&self) -> usize {
235        match self {
236            Container::Array(arr) => arr.len() * 2 + 4,
237            Container::Bitmap(_) => 8 * 1024 + 1,
238            Container::Runs(runs) => runs.len() * 4 + 4,
239        }
240    }
241}
242
243/// Roaring Bitmap for compressed integer sets
244#[derive(Debug, Clone)]
245pub struct RoaringBitmap {
246    /// High 16 bits -> container mapping
247    containers: Vec<(u16, Container)>,
248}
249
250impl RoaringBitmap {
251    /// Create empty bitmap
252    pub fn new() -> Self {
253        Self {
254            containers: Vec::new(),
255        }
256    }
257
258    /// Create from sorted slice
259    pub fn from_sorted_slice(values: &[u32]) -> Self {
260        let mut bitmap = Self::new();
261
262        if values.is_empty() {
263            return bitmap;
264        }
265
266        let mut current_high = (values[0] >> 16) as u16;
267        let mut current_container = Container::new_array();
268
269        for &val in values {
270            let high = (val >> 16) as u16;
271            let low = val as u16;
272
273            if high != current_high {
274                current_container.optimize();
275                bitmap.containers.push((current_high, current_container));
276                current_high = high;
277                current_container = Container::new_array();
278            }
279
280            current_container.insert(low);
281        }
282
283        current_container.optimize();
284        bitmap.containers.push((current_high, current_container));
285
286        bitmap
287    }
288
289    /// Insert a value
290    pub fn insert(&mut self, val: u32) -> bool {
291        let high = (val >> 16) as u16;
292        let low = val as u16;
293
294        // Find or create container
295        match self.containers.binary_search_by_key(&high, |&(h, _)| h) {
296            Ok(idx) => self.containers[idx].1.insert(low),
297            Err(idx) => {
298                let mut container = Container::new_array();
299                container.insert(low);
300                self.containers.insert(idx, (high, container));
301                true
302            }
303        }
304    }
305
306    /// Check if value exists
307    pub fn contains(&self, val: u32) -> bool {
308        let high = (val >> 16) as u16;
309        let low = val as u16;
310
311        match self.containers.binary_search_by_key(&high, |&(h, _)| h) {
312            Ok(idx) => self.containers[idx].1.contains(low),
313            Err(_) => false,
314        }
315    }
316
317    /// Number of elements
318    pub fn cardinality(&self) -> u32 {
319        self.containers.iter().map(|(_, c)| c.cardinality()).sum()
320    }
321
322    /// Check if empty
323    pub fn is_empty(&self) -> bool {
324        self.containers.is_empty()
325    }
326
327    /// Optimize all containers
328    pub fn optimize(&mut self) {
329        for (_, container) in &mut self.containers {
330            container.optimize();
331        }
332    }
333
334    /// Intersection with another bitmap
335    pub fn and(&self, other: &RoaringBitmap) -> RoaringBitmap {
336        let mut result = RoaringBitmap::new();
337
338        let mut i = 0;
339        let mut j = 0;
340
341        while i < self.containers.len() && j < other.containers.len() {
342            let (high1, c1) = &self.containers[i];
343            let (high2, c2) = &other.containers[j];
344
345            match high1.cmp(high2) {
346                std::cmp::Ordering::Less => i += 1,
347                std::cmp::Ordering::Greater => j += 1,
348                std::cmp::Ordering::Equal => {
349                    let intersected = Self::intersect_containers(c1, c2);
350                    if intersected.cardinality() > 0 {
351                        result.containers.push((*high1, intersected));
352                    }
353                    i += 1;
354                    j += 1;
355                }
356            }
357        }
358
359        result
360    }
361
362    fn intersect_containers(c1: &Container, c2: &Container) -> Container {
363        match (c1, c2) {
364            (Container::Array(a1), Container::Array(a2)) => {
365                let mut result = Vec::new();
366                let mut i = 0;
367                let mut j = 0;
368                while i < a1.len() && j < a2.len() {
369                    match a1[i].cmp(&a2[j]) {
370                        std::cmp::Ordering::Less => i += 1,
371                        std::cmp::Ordering::Greater => j += 1,
372                        std::cmp::Ordering::Equal => {
373                            result.push(a1[i]);
374                            i += 1;
375                            j += 1;
376                        }
377                    }
378                }
379                Container::Array(result)
380            }
381            (Container::Bitmap(b1), Container::Bitmap(b2)) => {
382                let mut result = Box::new([0u64; 1024]);
383                for i in 0..1024 {
384                    result[i] = b1[i] & b2[i];
385                }
386                let mut c = Container::Bitmap(result);
387                c.optimize();
388                c
389            }
390            (Container::Array(arr), Container::Bitmap(bm))
391            | (Container::Bitmap(bm), Container::Array(arr)) => {
392                let mut result = Vec::new();
393                for &val in arr {
394                    let word_idx = (val / 64) as usize;
395                    let bit_idx = val % 64;
396                    if (bm[word_idx] >> bit_idx) & 1 == 1 {
397                        result.push(val);
398                    }
399                }
400                Container::Array(result)
401            }
402            _ => {
403                // For runs, convert to array first
404                let arr1 = Self::container_to_array(c1);
405                let arr2 = Self::container_to_array(c2);
406                Self::intersect_containers(&Container::Array(arr1), &Container::Array(arr2))
407            }
408        }
409    }
410
411    fn container_to_array(c: &Container) -> Vec<u16> {
412        match c {
413            Container::Array(arr) => arr.clone(),
414            Container::Bitmap(bm) => {
415                let mut arr = Vec::new();
416                for (word_idx, &word) in bm.iter().enumerate() {
417                    let mut w = word;
418                    while w != 0 {
419                        let bit_idx = w.trailing_zeros();
420                        arr.push((word_idx * 64 + bit_idx as usize) as u16);
421                        w &= w - 1;
422                    }
423                }
424                arr
425            }
426            Container::Runs(runs) => {
427                let mut arr = Vec::new();
428                for &(start, len) in runs {
429                    for i in 0..=len {
430                        arr.push(start + i);
431                    }
432                }
433                arr
434            }
435        }
436    }
437
438    /// Union with another bitmap
439    pub fn or(&self, other: &RoaringBitmap) -> RoaringBitmap {
440        let mut result = RoaringBitmap::new();
441
442        let mut i = 0;
443        let mut j = 0;
444
445        while i < self.containers.len() || j < other.containers.len() {
446            if i >= self.containers.len() {
447                result.containers.push(other.containers[j].clone());
448                j += 1;
449            } else if j >= other.containers.len() {
450                result.containers.push(self.containers[i].clone());
451                i += 1;
452            } else {
453                let (high1, c1) = &self.containers[i];
454                let (high2, c2) = &other.containers[j];
455
456                match high1.cmp(high2) {
457                    std::cmp::Ordering::Less => {
458                        result.containers.push(self.containers[i].clone());
459                        i += 1;
460                    }
461                    std::cmp::Ordering::Greater => {
462                        result.containers.push(other.containers[j].clone());
463                        j += 1;
464                    }
465                    std::cmp::Ordering::Equal => {
466                        let united = Self::union_containers(c1, c2);
467                        result.containers.push((*high1, united));
468                        i += 1;
469                        j += 1;
470                    }
471                }
472            }
473        }
474
475        result
476    }
477
478    fn union_containers(c1: &Container, c2: &Container) -> Container {
479        match (c1, c2) {
480            (Container::Bitmap(b1), Container::Bitmap(b2)) => {
481                let mut result = Box::new([0u64; 1024]);
482                for i in 0..1024 {
483                    result[i] = b1[i] | b2[i];
484                }
485                Container::Bitmap(result)
486            }
487            _ => {
488                // Convert both to arrays and merge
489                let arr1 = Self::container_to_array(c1);
490                let arr2 = Self::container_to_array(c2);
491
492                let mut result = Vec::with_capacity(arr1.len() + arr2.len());
493                let mut i = 0;
494                let mut j = 0;
495
496                while i < arr1.len() && j < arr2.len() {
497                    match arr1[i].cmp(&arr2[j]) {
498                        std::cmp::Ordering::Less => {
499                            result.push(arr1[i]);
500                            i += 1;
501                        }
502                        std::cmp::Ordering::Greater => {
503                            result.push(arr2[j]);
504                            j += 1;
505                        }
506                        std::cmp::Ordering::Equal => {
507                            result.push(arr1[i]);
508                            i += 1;
509                            j += 1;
510                        }
511                    }
512                }
513
514                result.extend_from_slice(&arr1[i..]);
515                result.extend_from_slice(&arr2[j..]);
516
517                let mut c = Container::Array(result);
518                c.optimize();
519                c
520            }
521        }
522    }
523
524    /// Serialize
525    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
526        writer.write_u32::<LittleEndian>(self.containers.len() as u32)?;
527        for (high, container) in &self.containers {
528            writer.write_u16::<LittleEndian>(*high)?;
529            container.serialize(writer)?;
530        }
531        Ok(())
532    }
533
534    /// Deserialize
535    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
536        let num_containers = reader.read_u32::<LittleEndian>()? as usize;
537        let mut containers = Vec::with_capacity(num_containers);
538
539        for _ in 0..num_containers {
540            let high = reader.read_u16::<LittleEndian>()?;
541            let container = Container::deserialize(reader)?;
542            containers.push((high, container));
543        }
544
545        Ok(Self { containers })
546    }
547
548    /// Approximate size in bytes
549    pub fn size_bytes(&self) -> usize {
550        4 + self
551            .containers
552            .iter()
553            .map(|(_, c)| 2 + c.size_bytes())
554            .sum::<usize>()
555    }
556
557    /// Create an iterator
558    pub fn iter(&self) -> RoaringIterator<'_> {
559        RoaringIterator {
560            bitmap: self,
561            container_idx: 0,
562            value_idx: 0,
563            current_values: Vec::new(),
564        }
565    }
566}
567
568impl Default for RoaringBitmap {
569    fn default() -> Self {
570        Self::new()
571    }
572}
573
574/// Iterator over Roaring Bitmap
575pub struct RoaringIterator<'a> {
576    bitmap: &'a RoaringBitmap,
577    container_idx: usize,
578    value_idx: usize,
579    current_values: Vec<u16>,
580}
581
582impl<'a> Iterator for RoaringIterator<'a> {
583    type Item = u32;
584
585    fn next(&mut self) -> Option<Self::Item> {
586        loop {
587            if self.value_idx < self.current_values.len() {
588                let high = self.bitmap.containers[self.container_idx - 1].0 as u32;
589                let low = self.current_values[self.value_idx] as u32;
590                self.value_idx += 1;
591                return Some((high << 16) | low);
592            }
593
594            if self.container_idx >= self.bitmap.containers.len() {
595                return None;
596            }
597
598            let (_, container) = &self.bitmap.containers[self.container_idx];
599            self.current_values = RoaringBitmap::container_to_array(container);
600            self.value_idx = 0;
601            self.container_idx += 1;
602        }
603    }
604}
605
606/// Block size for Roaring BlockMax (matches container size = 65536 doc_ids)
607pub const ROARING_BLOCK_SIZE: usize = 65536;
608
609/// Block metadata for BlockMax WAND optimization in Roaring
610#[derive(Debug, Clone)]
611pub struct RoaringBlockInfo {
612    /// High 16 bits (container key)
613    pub container_key: u16,
614    /// First doc_id in this container
615    pub first_doc_id: u32,
616    /// Last doc_id in this container
617    pub last_doc_id: u32,
618    /// Maximum term frequency in this block
619    pub max_tf: u32,
620    /// Upper bound BM25 score for this block
621    pub max_block_score: f32,
622    /// Number of documents in this container
623    pub num_docs: u32,
624}
625
626impl RoaringBlockInfo {
627    /// Serialize block info
628    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
629        writer.write_u16::<LittleEndian>(self.container_key)?;
630        writer.write_u32::<LittleEndian>(self.first_doc_id)?;
631        writer.write_u32::<LittleEndian>(self.last_doc_id)?;
632        writer.write_u32::<LittleEndian>(self.max_tf)?;
633        writer.write_f32::<LittleEndian>(self.max_block_score)?;
634        writer.write_u32::<LittleEndian>(self.num_docs)?;
635        Ok(())
636    }
637
638    /// Deserialize block info
639    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
640        Ok(Self {
641            container_key: reader.read_u16::<LittleEndian>()?,
642            first_doc_id: reader.read_u32::<LittleEndian>()?,
643            last_doc_id: reader.read_u32::<LittleEndian>()?,
644            max_tf: reader.read_u32::<LittleEndian>()?,
645            max_block_score: reader.read_f32::<LittleEndian>()?,
646            num_docs: reader.read_u32::<LittleEndian>()?,
647        })
648    }
649}
650
651/// Roaring bitmap with term frequencies for posting lists
652#[derive(Debug, Clone)]
653pub struct RoaringPostingList {
654    /// Document IDs as roaring bitmap
655    pub doc_ids: RoaringBitmap,
656    /// Term frequencies (sparse map for non-1 frequencies)
657    /// Most terms have tf=1, so we only store exceptions
658    pub term_freqs: Vec<(u32, u32)>,
659    /// Default term frequency (usually 1)
660    pub default_tf: u32,
661    /// Maximum term frequency
662    pub max_tf: u32,
663    /// Block metadata for BlockMax WAND (one per container)
664    pub blocks: Vec<RoaringBlockInfo>,
665    /// Global maximum score across all blocks
666    pub max_score: f32,
667}
668
669impl RoaringPostingList {
670    /// BM25 parameters for block-max score calculation
671    const K1: f32 = 1.2;
672    const B: f32 = 0.75;
673
674    /// Compute BM25 upper bound score for a given max_tf and IDF
675    #[inline]
676    pub fn compute_bm25_upper_bound(max_tf: u32, idf: f32) -> f32 {
677        let tf = max_tf as f32;
678        // Conservative upper bound: assume dl=0, so length_norm = 1 - b = 0.25
679        let min_length_norm = 1.0 - Self::B;
680        let tf_norm = (tf * (Self::K1 + 1.0)) / (tf + Self::K1 * min_length_norm);
681        idf * tf_norm
682    }
683
684    /// Create from doc_ids and term frequencies (without IDF - for backwards compatibility)
685    pub fn from_postings(doc_ids: &[u32], term_freqs: &[u32]) -> Self {
686        Self::from_postings_with_idf(doc_ids, term_freqs, 1.0)
687    }
688
689    /// Create from doc_ids and term frequencies with IDF for block-max scores
690    pub fn from_postings_with_idf(doc_ids: &[u32], term_freqs: &[u32], idf: f32) -> Self {
691        assert_eq!(doc_ids.len(), term_freqs.len());
692
693        if doc_ids.is_empty() {
694            return Self {
695                doc_ids: RoaringBitmap::new(),
696                term_freqs: Vec::new(),
697                default_tf: 1,
698                max_tf: 0,
699                blocks: Vec::new(),
700                max_score: 0.0,
701            };
702        }
703
704        let bitmap = RoaringBitmap::from_sorted_slice(doc_ids);
705
706        // Find most common TF (usually 1)
707        let mut tf_counts = std::collections::HashMap::new();
708        for &tf in term_freqs {
709            *tf_counts.entry(tf).or_insert(0u32) += 1;
710        }
711        let default_tf = tf_counts
712            .iter()
713            .max_by_key(|&(_, count)| count)
714            .map(|(&tf, _)| tf)
715            .unwrap_or(1);
716
717        // Store only non-default TFs
718        let exceptions: Vec<(u32, u32)> = doc_ids
719            .iter()
720            .zip(term_freqs.iter())
721            .filter(|&(_, &tf)| tf != default_tf)
722            .map(|(&doc, &tf)| (doc, tf))
723            .collect();
724
725        let max_tf = *term_freqs.iter().max().unwrap_or(&1);
726
727        // Build block metadata (one block per container)
728        // Group doc_ids by container (high 16 bits)
729        let mut blocks = Vec::new();
730        let mut max_score = 0.0f32;
731        let mut i = 0;
732
733        while i < doc_ids.len() {
734            let container_key = (doc_ids[i] >> 16) as u16;
735            let block_start = i;
736
737            // Find all docs in this container
738            while i < doc_ids.len() && (doc_ids[i] >> 16) as u16 == container_key {
739                i += 1;
740            }
741
742            let block_doc_ids = &doc_ids[block_start..i];
743            let block_tfs = &term_freqs[block_start..i];
744            let block_max_tf = *block_tfs.iter().max().unwrap_or(&1);
745            let block_score = Self::compute_bm25_upper_bound(block_max_tf, idf);
746            max_score = max_score.max(block_score);
747
748            blocks.push(RoaringBlockInfo {
749                container_key,
750                first_doc_id: block_doc_ids[0],
751                last_doc_id: *block_doc_ids.last().unwrap(),
752                max_tf: block_max_tf,
753                max_block_score: block_score,
754                num_docs: block_doc_ids.len() as u32,
755            });
756        }
757
758        Self {
759            doc_ids: bitmap,
760            term_freqs: exceptions,
761            default_tf,
762            max_tf,
763            blocks,
764            max_score,
765        }
766    }
767
768    /// Get term frequency for a document
769    pub fn get_tf(&self, doc_id: u32) -> u32 {
770        // Binary search in exceptions
771        match self.term_freqs.binary_search_by_key(&doc_id, |&(d, _)| d) {
772            Ok(idx) => self.term_freqs[idx].1,
773            Err(_) => self.default_tf,
774        }
775    }
776
777    /// Number of documents
778    pub fn len(&self) -> u32 {
779        self.doc_ids.cardinality()
780    }
781
782    /// Check if empty
783    pub fn is_empty(&self) -> bool {
784        self.doc_ids.is_empty()
785    }
786
787    /// Get number of blocks (containers)
788    pub fn num_blocks(&self) -> usize {
789        self.blocks.len()
790    }
791
792    /// Get block index for a doc_id
793    pub fn block_for_doc(&self, doc_id: u32) -> Option<usize> {
794        let container_key = (doc_id >> 16) as u16;
795        self.blocks
796            .binary_search_by_key(&container_key, |b| b.container_key)
797            .ok()
798    }
799
800    /// Serialize
801    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
802        self.doc_ids.serialize(writer)?;
803        writer.write_u32::<LittleEndian>(self.default_tf)?;
804        writer.write_u32::<LittleEndian>(self.max_tf)?;
805        writer.write_f32::<LittleEndian>(self.max_score)?;
806        writer.write_u32::<LittleEndian>(self.term_freqs.len() as u32)?;
807        for &(doc, tf) in &self.term_freqs {
808            writer.write_u32::<LittleEndian>(doc)?;
809            writer.write_u32::<LittleEndian>(tf)?;
810        }
811
812        // Write block metadata
813        writer.write_u32::<LittleEndian>(self.blocks.len() as u32)?;
814        for block in &self.blocks {
815            block.serialize(writer)?;
816        }
817
818        Ok(())
819    }
820
821    /// Deserialize
822    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
823        let doc_ids = RoaringBitmap::deserialize(reader)?;
824        let default_tf = reader.read_u32::<LittleEndian>()?;
825        let max_tf = reader.read_u32::<LittleEndian>()?;
826        let max_score = reader.read_f32::<LittleEndian>()?;
827        let num_exceptions = reader.read_u32::<LittleEndian>()? as usize;
828        let mut term_freqs = Vec::with_capacity(num_exceptions);
829        for _ in 0..num_exceptions {
830            let doc = reader.read_u32::<LittleEndian>()?;
831            let tf = reader.read_u32::<LittleEndian>()?;
832            term_freqs.push((doc, tf));
833        }
834
835        // Read block metadata
836        let num_blocks = reader.read_u32::<LittleEndian>()? as usize;
837        let mut blocks = Vec::with_capacity(num_blocks);
838        for _ in 0..num_blocks {
839            blocks.push(RoaringBlockInfo::deserialize(reader)?);
840        }
841
842        Ok(Self {
843            doc_ids,
844            term_freqs,
845            default_tf,
846            max_tf,
847            blocks,
848            max_score,
849        })
850    }
851
852    /// Create iterator
853    pub fn iterator(&self) -> RoaringPostingIterator<'_> {
854        RoaringPostingIterator {
855            list: self,
856            doc_iter: self.doc_ids.iter(),
857            current_doc: None,
858            current_block: 0,
859        }
860    }
861}
862
863/// Iterator over Roaring posting list with BlockMax support
864pub struct RoaringPostingIterator<'a> {
865    list: &'a RoaringPostingList,
866    doc_iter: RoaringIterator<'a>,
867    current_doc: Option<u32>,
868    current_block: usize,
869}
870
871impl<'a> RoaringPostingIterator<'a> {
872    /// Current document ID
873    pub fn doc(&self) -> u32 {
874        self.current_doc.unwrap_or(u32::MAX)
875    }
876
877    /// Current term frequency
878    pub fn term_freq(&self) -> u32 {
879        match self.current_doc {
880            Some(doc) => self.list.get_tf(doc),
881            None => 0,
882        }
883    }
884
885    /// Advance to next document
886    pub fn advance(&mut self) -> u32 {
887        self.current_doc = self.doc_iter.next();
888        // Update current block if needed
889        if let Some(doc) = self.current_doc
890            && !self.list.blocks.is_empty()
891        {
892            let container_key = (doc >> 16) as u16;
893            // Move to next block if we've passed current one
894            while self.current_block < self.list.blocks.len()
895                && self.list.blocks[self.current_block].container_key < container_key
896            {
897                self.current_block += 1;
898            }
899        }
900        self.doc()
901    }
902
903    /// Initialize (must be called before first use)
904    pub fn init(&mut self) {
905        self.current_doc = self.doc_iter.next();
906        self.current_block = 0;
907    }
908
909    /// Seek to first doc >= target
910    pub fn seek(&mut self, target: u32) -> u32 {
911        // Use block skip list for faster seeking
912        if !self.list.blocks.is_empty() {
913            let target_container = (target >> 16) as u16;
914
915            // Skip blocks that are entirely before target
916            while self.current_block < self.list.blocks.len()
917                && self.list.blocks[self.current_block].last_doc_id < target
918            {
919                self.current_block += 1;
920            }
921
922            // If we've exhausted all blocks
923            if self.current_block >= self.list.blocks.len() {
924                self.current_doc = None;
925                return u32::MAX;
926            }
927
928            // Skip docs until we reach the target block's first doc
929            let block = &self.list.blocks[self.current_block];
930            if block.container_key > target_container
931                || (block.container_key == target_container && block.first_doc_id > self.doc())
932            {
933                // Need to advance iterator to this block
934                while let Some(doc) = self.current_doc {
935                    if doc >= block.first_doc_id {
936                        break;
937                    }
938                    self.current_doc = self.doc_iter.next();
939                }
940            }
941        }
942
943        // Linear scan within block
944        while let Some(doc) = self.current_doc {
945            if doc >= target {
946                return doc;
947            }
948            self.current_doc = self.doc_iter.next();
949        }
950        u32::MAX
951    }
952
953    /// Check if iterator is exhausted
954    pub fn is_exhausted(&self) -> bool {
955        self.current_doc.is_none()
956    }
957
958    /// Get current block's max score (for BlockMax WAND)
959    pub fn current_block_max_score(&self) -> f32 {
960        if self.current_doc.is_none() || self.list.blocks.is_empty() {
961            return 0.0;
962        }
963        if self.current_block < self.list.blocks.len() {
964            self.list.blocks[self.current_block].max_block_score
965        } else {
966            0.0
967        }
968    }
969
970    /// Get current block's max term frequency
971    pub fn current_block_max_tf(&self) -> u32 {
972        if self.current_doc.is_none() || self.list.blocks.is_empty() {
973            return 0;
974        }
975        if self.current_block < self.list.blocks.len() {
976            self.list.blocks[self.current_block].max_tf
977        } else {
978            0
979        }
980    }
981
982    /// Get max score for remaining blocks (for MaxScore optimization)
983    pub fn max_remaining_score(&self) -> f32 {
984        if self.current_doc.is_none() || self.list.blocks.is_empty() {
985            return 0.0;
986        }
987        self.list.blocks[self.current_block..]
988            .iter()
989            .map(|b| b.max_block_score)
990            .fold(0.0f32, |a, b| a.max(b))
991    }
992
993    /// Skip to next block containing doc >= target (for BlockWAND)
994    /// Returns (first_doc_in_block, block_max_score) or None if exhausted
995    pub fn skip_to_block_with_doc(&mut self, target: u32) -> Option<(u32, f32)> {
996        if self.list.blocks.is_empty() {
997            return None;
998        }
999
1000        while self.current_block < self.list.blocks.len() {
1001            let block = &self.list.blocks[self.current_block];
1002            if block.last_doc_id >= target {
1003                // Advance iterator to this block's first doc
1004                while let Some(doc) = self.current_doc {
1005                    if doc >= block.first_doc_id {
1006                        break;
1007                    }
1008                    self.current_doc = self.doc_iter.next();
1009                }
1010                return Some((block.first_doc_id, block.max_block_score));
1011            }
1012            self.current_block += 1;
1013        }
1014
1015        self.current_doc = None;
1016        None
1017    }
1018}
1019
1020#[cfg(test)]
1021mod tests {
1022    use super::*;
1023
1024    #[test]
1025    fn test_roaring_basic() {
1026        let mut bitmap = RoaringBitmap::new();
1027        bitmap.insert(1);
1028        bitmap.insert(100);
1029        bitmap.insert(1000);
1030        bitmap.insert(100000);
1031
1032        assert!(bitmap.contains(1));
1033        assert!(bitmap.contains(100));
1034        assert!(bitmap.contains(1000));
1035        assert!(bitmap.contains(100000));
1036        assert!(!bitmap.contains(2));
1037        assert!(!bitmap.contains(50000));
1038
1039        assert_eq!(bitmap.cardinality(), 4);
1040    }
1041
1042    #[test]
1043    fn test_roaring_from_sorted() {
1044        let values: Vec<u32> = (0..10000).map(|i| i * 3).collect();
1045        let bitmap = RoaringBitmap::from_sorted_slice(&values);
1046
1047        assert_eq!(bitmap.cardinality(), 10000);
1048
1049        for &val in &values {
1050            assert!(bitmap.contains(val), "Missing value {}", val);
1051        }
1052    }
1053
1054    #[test]
1055    fn test_roaring_intersection() {
1056        let a = RoaringBitmap::from_sorted_slice(&[1, 2, 3, 100, 200, 300]);
1057        let b = RoaringBitmap::from_sorted_slice(&[2, 3, 4, 200, 300, 400]);
1058
1059        let result = a.and(&b);
1060
1061        assert_eq!(result.cardinality(), 4);
1062        assert!(result.contains(2));
1063        assert!(result.contains(3));
1064        assert!(result.contains(200));
1065        assert!(result.contains(300));
1066    }
1067
1068    #[test]
1069    fn test_roaring_union() {
1070        let a = RoaringBitmap::from_sorted_slice(&[1, 2, 3]);
1071        let b = RoaringBitmap::from_sorted_slice(&[3, 4, 5]);
1072
1073        let result = a.or(&b);
1074
1075        assert_eq!(result.cardinality(), 5);
1076        for i in 1..=5 {
1077            assert!(result.contains(i));
1078        }
1079    }
1080
1081    #[test]
1082    fn test_roaring_serialization() {
1083        let values: Vec<u32> = (0..1000).map(|i| i * 7).collect();
1084        let bitmap = RoaringBitmap::from_sorted_slice(&values);
1085
1086        let mut buffer = Vec::new();
1087        bitmap.serialize(&mut buffer).unwrap();
1088
1089        let restored = RoaringBitmap::deserialize(&mut &buffer[..]).unwrap();
1090
1091        assert_eq!(restored.cardinality(), bitmap.cardinality());
1092        for &val in &values {
1093            assert!(restored.contains(val));
1094        }
1095    }
1096
1097    #[test]
1098    fn test_roaring_posting_list() {
1099        let doc_ids: Vec<u32> = vec![1, 5, 10, 50, 100, 500, 1000];
1100        let term_freqs: Vec<u32> = vec![1, 1, 2, 1, 3, 1, 1];
1101
1102        let list = RoaringPostingList::from_postings(&doc_ids, &term_freqs);
1103
1104        assert_eq!(list.len(), 7);
1105        assert_eq!(list.default_tf, 1);
1106        assert_eq!(list.max_tf, 3);
1107
1108        // Check TF lookups
1109        assert_eq!(list.get_tf(1), 1);
1110        assert_eq!(list.get_tf(10), 2);
1111        assert_eq!(list.get_tf(100), 3);
1112    }
1113
1114    #[test]
1115    fn test_roaring_iterator() {
1116        let values: Vec<u32> = vec![1, 10, 100, 1000, 10000];
1117        let bitmap = RoaringBitmap::from_sorted_slice(&values);
1118
1119        let collected: Vec<u32> = bitmap.iter().collect();
1120        assert_eq!(collected, values);
1121    }
1122
1123    #[test]
1124    fn test_roaring_block_max() {
1125        // Create posting list spanning multiple containers (high 16 bits)
1126        // Container 0: doc_ids 0-65535
1127        // Container 1: doc_ids 65536-131071
1128        // Container 2: doc_ids 131072-196607
1129        let mut doc_ids = Vec::new();
1130        let mut term_freqs = Vec::new();
1131
1132        // Container 0: 100 docs with max_tf = 2
1133        for i in 0..100 {
1134            doc_ids.push(i * 100);
1135            term_freqs.push(if i == 50 { 2 } else { 1 });
1136        }
1137
1138        // Container 1: 100 docs with max_tf = 5
1139        for i in 0..100 {
1140            doc_ids.push(65536 + i * 100);
1141            term_freqs.push(if i == 25 { 5 } else { 1 });
1142        }
1143
1144        // Container 2: 100 docs with max_tf = 3
1145        for i in 0..100 {
1146            doc_ids.push(131072 + i * 100);
1147            term_freqs.push(if i == 75 { 3 } else { 1 });
1148        }
1149
1150        let list = RoaringPostingList::from_postings_with_idf(&doc_ids, &term_freqs, 2.0);
1151
1152        // Should have 3 blocks (one per container)
1153        assert_eq!(list.num_blocks(), 3);
1154        assert_eq!(list.blocks[0].container_key, 0);
1155        assert_eq!(list.blocks[1].container_key, 1);
1156        assert_eq!(list.blocks[2].container_key, 2);
1157
1158        assert_eq!(list.blocks[0].max_tf, 2);
1159        assert_eq!(list.blocks[1].max_tf, 5);
1160        assert_eq!(list.blocks[2].max_tf, 3);
1161
1162        // Block 1 should have highest score (max_tf = 5)
1163        assert!(list.blocks[1].max_block_score > list.blocks[0].max_block_score);
1164        assert!(list.blocks[1].max_block_score > list.blocks[2].max_block_score);
1165
1166        // Global max_score should equal block 1's score
1167        assert_eq!(list.max_score, list.blocks[1].max_block_score);
1168
1169        // Test iterator block-max methods
1170        let mut iter = list.iterator();
1171        iter.init();
1172        assert_eq!(iter.current_block_max_tf(), 2); // Block 0
1173
1174        // Seek to block 1
1175        iter.seek(65536);
1176        assert_eq!(iter.current_block_max_tf(), 5);
1177
1178        // Seek to block 2
1179        iter.seek(131072);
1180        assert_eq!(iter.current_block_max_tf(), 3);
1181    }
1182
1183    #[test]
1184    fn test_roaring_block_max_serialization() {
1185        let mut doc_ids = Vec::new();
1186        let mut term_freqs = Vec::new();
1187
1188        // Two containers
1189        for i in 0..50 {
1190            doc_ids.push(i * 10);
1191            term_freqs.push((i % 5) + 1);
1192        }
1193        for i in 0..50 {
1194            doc_ids.push(65536 + i * 10);
1195            term_freqs.push((i % 3) + 1);
1196        }
1197
1198        let list = RoaringPostingList::from_postings_with_idf(&doc_ids, &term_freqs, 1.5);
1199
1200        let mut buffer = Vec::new();
1201        list.serialize(&mut buffer).unwrap();
1202
1203        let restored = RoaringPostingList::deserialize(&mut &buffer[..]).unwrap();
1204
1205        assert_eq!(restored.len(), list.len());
1206        assert_eq!(restored.max_tf, list.max_tf);
1207        assert_eq!(restored.max_score, list.max_score);
1208        assert_eq!(restored.num_blocks(), list.num_blocks());
1209
1210        // Verify block metadata
1211        for (orig, rest) in list.blocks.iter().zip(restored.blocks.iter()) {
1212            assert_eq!(orig.container_key, rest.container_key);
1213            assert_eq!(orig.first_doc_id, rest.first_doc_id);
1214            assert_eq!(orig.last_doc_id, rest.last_doc_id);
1215            assert_eq!(orig.max_tf, rest.max_tf);
1216            assert_eq!(orig.max_block_score, rest.max_block_score);
1217        }
1218
1219        // Verify iteration produces same results
1220        let mut iter1 = list.iterator();
1221        let mut iter2 = restored.iterator();
1222        iter1.init();
1223        iter2.init();
1224
1225        while !iter1.is_exhausted() {
1226            assert_eq!(iter1.doc(), iter2.doc());
1227            assert_eq!(iter1.term_freq(), iter2.term_freq());
1228            iter1.advance();
1229            iter2.advance();
1230        }
1231        assert!(iter2.is_exhausted());
1232    }
1233}