hermes_core/structures/
roaring.rs

1//! Roaring Bitmap implementation for compressed integer sets
2//!
3//! Roaring bitmaps use a hybrid approach:
4//! - Sparse containers: sorted arrays for low-density chunks
5//! - Dense containers: bitmaps for high-density chunks
6//! - Run containers: RLE for consecutive ranges
7//!
8//! This provides excellent compression and fast set operations.
9//! Used by Apache Lucene, Spark, Druid, and many databases.
10
11use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
12use std::io::{self, Read, Write};
13
14/// Threshold for switching from array to bitmap container
15/// If a container has more than this many elements, use bitmap
16const ARRAY_TO_BITMAP_THRESHOLD: usize = 4096;
17
18/// Container types for different density patterns
19#[derive(Debug, Clone)]
20enum Container {
21    /// Sorted array of 16-bit values (for sparse containers)
22    Array(Vec<u16>),
23    /// Bitmap of 2^16 bits (for dense containers)
24    Bitmap(Box<[u64; 1024]>),
25    /// Run-length encoded (start, length) pairs
26    Runs(Vec<(u16, u16)>),
27}
28
29impl Container {
30    fn new_array() -> Self {
31        Container::Array(Vec::new())
32    }
33
34    #[allow(dead_code)]
35    fn new_bitmap() -> Self {
36        Container::Bitmap(Box::new([0u64; 1024]))
37    }
38
39    fn cardinality(&self) -> u32 {
40        match self {
41            Container::Array(arr) => arr.len() as u32,
42            Container::Bitmap(bm) => Self::bitmap_cardinality(bm),
43            Container::Runs(runs) => runs.iter().map(|(_, len)| *len as u32 + 1).sum(),
44        }
45    }
46
47    /// SIMD-accelerated bitmap cardinality (NEON on aarch64, POPCNT on x86_64)
48    #[inline]
49    fn bitmap_cardinality(bm: &[u64; 1024]) -> u32 {
50        #[cfg(target_arch = "aarch64")]
51        {
52            use std::arch::aarch64::*;
53
54            let mut total = 0u32;
55            let bytes = bm.as_ptr() as *const u8;
56
57            // Process 16 bytes (2 u64 words) at a time using NEON vcntq_u8
58            unsafe {
59                for i in (0..8192).step_by(16) {
60                    let v = vld1q_u8(bytes.add(i));
61                    let cnt = vcntq_u8(v);
62                    total += vaddlvq_u8(cnt) as u32;
63                }
64            }
65
66            total
67        }
68
69        #[cfg(target_arch = "x86_64")]
70        {
71            // Use POPCNT instruction on x86_64 (available on all modern CPUs)
72            let mut total = 0u32;
73            for &word in bm.iter() {
74                total += word.count_ones();
75            }
76            total
77        }
78
79        #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
80        {
81            bm.iter().map(|w| w.count_ones()).sum()
82        }
83    }
84
85    fn contains(&self, val: u16) -> bool {
86        match self {
87            Container::Array(arr) => arr.binary_search(&val).is_ok(),
88            Container::Bitmap(bm) => {
89                let word_idx = (val / 64) as usize;
90                let bit_idx = val % 64;
91                (bm[word_idx] >> bit_idx) & 1 == 1
92            }
93            Container::Runs(runs) => {
94                for &(start, len) in runs {
95                    if val >= start && val <= start + len {
96                        return true;
97                    }
98                    if val < start {
99                        return false;
100                    }
101                }
102                false
103            }
104        }
105    }
106
107    fn insert(&mut self, val: u16) -> bool {
108        match self {
109            Container::Array(arr) => {
110                match arr.binary_search(&val) {
111                    Ok(_) => false, // Already exists
112                    Err(pos) => {
113                        arr.insert(pos, val);
114                        true
115                    }
116                }
117            }
118            Container::Bitmap(bm) => {
119                let word_idx = (val / 64) as usize;
120                let bit_idx = val % 64;
121                let old = bm[word_idx];
122                bm[word_idx] |= 1u64 << bit_idx;
123                old != bm[word_idx]
124            }
125            Container::Runs(runs) => {
126                // For simplicity, convert to array, insert, then optimize
127                let mut arr: Vec<u16> = Vec::new();
128                for &(start, len) in runs.iter() {
129                    for i in 0..=len {
130                        arr.push(start + i);
131                    }
132                }
133                let inserted = match arr.binary_search(&val) {
134                    Ok(_) => false,
135                    Err(pos) => {
136                        arr.insert(pos, val);
137                        true
138                    }
139                };
140                *self = Container::Array(arr);
141                inserted
142            }
143        }
144    }
145
146    fn optimize(&mut self) {
147        let card = self.cardinality() as usize;
148
149        match self {
150            Container::Array(arr) if card > ARRAY_TO_BITMAP_THRESHOLD => {
151                // Convert to bitmap
152                let mut bm = Box::new([0u64; 1024]);
153                for &val in arr.iter() {
154                    let word_idx = (val / 64) as usize;
155                    let bit_idx = val % 64;
156                    bm[word_idx] |= 1u64 << bit_idx;
157                }
158                *self = Container::Bitmap(bm);
159            }
160            Container::Bitmap(bm) if card <= ARRAY_TO_BITMAP_THRESHOLD => {
161                // Convert to array
162                let mut arr = Vec::with_capacity(card);
163                for (word_idx, &word) in bm.iter().enumerate() {
164                    let mut w = word;
165                    while w != 0 {
166                        let bit_idx = w.trailing_zeros();
167                        arr.push((word_idx * 64 + bit_idx as usize) as u16);
168                        w &= w - 1;
169                    }
170                }
171                *self = Container::Array(arr);
172            }
173            _ => {}
174        }
175
176        // Try run-length encoding if beneficial
177        self.try_run_encode();
178    }
179
180    fn try_run_encode(&mut self) {
181        let arr = match self {
182            Container::Array(arr) if arr.len() >= 4 => arr,
183            _ => return,
184        };
185
186        let mut runs = Vec::new();
187        let mut i = 0;
188
189        while i < arr.len() {
190            let start = arr[i];
191            let mut end = start;
192
193            while i + 1 < arr.len() && arr[i + 1] == end + 1 {
194                end = arr[i + 1];
195                i += 1;
196            }
197
198            runs.push((start, end - start));
199            i += 1;
200        }
201
202        // Only use runs if it saves space
203        // Array: 2 bytes per element
204        // Runs: 4 bytes per run
205        if runs.len() * 4 < arr.len() * 2 {
206            *self = Container::Runs(runs);
207        }
208    }
209
210    fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
211        match self {
212            Container::Array(arr) => {
213                writer.write_u8(0)?; // Type tag
214                writer.write_u16::<LittleEndian>(arr.len() as u16)?;
215                for &val in arr {
216                    writer.write_u16::<LittleEndian>(val)?;
217                }
218            }
219            Container::Bitmap(bm) => {
220                writer.write_u8(1)?; // Type tag
221                for &word in bm.iter() {
222                    writer.write_u64::<LittleEndian>(word)?;
223                }
224            }
225            Container::Runs(runs) => {
226                writer.write_u8(2)?; // Type tag
227                writer.write_u16::<LittleEndian>(runs.len() as u16)?;
228                for &(start, len) in runs {
229                    writer.write_u16::<LittleEndian>(start)?;
230                    writer.write_u16::<LittleEndian>(len)?;
231                }
232            }
233        }
234        Ok(())
235    }
236
237    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
238        let tag = reader.read_u8()?;
239        match tag {
240            0 => {
241                let len = reader.read_u16::<LittleEndian>()? as usize;
242                let mut arr = Vec::with_capacity(len);
243                for _ in 0..len {
244                    arr.push(reader.read_u16::<LittleEndian>()?);
245                }
246                Ok(Container::Array(arr))
247            }
248            1 => {
249                let mut bm = Box::new([0u64; 1024]);
250                for word in bm.iter_mut() {
251                    *word = reader.read_u64::<LittleEndian>()?;
252                }
253                Ok(Container::Bitmap(bm))
254            }
255            2 => {
256                let len = reader.read_u16::<LittleEndian>()? as usize;
257                let mut runs = Vec::with_capacity(len);
258                for _ in 0..len {
259                    let start = reader.read_u16::<LittleEndian>()?;
260                    let run_len = reader.read_u16::<LittleEndian>()?;
261                    runs.push((start, run_len));
262                }
263                Ok(Container::Runs(runs))
264            }
265            _ => Err(io::Error::new(
266                io::ErrorKind::InvalidData,
267                "Invalid container type",
268            )),
269        }
270    }
271
272    fn size_bytes(&self) -> usize {
273        match self {
274            Container::Array(arr) => arr.len() * 2 + 4,
275            Container::Bitmap(_) => 8 * 1024 + 1,
276            Container::Runs(runs) => runs.len() * 4 + 4,
277        }
278    }
279}
280
281/// Roaring Bitmap for compressed integer sets
282#[derive(Debug, Clone)]
283pub struct RoaringBitmap {
284    /// High 16 bits -> container mapping
285    containers: Vec<(u16, Container)>,
286}
287
288impl RoaringBitmap {
289    /// Create empty bitmap
290    pub fn new() -> Self {
291        Self {
292            containers: Vec::new(),
293        }
294    }
295
296    /// Create from sorted slice
297    pub fn from_sorted_slice(values: &[u32]) -> Self {
298        let mut bitmap = Self::new();
299
300        if values.is_empty() {
301            return bitmap;
302        }
303
304        let mut current_high = (values[0] >> 16) as u16;
305        let mut current_container = Container::new_array();
306
307        for &val in values {
308            let high = (val >> 16) as u16;
309            let low = val as u16;
310
311            if high != current_high {
312                current_container.optimize();
313                bitmap.containers.push((current_high, current_container));
314                current_high = high;
315                current_container = Container::new_array();
316            }
317
318            current_container.insert(low);
319        }
320
321        current_container.optimize();
322        bitmap.containers.push((current_high, current_container));
323
324        bitmap
325    }
326
327    /// Insert a value
328    pub fn insert(&mut self, val: u32) -> bool {
329        let high = (val >> 16) as u16;
330        let low = val as u16;
331
332        // Find or create container
333        match self.containers.binary_search_by_key(&high, |&(h, _)| h) {
334            Ok(idx) => self.containers[idx].1.insert(low),
335            Err(idx) => {
336                let mut container = Container::new_array();
337                container.insert(low);
338                self.containers.insert(idx, (high, container));
339                true
340            }
341        }
342    }
343
344    /// Check if value exists
345    pub fn contains(&self, val: u32) -> bool {
346        let high = (val >> 16) as u16;
347        let low = val as u16;
348
349        match self.containers.binary_search_by_key(&high, |&(h, _)| h) {
350            Ok(idx) => self.containers[idx].1.contains(low),
351            Err(_) => false,
352        }
353    }
354
355    /// Number of elements
356    pub fn cardinality(&self) -> u32 {
357        self.containers.iter().map(|(_, c)| c.cardinality()).sum()
358    }
359
360    /// Check if empty
361    pub fn is_empty(&self) -> bool {
362        self.containers.is_empty()
363    }
364
365    /// Optimize all containers
366    pub fn optimize(&mut self) {
367        for (_, container) in &mut self.containers {
368            container.optimize();
369        }
370    }
371
372    /// Intersection with another bitmap
373    pub fn and(&self, other: &RoaringBitmap) -> RoaringBitmap {
374        let mut result = RoaringBitmap::new();
375
376        let mut i = 0;
377        let mut j = 0;
378
379        while i < self.containers.len() && j < other.containers.len() {
380            let (high1, c1) = &self.containers[i];
381            let (high2, c2) = &other.containers[j];
382
383            match high1.cmp(high2) {
384                std::cmp::Ordering::Less => i += 1,
385                std::cmp::Ordering::Greater => j += 1,
386                std::cmp::Ordering::Equal => {
387                    let intersected = Self::intersect_containers(c1, c2);
388                    if intersected.cardinality() > 0 {
389                        result.containers.push((*high1, intersected));
390                    }
391                    i += 1;
392                    j += 1;
393                }
394            }
395        }
396
397        result
398    }
399
400    fn intersect_containers(c1: &Container, c2: &Container) -> Container {
401        match (c1, c2) {
402            (Container::Array(a1), Container::Array(a2)) => {
403                let mut result = Vec::new();
404                let mut i = 0;
405                let mut j = 0;
406                while i < a1.len() && j < a2.len() {
407                    match a1[i].cmp(&a2[j]) {
408                        std::cmp::Ordering::Less => i += 1,
409                        std::cmp::Ordering::Greater => j += 1,
410                        std::cmp::Ordering::Equal => {
411                            result.push(a1[i]);
412                            i += 1;
413                            j += 1;
414                        }
415                    }
416                }
417                Container::Array(result)
418            }
419            (Container::Bitmap(b1), Container::Bitmap(b2)) => {
420                let mut result = Box::new([0u64; 1024]);
421                for i in 0..1024 {
422                    result[i] = b1[i] & b2[i];
423                }
424                let mut c = Container::Bitmap(result);
425                c.optimize();
426                c
427            }
428            (Container::Array(arr), Container::Bitmap(bm))
429            | (Container::Bitmap(bm), Container::Array(arr)) => {
430                let mut result = Vec::new();
431                for &val in arr {
432                    let word_idx = (val / 64) as usize;
433                    let bit_idx = val % 64;
434                    if (bm[word_idx] >> bit_idx) & 1 == 1 {
435                        result.push(val);
436                    }
437                }
438                Container::Array(result)
439            }
440            _ => {
441                // For runs, convert to array first
442                let arr1 = Self::container_to_array(c1);
443                let arr2 = Self::container_to_array(c2);
444                Self::intersect_containers(&Container::Array(arr1), &Container::Array(arr2))
445            }
446        }
447    }
448
449    fn container_to_array(c: &Container) -> Vec<u16> {
450        match c {
451            Container::Array(arr) => arr.clone(),
452            Container::Bitmap(bm) => {
453                let mut arr = Vec::new();
454                for (word_idx, &word) in bm.iter().enumerate() {
455                    let mut w = word;
456                    while w != 0 {
457                        let bit_idx = w.trailing_zeros();
458                        arr.push((word_idx * 64 + bit_idx as usize) as u16);
459                        w &= w - 1;
460                    }
461                }
462                arr
463            }
464            Container::Runs(runs) => {
465                let mut arr = Vec::new();
466                for &(start, len) in runs {
467                    for i in 0..=len {
468                        arr.push(start + i);
469                    }
470                }
471                arr
472            }
473        }
474    }
475
476    /// Union with another bitmap
477    pub fn or(&self, other: &RoaringBitmap) -> RoaringBitmap {
478        let mut result = RoaringBitmap::new();
479
480        let mut i = 0;
481        let mut j = 0;
482
483        while i < self.containers.len() || j < other.containers.len() {
484            if i >= self.containers.len() {
485                result.containers.push(other.containers[j].clone());
486                j += 1;
487            } else if j >= other.containers.len() {
488                result.containers.push(self.containers[i].clone());
489                i += 1;
490            } else {
491                let (high1, c1) = &self.containers[i];
492                let (high2, c2) = &other.containers[j];
493
494                match high1.cmp(high2) {
495                    std::cmp::Ordering::Less => {
496                        result.containers.push(self.containers[i].clone());
497                        i += 1;
498                    }
499                    std::cmp::Ordering::Greater => {
500                        result.containers.push(other.containers[j].clone());
501                        j += 1;
502                    }
503                    std::cmp::Ordering::Equal => {
504                        let united = Self::union_containers(c1, c2);
505                        result.containers.push((*high1, united));
506                        i += 1;
507                        j += 1;
508                    }
509                }
510            }
511        }
512
513        result
514    }
515
516    fn union_containers(c1: &Container, c2: &Container) -> Container {
517        match (c1, c2) {
518            (Container::Bitmap(b1), Container::Bitmap(b2)) => {
519                let mut result = Box::new([0u64; 1024]);
520                for i in 0..1024 {
521                    result[i] = b1[i] | b2[i];
522                }
523                Container::Bitmap(result)
524            }
525            _ => {
526                // Convert both to arrays and merge
527                let arr1 = Self::container_to_array(c1);
528                let arr2 = Self::container_to_array(c2);
529
530                let mut result = Vec::with_capacity(arr1.len() + arr2.len());
531                let mut i = 0;
532                let mut j = 0;
533
534                while i < arr1.len() && j < arr2.len() {
535                    match arr1[i].cmp(&arr2[j]) {
536                        std::cmp::Ordering::Less => {
537                            result.push(arr1[i]);
538                            i += 1;
539                        }
540                        std::cmp::Ordering::Greater => {
541                            result.push(arr2[j]);
542                            j += 1;
543                        }
544                        std::cmp::Ordering::Equal => {
545                            result.push(arr1[i]);
546                            i += 1;
547                            j += 1;
548                        }
549                    }
550                }
551
552                result.extend_from_slice(&arr1[i..]);
553                result.extend_from_slice(&arr2[j..]);
554
555                let mut c = Container::Array(result);
556                c.optimize();
557                c
558            }
559        }
560    }
561
562    /// Serialize
563    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
564        writer.write_u32::<LittleEndian>(self.containers.len() as u32)?;
565        for (high, container) in &self.containers {
566            writer.write_u16::<LittleEndian>(*high)?;
567            container.serialize(writer)?;
568        }
569        Ok(())
570    }
571
572    /// Deserialize
573    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
574        let num_containers = reader.read_u32::<LittleEndian>()? as usize;
575        let mut containers = Vec::with_capacity(num_containers);
576
577        for _ in 0..num_containers {
578            let high = reader.read_u16::<LittleEndian>()?;
579            let container = Container::deserialize(reader)?;
580            containers.push((high, container));
581        }
582
583        Ok(Self { containers })
584    }
585
586    /// Approximate size in bytes
587    pub fn size_bytes(&self) -> usize {
588        4 + self
589            .containers
590            .iter()
591            .map(|(_, c)| 2 + c.size_bytes())
592            .sum::<usize>()
593    }
594
595    /// Create an iterator
596    pub fn iter(&self) -> RoaringIterator<'_> {
597        // Pre-allocate buffer for typical container size (4096 is array threshold)
598        RoaringIterator {
599            bitmap: self,
600            container_idx: 0,
601            value_idx: 0,
602            current_len: 0,
603            current_values: Vec::with_capacity(ARRAY_TO_BITMAP_THRESHOLD),
604        }
605    }
606}
607
608impl Default for RoaringBitmap {
609    fn default() -> Self {
610        Self::new()
611    }
612}
613
614/// Iterator over Roaring Bitmap
615pub struct RoaringIterator<'a> {
616    bitmap: &'a RoaringBitmap,
617    container_idx: usize,
618    value_idx: usize,
619    /// Number of valid values in current_values
620    current_len: usize,
621    /// Pre-allocated buffer for container values (max 65536 values per container)
622    current_values: Vec<u16>,
623}
624
625impl<'a> Iterator for RoaringIterator<'a> {
626    type Item = u32;
627
628    fn next(&mut self) -> Option<Self::Item> {
629        loop {
630            if self.value_idx < self.current_len {
631                let high = self.bitmap.containers[self.container_idx - 1].0 as u32;
632                let low = self.current_values[self.value_idx] as u32;
633                self.value_idx += 1;
634                return Some((high << 16) | low);
635            }
636
637            if self.container_idx >= self.bitmap.containers.len() {
638                return None;
639            }
640
641            let (_, container) = &self.bitmap.containers[self.container_idx];
642            // Decode into pre-allocated buffer (no allocation!)
643            self.current_len = Self::decode_container_into(container, &mut self.current_values);
644            self.value_idx = 0;
645            self.container_idx += 1;
646        }
647    }
648}
649
650impl<'a> RoaringIterator<'a> {
651    /// Decode container values into a pre-allocated buffer
652    /// Returns the number of values decoded
653    #[inline]
654    fn decode_container_into(container: &Container, output: &mut Vec<u16>) -> usize {
655        match container {
656            Container::Array(arr) => {
657                let len = arr.len();
658                if output.len() < len {
659                    output.resize(len, 0);
660                }
661                output[..len].copy_from_slice(arr);
662                len
663            }
664            Container::Bitmap(bm) => Self::decode_bitmap_into(bm, output),
665            Container::Runs(runs) => {
666                let mut count = 0;
667                for &(start, len) in runs {
668                    for i in 0..=len {
669                        let val = start + i;
670                        if count >= output.len() {
671                            output.push(val);
672                        } else {
673                            output[count] = val;
674                        }
675                        count += 1;
676                    }
677                }
678                count
679            }
680        }
681    }
682
683    /// Decode bitmap container into output buffer
684    /// Uses optimized bit extraction
685    #[inline]
686    fn decode_bitmap_into(bm: &[u64; 1024], output: &mut Vec<u16>) -> usize {
687        let mut count = 0;
688
689        for (word_idx, &word) in bm.iter().enumerate() {
690            if word == 0 {
691                continue;
692            }
693
694            let base = (word_idx * 64) as u16;
695            let mut w = word;
696
697            // Unroll the bit extraction loop for better performance
698            while w != 0 {
699                let bit_idx = w.trailing_zeros() as u16;
700                let val = base + bit_idx;
701
702                if count >= output.len() {
703                    output.push(val);
704                } else {
705                    output[count] = val;
706                }
707                count += 1;
708                w &= w - 1; // Clear lowest set bit
709            }
710        }
711
712        count
713    }
714}
715
716/// Block size for Roaring BlockMax (matches container size = 65536 doc_ids)
717pub const ROARING_BLOCK_SIZE: usize = 65536;
718
719/// Block metadata for BlockMax WAND optimization in Roaring
720#[derive(Debug, Clone)]
721pub struct RoaringBlockInfo {
722    /// High 16 bits (container key)
723    pub container_key: u16,
724    /// First doc_id in this container
725    pub first_doc_id: u32,
726    /// Last doc_id in this container
727    pub last_doc_id: u32,
728    /// Maximum term frequency in this block
729    pub max_tf: u32,
730    /// Upper bound BM25 score for this block
731    pub max_block_score: f32,
732    /// Number of documents in this container
733    pub num_docs: u32,
734}
735
736impl RoaringBlockInfo {
737    /// Serialize block info
738    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
739        writer.write_u16::<LittleEndian>(self.container_key)?;
740        writer.write_u32::<LittleEndian>(self.first_doc_id)?;
741        writer.write_u32::<LittleEndian>(self.last_doc_id)?;
742        writer.write_u32::<LittleEndian>(self.max_tf)?;
743        writer.write_f32::<LittleEndian>(self.max_block_score)?;
744        writer.write_u32::<LittleEndian>(self.num_docs)?;
745        Ok(())
746    }
747
748    /// Deserialize block info
749    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
750        Ok(Self {
751            container_key: reader.read_u16::<LittleEndian>()?,
752            first_doc_id: reader.read_u32::<LittleEndian>()?,
753            last_doc_id: reader.read_u32::<LittleEndian>()?,
754            max_tf: reader.read_u32::<LittleEndian>()?,
755            max_block_score: reader.read_f32::<LittleEndian>()?,
756            num_docs: reader.read_u32::<LittleEndian>()?,
757        })
758    }
759}
760
761/// Roaring bitmap with term frequencies for posting lists
762#[derive(Debug, Clone)]
763pub struct RoaringPostingList {
764    /// Document IDs as roaring bitmap
765    pub doc_ids: RoaringBitmap,
766    /// Term frequencies (sparse map for non-1 frequencies)
767    /// Most terms have tf=1, so we only store exceptions
768    pub term_freqs: Vec<(u32, u32)>,
769    /// Default term frequency (usually 1)
770    pub default_tf: u32,
771    /// Maximum term frequency
772    pub max_tf: u32,
773    /// Block metadata for BlockMax WAND (one per container)
774    pub blocks: Vec<RoaringBlockInfo>,
775    /// Global maximum score across all blocks
776    pub max_score: f32,
777}
778
779impl RoaringPostingList {
780    /// BM25 parameters for block-max score calculation
781    const K1: f32 = 1.2;
782    const B: f32 = 0.75;
783
784    /// Compute BM25 upper bound score for a given max_tf and IDF
785    #[inline]
786    pub fn compute_bm25_upper_bound(max_tf: u32, idf: f32) -> f32 {
787        let tf = max_tf as f32;
788        // Conservative upper bound: assume dl=0, so length_norm = 1 - b = 0.25
789        let min_length_norm = 1.0 - Self::B;
790        let tf_norm = (tf * (Self::K1 + 1.0)) / (tf + Self::K1 * min_length_norm);
791        idf * tf_norm
792    }
793
794    /// Create from doc_ids and term frequencies (without IDF - for backwards compatibility)
795    pub fn from_postings(doc_ids: &[u32], term_freqs: &[u32]) -> Self {
796        Self::from_postings_with_idf(doc_ids, term_freqs, 1.0)
797    }
798
799    /// Create from doc_ids and term frequencies with IDF for block-max scores
800    pub fn from_postings_with_idf(doc_ids: &[u32], term_freqs: &[u32], idf: f32) -> Self {
801        assert_eq!(doc_ids.len(), term_freqs.len());
802
803        if doc_ids.is_empty() {
804            return Self {
805                doc_ids: RoaringBitmap::new(),
806                term_freqs: Vec::new(),
807                default_tf: 1,
808                max_tf: 0,
809                blocks: Vec::new(),
810                max_score: 0.0,
811            };
812        }
813
814        let bitmap = RoaringBitmap::from_sorted_slice(doc_ids);
815
816        // Find most common TF (usually 1)
817        let mut tf_counts = std::collections::HashMap::new();
818        for &tf in term_freqs {
819            *tf_counts.entry(tf).or_insert(0u32) += 1;
820        }
821        let default_tf = tf_counts
822            .iter()
823            .max_by_key(|&(_, count)| count)
824            .map(|(&tf, _)| tf)
825            .unwrap_or(1);
826
827        // Store only non-default TFs
828        let exceptions: Vec<(u32, u32)> = doc_ids
829            .iter()
830            .zip(term_freqs.iter())
831            .filter(|&(_, &tf)| tf != default_tf)
832            .map(|(&doc, &tf)| (doc, tf))
833            .collect();
834
835        let max_tf = *term_freqs.iter().max().unwrap_or(&1);
836
837        // Build block metadata (one block per container)
838        // Group doc_ids by container (high 16 bits)
839        let mut blocks = Vec::new();
840        let mut max_score = 0.0f32;
841        let mut i = 0;
842
843        while i < doc_ids.len() {
844            let container_key = (doc_ids[i] >> 16) as u16;
845            let block_start = i;
846
847            // Find all docs in this container
848            while i < doc_ids.len() && (doc_ids[i] >> 16) as u16 == container_key {
849                i += 1;
850            }
851
852            let block_doc_ids = &doc_ids[block_start..i];
853            let block_tfs = &term_freqs[block_start..i];
854            let block_max_tf = *block_tfs.iter().max().unwrap_or(&1);
855            let block_score = Self::compute_bm25_upper_bound(block_max_tf, idf);
856            max_score = max_score.max(block_score);
857
858            blocks.push(RoaringBlockInfo {
859                container_key,
860                first_doc_id: block_doc_ids[0],
861                last_doc_id: *block_doc_ids.last().unwrap(),
862                max_tf: block_max_tf,
863                max_block_score: block_score,
864                num_docs: block_doc_ids.len() as u32,
865            });
866        }
867
868        Self {
869            doc_ids: bitmap,
870            term_freqs: exceptions,
871            default_tf,
872            max_tf,
873            blocks,
874            max_score,
875        }
876    }
877
878    /// Get term frequency for a document
879    pub fn get_tf(&self, doc_id: u32) -> u32 {
880        // Binary search in exceptions
881        match self.term_freqs.binary_search_by_key(&doc_id, |&(d, _)| d) {
882            Ok(idx) => self.term_freqs[idx].1,
883            Err(_) => self.default_tf,
884        }
885    }
886
887    /// Number of documents
888    pub fn len(&self) -> u32 {
889        self.doc_ids.cardinality()
890    }
891
892    /// Check if empty
893    pub fn is_empty(&self) -> bool {
894        self.doc_ids.is_empty()
895    }
896
897    /// Get number of blocks (containers)
898    pub fn num_blocks(&self) -> usize {
899        self.blocks.len()
900    }
901
902    /// Get block index for a doc_id
903    pub fn block_for_doc(&self, doc_id: u32) -> Option<usize> {
904        let container_key = (doc_id >> 16) as u16;
905        self.blocks
906            .binary_search_by_key(&container_key, |b| b.container_key)
907            .ok()
908    }
909
910    /// Serialize
911    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
912        self.doc_ids.serialize(writer)?;
913        writer.write_u32::<LittleEndian>(self.default_tf)?;
914        writer.write_u32::<LittleEndian>(self.max_tf)?;
915        writer.write_f32::<LittleEndian>(self.max_score)?;
916        writer.write_u32::<LittleEndian>(self.term_freqs.len() as u32)?;
917        for &(doc, tf) in &self.term_freqs {
918            writer.write_u32::<LittleEndian>(doc)?;
919            writer.write_u32::<LittleEndian>(tf)?;
920        }
921
922        // Write block metadata
923        writer.write_u32::<LittleEndian>(self.blocks.len() as u32)?;
924        for block in &self.blocks {
925            block.serialize(writer)?;
926        }
927
928        Ok(())
929    }
930
931    /// Deserialize
932    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
933        let doc_ids = RoaringBitmap::deserialize(reader)?;
934        let default_tf = reader.read_u32::<LittleEndian>()?;
935        let max_tf = reader.read_u32::<LittleEndian>()?;
936        let max_score = reader.read_f32::<LittleEndian>()?;
937        let num_exceptions = reader.read_u32::<LittleEndian>()? as usize;
938        let mut term_freqs = Vec::with_capacity(num_exceptions);
939        for _ in 0..num_exceptions {
940            let doc = reader.read_u32::<LittleEndian>()?;
941            let tf = reader.read_u32::<LittleEndian>()?;
942            term_freqs.push((doc, tf));
943        }
944
945        // Read block metadata
946        let num_blocks = reader.read_u32::<LittleEndian>()? as usize;
947        let mut blocks = Vec::with_capacity(num_blocks);
948        for _ in 0..num_blocks {
949            blocks.push(RoaringBlockInfo::deserialize(reader)?);
950        }
951
952        Ok(Self {
953            doc_ids,
954            term_freqs,
955            default_tf,
956            max_tf,
957            blocks,
958            max_score,
959        })
960    }
961
962    /// Create iterator
963    pub fn iterator(&self) -> RoaringPostingIterator<'_> {
964        RoaringPostingIterator {
965            list: self,
966            doc_iter: self.doc_ids.iter(),
967            current_doc: None,
968            current_block: 0,
969        }
970    }
971}
972
973/// Iterator over Roaring posting list with BlockMax support
974pub struct RoaringPostingIterator<'a> {
975    list: &'a RoaringPostingList,
976    doc_iter: RoaringIterator<'a>,
977    current_doc: Option<u32>,
978    current_block: usize,
979}
980
981impl<'a> RoaringPostingIterator<'a> {
982    /// Current document ID
983    pub fn doc(&self) -> u32 {
984        self.current_doc.unwrap_or(u32::MAX)
985    }
986
987    /// Current term frequency
988    pub fn term_freq(&self) -> u32 {
989        match self.current_doc {
990            Some(doc) => self.list.get_tf(doc),
991            None => 0,
992        }
993    }
994
995    /// Advance to next document
996    pub fn advance(&mut self) -> u32 {
997        self.current_doc = self.doc_iter.next();
998        // Update current block if needed
999        if let Some(doc) = self.current_doc
1000            && !self.list.blocks.is_empty()
1001        {
1002            let container_key = (doc >> 16) as u16;
1003            // Move to next block if we've passed current one
1004            while self.current_block < self.list.blocks.len()
1005                && self.list.blocks[self.current_block].container_key < container_key
1006            {
1007                self.current_block += 1;
1008            }
1009        }
1010        self.doc()
1011    }
1012
1013    /// Initialize (must be called before first use)
1014    pub fn init(&mut self) {
1015        self.current_doc = self.doc_iter.next();
1016        self.current_block = 0;
1017    }
1018
1019    /// Seek to first doc >= target
1020    pub fn seek(&mut self, target: u32) -> u32 {
1021        // Use block skip list for faster seeking
1022        if !self.list.blocks.is_empty() {
1023            let target_container = (target >> 16) as u16;
1024
1025            // Skip blocks that are entirely before target
1026            while self.current_block < self.list.blocks.len()
1027                && self.list.blocks[self.current_block].last_doc_id < target
1028            {
1029                self.current_block += 1;
1030            }
1031
1032            // If we've exhausted all blocks
1033            if self.current_block >= self.list.blocks.len() {
1034                self.current_doc = None;
1035                return u32::MAX;
1036            }
1037
1038            // Skip docs until we reach the target block's first doc
1039            let block = &self.list.blocks[self.current_block];
1040            if block.container_key > target_container
1041                || (block.container_key == target_container && block.first_doc_id > self.doc())
1042            {
1043                // Need to advance iterator to this block
1044                while let Some(doc) = self.current_doc {
1045                    if doc >= block.first_doc_id {
1046                        break;
1047                    }
1048                    self.current_doc = self.doc_iter.next();
1049                }
1050            }
1051        }
1052
1053        // Linear scan within block
1054        while let Some(doc) = self.current_doc {
1055            if doc >= target {
1056                return doc;
1057            }
1058            self.current_doc = self.doc_iter.next();
1059        }
1060        u32::MAX
1061    }
1062
1063    /// Check if iterator is exhausted
1064    pub fn is_exhausted(&self) -> bool {
1065        self.current_doc.is_none()
1066    }
1067
1068    /// Get current block's max score (for BlockMax WAND)
1069    pub fn current_block_max_score(&self) -> f32 {
1070        if self.current_doc.is_none() || self.list.blocks.is_empty() {
1071            return 0.0;
1072        }
1073        if self.current_block < self.list.blocks.len() {
1074            self.list.blocks[self.current_block].max_block_score
1075        } else {
1076            0.0
1077        }
1078    }
1079
1080    /// Get current block's max term frequency
1081    pub fn current_block_max_tf(&self) -> u32 {
1082        if self.current_doc.is_none() || self.list.blocks.is_empty() {
1083            return 0;
1084        }
1085        if self.current_block < self.list.blocks.len() {
1086            self.list.blocks[self.current_block].max_tf
1087        } else {
1088            0
1089        }
1090    }
1091
1092    /// Get max score for remaining blocks (for MaxScore optimization)
1093    pub fn max_remaining_score(&self) -> f32 {
1094        if self.current_doc.is_none() || self.list.blocks.is_empty() {
1095            return 0.0;
1096        }
1097        self.list.blocks[self.current_block..]
1098            .iter()
1099            .map(|b| b.max_block_score)
1100            .fold(0.0f32, |a, b| a.max(b))
1101    }
1102
1103    /// Skip to next block containing doc >= target (for BlockWAND)
1104    /// Returns (first_doc_in_block, block_max_score) or None if exhausted
1105    pub fn skip_to_block_with_doc(&mut self, target: u32) -> Option<(u32, f32)> {
1106        if self.list.blocks.is_empty() {
1107            return None;
1108        }
1109
1110        while self.current_block < self.list.blocks.len() {
1111            let block = &self.list.blocks[self.current_block];
1112            if block.last_doc_id >= target {
1113                // Advance iterator to this block's first doc
1114                while let Some(doc) = self.current_doc {
1115                    if doc >= block.first_doc_id {
1116                        break;
1117                    }
1118                    self.current_doc = self.doc_iter.next();
1119                }
1120                return Some((block.first_doc_id, block.max_block_score));
1121            }
1122            self.current_block += 1;
1123        }
1124
1125        self.current_doc = None;
1126        None
1127    }
1128}
1129
1130#[cfg(test)]
1131mod tests {
1132    use super::*;
1133
1134    #[test]
1135    fn test_roaring_basic() {
1136        let mut bitmap = RoaringBitmap::new();
1137        bitmap.insert(1);
1138        bitmap.insert(100);
1139        bitmap.insert(1000);
1140        bitmap.insert(100000);
1141
1142        assert!(bitmap.contains(1));
1143        assert!(bitmap.contains(100));
1144        assert!(bitmap.contains(1000));
1145        assert!(bitmap.contains(100000));
1146        assert!(!bitmap.contains(2));
1147        assert!(!bitmap.contains(50000));
1148
1149        assert_eq!(bitmap.cardinality(), 4);
1150    }
1151
1152    #[test]
1153    fn test_roaring_from_sorted() {
1154        let values: Vec<u32> = (0..10000).map(|i| i * 3).collect();
1155        let bitmap = RoaringBitmap::from_sorted_slice(&values);
1156
1157        assert_eq!(bitmap.cardinality(), 10000);
1158
1159        for &val in &values {
1160            assert!(bitmap.contains(val), "Missing value {}", val);
1161        }
1162    }
1163
1164    #[test]
1165    fn test_roaring_intersection() {
1166        let a = RoaringBitmap::from_sorted_slice(&[1, 2, 3, 100, 200, 300]);
1167        let b = RoaringBitmap::from_sorted_slice(&[2, 3, 4, 200, 300, 400]);
1168
1169        let result = a.and(&b);
1170
1171        assert_eq!(result.cardinality(), 4);
1172        assert!(result.contains(2));
1173        assert!(result.contains(3));
1174        assert!(result.contains(200));
1175        assert!(result.contains(300));
1176    }
1177
1178    #[test]
1179    fn test_roaring_union() {
1180        let a = RoaringBitmap::from_sorted_slice(&[1, 2, 3]);
1181        let b = RoaringBitmap::from_sorted_slice(&[3, 4, 5]);
1182
1183        let result = a.or(&b);
1184
1185        assert_eq!(result.cardinality(), 5);
1186        for i in 1..=5 {
1187            assert!(result.contains(i));
1188        }
1189    }
1190
1191    #[test]
1192    fn test_roaring_serialization() {
1193        let values: Vec<u32> = (0..1000).map(|i| i * 7).collect();
1194        let bitmap = RoaringBitmap::from_sorted_slice(&values);
1195
1196        let mut buffer = Vec::new();
1197        bitmap.serialize(&mut buffer).unwrap();
1198
1199        let restored = RoaringBitmap::deserialize(&mut &buffer[..]).unwrap();
1200
1201        assert_eq!(restored.cardinality(), bitmap.cardinality());
1202        for &val in &values {
1203            assert!(restored.contains(val));
1204        }
1205    }
1206
1207    #[test]
1208    fn test_roaring_posting_list() {
1209        let doc_ids: Vec<u32> = vec![1, 5, 10, 50, 100, 500, 1000];
1210        let term_freqs: Vec<u32> = vec![1, 1, 2, 1, 3, 1, 1];
1211
1212        let list = RoaringPostingList::from_postings(&doc_ids, &term_freqs);
1213
1214        assert_eq!(list.len(), 7);
1215        assert_eq!(list.default_tf, 1);
1216        assert_eq!(list.max_tf, 3);
1217
1218        // Check TF lookups
1219        assert_eq!(list.get_tf(1), 1);
1220        assert_eq!(list.get_tf(10), 2);
1221        assert_eq!(list.get_tf(100), 3);
1222    }
1223
1224    #[test]
1225    fn test_roaring_iterator() {
1226        let values: Vec<u32> = vec![1, 10, 100, 1000, 10000];
1227        let bitmap = RoaringBitmap::from_sorted_slice(&values);
1228
1229        let collected: Vec<u32> = bitmap.iter().collect();
1230        assert_eq!(collected, values);
1231    }
1232
1233    #[test]
1234    fn test_roaring_block_max() {
1235        // Create posting list spanning multiple containers (high 16 bits)
1236        // Container 0: doc_ids 0-65535
1237        // Container 1: doc_ids 65536-131071
1238        // Container 2: doc_ids 131072-196607
1239        let mut doc_ids = Vec::new();
1240        let mut term_freqs = Vec::new();
1241
1242        // Container 0: 100 docs with max_tf = 2
1243        for i in 0..100 {
1244            doc_ids.push(i * 100);
1245            term_freqs.push(if i == 50 { 2 } else { 1 });
1246        }
1247
1248        // Container 1: 100 docs with max_tf = 5
1249        for i in 0..100 {
1250            doc_ids.push(65536 + i * 100);
1251            term_freqs.push(if i == 25 { 5 } else { 1 });
1252        }
1253
1254        // Container 2: 100 docs with max_tf = 3
1255        for i in 0..100 {
1256            doc_ids.push(131072 + i * 100);
1257            term_freqs.push(if i == 75 { 3 } else { 1 });
1258        }
1259
1260        let list = RoaringPostingList::from_postings_with_idf(&doc_ids, &term_freqs, 2.0);
1261
1262        // Should have 3 blocks (one per container)
1263        assert_eq!(list.num_blocks(), 3);
1264        assert_eq!(list.blocks[0].container_key, 0);
1265        assert_eq!(list.blocks[1].container_key, 1);
1266        assert_eq!(list.blocks[2].container_key, 2);
1267
1268        assert_eq!(list.blocks[0].max_tf, 2);
1269        assert_eq!(list.blocks[1].max_tf, 5);
1270        assert_eq!(list.blocks[2].max_tf, 3);
1271
1272        // Block 1 should have highest score (max_tf = 5)
1273        assert!(list.blocks[1].max_block_score > list.blocks[0].max_block_score);
1274        assert!(list.blocks[1].max_block_score > list.blocks[2].max_block_score);
1275
1276        // Global max_score should equal block 1's score
1277        assert_eq!(list.max_score, list.blocks[1].max_block_score);
1278
1279        // Test iterator block-max methods
1280        let mut iter = list.iterator();
1281        iter.init();
1282        assert_eq!(iter.current_block_max_tf(), 2); // Block 0
1283
1284        // Seek to block 1
1285        iter.seek(65536);
1286        assert_eq!(iter.current_block_max_tf(), 5);
1287
1288        // Seek to block 2
1289        iter.seek(131072);
1290        assert_eq!(iter.current_block_max_tf(), 3);
1291    }
1292
1293    #[test]
1294    fn test_roaring_block_max_serialization() {
1295        let mut doc_ids = Vec::new();
1296        let mut term_freqs = Vec::new();
1297
1298        // Two containers
1299        for i in 0..50 {
1300            doc_ids.push(i * 10);
1301            term_freqs.push((i % 5) + 1);
1302        }
1303        for i in 0..50 {
1304            doc_ids.push(65536 + i * 10);
1305            term_freqs.push((i % 3) + 1);
1306        }
1307
1308        let list = RoaringPostingList::from_postings_with_idf(&doc_ids, &term_freqs, 1.5);
1309
1310        let mut buffer = Vec::new();
1311        list.serialize(&mut buffer).unwrap();
1312
1313        let restored = RoaringPostingList::deserialize(&mut &buffer[..]).unwrap();
1314
1315        assert_eq!(restored.len(), list.len());
1316        assert_eq!(restored.max_tf, list.max_tf);
1317        assert_eq!(restored.max_score, list.max_score);
1318        assert_eq!(restored.num_blocks(), list.num_blocks());
1319
1320        // Verify block metadata
1321        for (orig, rest) in list.blocks.iter().zip(restored.blocks.iter()) {
1322            assert_eq!(orig.container_key, rest.container_key);
1323            assert_eq!(orig.first_doc_id, rest.first_doc_id);
1324            assert_eq!(orig.last_doc_id, rest.last_doc_id);
1325            assert_eq!(orig.max_tf, rest.max_tf);
1326            assert_eq!(orig.max_block_score, rest.max_block_score);
1327        }
1328
1329        // Verify iteration produces same results
1330        let mut iter1 = list.iterator();
1331        let mut iter2 = restored.iterator();
1332        iter1.init();
1333        iter2.init();
1334
1335        while !iter1.is_exhausted() {
1336            assert_eq!(iter1.doc(), iter2.doc());
1337            assert_eq!(iter1.term_freq(), iter2.term_freq());
1338            iter1.advance();
1339            iter2.advance();
1340        }
1341        assert!(iter2.is_exhausted());
1342    }
1343}