Skip to main content

hermes_core/structures/postings/
roaring.rs

1//! Roaring Bitmap implementation for compressed integer sets
2//!
3//! Roaring bitmaps use a hybrid approach:
4//! - Sparse containers: sorted arrays for low-density chunks
5//! - Dense containers: bitmaps for high-density chunks
6//! - Run containers: RLE for consecutive ranges
7//!
8//! This provides excellent compression and fast set operations.
9//! Used by Apache Lucene, Spark, Druid, and many databases.
10
11use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
12use std::io::{self, Read, Write};
13
14/// Threshold for switching from array to bitmap container
15/// If a container has more than this many elements, use bitmap
16const ARRAY_TO_BITMAP_THRESHOLD: usize = 4096;
17
18/// Container types for different density patterns
19#[derive(Debug, Clone)]
20enum Container {
21    /// Sorted array of 16-bit values (for sparse containers)
22    Array(Vec<u16>),
23    /// Bitmap of 2^16 bits (for dense containers)
24    Bitmap(Box<[u64; 1024]>),
25    /// Run-length encoded (start, length) pairs
26    Runs(Vec<(u16, u16)>),
27}
28
29impl Container {
30    fn new_array() -> Self {
31        Container::Array(Vec::new())
32    }
33
34    #[allow(dead_code)]
35    fn new_bitmap() -> Self {
36        Container::Bitmap(Box::new([0u64; 1024]))
37    }
38
39    fn cardinality(&self) -> u32 {
40        match self {
41            Container::Array(arr) => arr.len() as u32,
42            Container::Bitmap(bm) => Self::bitmap_cardinality(bm),
43            Container::Runs(runs) => runs.iter().map(|(_, len)| *len as u32 + 1).sum(),
44        }
45    }
46
47    /// SIMD-accelerated bitmap cardinality (NEON on aarch64, POPCNT on x86_64)
48    #[inline]
49    fn bitmap_cardinality(bm: &[u64; 1024]) -> u32 {
50        #[cfg(target_arch = "aarch64")]
51        {
52            use std::arch::aarch64::*;
53
54            let mut total = 0u32;
55            let bytes = bm.as_ptr() as *const u8;
56
57            // Process 16 bytes (2 u64 words) at a time using NEON vcntq_u8
58            unsafe {
59                for i in (0..8192).step_by(16) {
60                    let v = vld1q_u8(bytes.add(i));
61                    let cnt = vcntq_u8(v);
62                    total += vaddlvq_u8(cnt) as u32;
63                }
64            }
65
66            total
67        }
68
69        #[cfg(target_arch = "x86_64")]
70        {
71            // Use POPCNT instruction on x86_64 (available on all modern CPUs)
72            let mut total = 0u32;
73            for &word in bm.iter() {
74                total += word.count_ones();
75            }
76            total
77        }
78
79        #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
80        {
81            bm.iter().map(|w| w.count_ones()).sum()
82        }
83    }
84
85    fn contains(&self, val: u16) -> bool {
86        match self {
87            Container::Array(arr) => arr.binary_search(&val).is_ok(),
88            Container::Bitmap(bm) => {
89                let word_idx = (val / 64) as usize;
90                let bit_idx = val % 64;
91                (bm[word_idx] >> bit_idx) & 1 == 1
92            }
93            Container::Runs(runs) => {
94                for &(start, len) in runs {
95                    if val >= start && val <= start + len {
96                        return true;
97                    }
98                    if val < start {
99                        return false;
100                    }
101                }
102                false
103            }
104        }
105    }
106
107    fn insert(&mut self, val: u16) -> bool {
108        match self {
109            Container::Array(arr) => {
110                match arr.binary_search(&val) {
111                    Ok(_) => false, // Already exists
112                    Err(pos) => {
113                        arr.insert(pos, val);
114                        true
115                    }
116                }
117            }
118            Container::Bitmap(bm) => {
119                let word_idx = (val / 64) as usize;
120                let bit_idx = val % 64;
121                let old = bm[word_idx];
122                bm[word_idx] |= 1u64 << bit_idx;
123                old != bm[word_idx]
124            }
125            Container::Runs(runs) => {
126                // For simplicity, convert to array, insert, then optimize
127                let mut arr: Vec<u16> = Vec::new();
128                for &(start, len) in runs.iter() {
129                    for i in 0..=len {
130                        arr.push(start + i);
131                    }
132                }
133                let inserted = match arr.binary_search(&val) {
134                    Ok(_) => false,
135                    Err(pos) => {
136                        arr.insert(pos, val);
137                        true
138                    }
139                };
140                *self = Container::Array(arr);
141                inserted
142            }
143        }
144    }
145
146    fn optimize(&mut self) {
147        let card = self.cardinality() as usize;
148
149        match self {
150            Container::Array(arr) if card > ARRAY_TO_BITMAP_THRESHOLD => {
151                // Convert to bitmap
152                let mut bm = Box::new([0u64; 1024]);
153                for &val in arr.iter() {
154                    let word_idx = (val / 64) as usize;
155                    let bit_idx = val % 64;
156                    bm[word_idx] |= 1u64 << bit_idx;
157                }
158                *self = Container::Bitmap(bm);
159            }
160            Container::Bitmap(bm) if card <= ARRAY_TO_BITMAP_THRESHOLD => {
161                // Convert to array
162                let mut arr = Vec::with_capacity(card);
163                for (word_idx, &word) in bm.iter().enumerate() {
164                    let mut w = word;
165                    while w != 0 {
166                        let bit_idx = w.trailing_zeros();
167                        arr.push((word_idx * 64 + bit_idx as usize) as u16);
168                        w &= w - 1;
169                    }
170                }
171                *self = Container::Array(arr);
172            }
173            _ => {}
174        }
175
176        // Try run-length encoding if beneficial
177        self.try_run_encode();
178    }
179
180    fn try_run_encode(&mut self) {
181        let arr = match self {
182            Container::Array(arr) if arr.len() >= 4 => arr,
183            _ => return,
184        };
185
186        let mut runs = Vec::new();
187        let mut i = 0;
188
189        while i < arr.len() {
190            let start = arr[i];
191            let mut end = start;
192
193            while i + 1 < arr.len() && arr[i + 1] == end + 1 {
194                end = arr[i + 1];
195                i += 1;
196            }
197
198            runs.push((start, end - start));
199            i += 1;
200        }
201
202        // Only use runs if it saves space
203        // Array: 2 bytes per element
204        // Runs: 4 bytes per run
205        if runs.len() * 4 < arr.len() * 2 {
206            *self = Container::Runs(runs);
207        }
208    }
209
210    fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
211        match self {
212            Container::Array(arr) => {
213                writer.write_u8(0)?; // Type tag
214                writer.write_u16::<LittleEndian>(arr.len() as u16)?;
215                for &val in arr {
216                    writer.write_u16::<LittleEndian>(val)?;
217                }
218            }
219            Container::Bitmap(bm) => {
220                writer.write_u8(1)?; // Type tag
221                for &word in bm.iter() {
222                    writer.write_u64::<LittleEndian>(word)?;
223                }
224            }
225            Container::Runs(runs) => {
226                writer.write_u8(2)?; // Type tag
227                writer.write_u16::<LittleEndian>(runs.len() as u16)?;
228                for &(start, len) in runs {
229                    writer.write_u16::<LittleEndian>(start)?;
230                    writer.write_u16::<LittleEndian>(len)?;
231                }
232            }
233        }
234        Ok(())
235    }
236
237    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
238        let tag = reader.read_u8()?;
239        match tag {
240            0 => {
241                let len = reader.read_u16::<LittleEndian>()? as usize;
242                let mut arr = Vec::with_capacity(len);
243                for _ in 0..len {
244                    arr.push(reader.read_u16::<LittleEndian>()?);
245                }
246                Ok(Container::Array(arr))
247            }
248            1 => {
249                let mut bm = Box::new([0u64; 1024]);
250                for word in bm.iter_mut() {
251                    *word = reader.read_u64::<LittleEndian>()?;
252                }
253                Ok(Container::Bitmap(bm))
254            }
255            2 => {
256                let len = reader.read_u16::<LittleEndian>()? as usize;
257                let mut runs = Vec::with_capacity(len);
258                for _ in 0..len {
259                    let start = reader.read_u16::<LittleEndian>()?;
260                    let run_len = reader.read_u16::<LittleEndian>()?;
261                    runs.push((start, run_len));
262                }
263                Ok(Container::Runs(runs))
264            }
265            _ => Err(io::Error::new(
266                io::ErrorKind::InvalidData,
267                "Invalid container type",
268            )),
269        }
270    }
271
272    fn size_bytes(&self) -> usize {
273        match self {
274            Container::Array(arr) => arr.len() * 2 + 4,
275            Container::Bitmap(_) => 8 * 1024 + 1,
276            Container::Runs(runs) => runs.len() * 4 + 4,
277        }
278    }
279}
280
281/// Roaring Bitmap for compressed integer sets
282#[derive(Debug, Clone)]
283pub struct RoaringBitmap {
284    /// High 16 bits -> container mapping
285    containers: Vec<(u16, Container)>,
286}
287
288impl RoaringBitmap {
289    /// Create empty bitmap
290    pub fn new() -> Self {
291        Self {
292            containers: Vec::new(),
293        }
294    }
295
296    /// Create from sorted slice
297    pub fn from_sorted_slice(values: &[u32]) -> Self {
298        let mut bitmap = Self::new();
299
300        if values.is_empty() {
301            return bitmap;
302        }
303
304        let mut current_high = (values[0] >> 16) as u16;
305        let mut current_container = Container::new_array();
306
307        for &val in values {
308            let high = (val >> 16) as u16;
309            let low = val as u16;
310
311            if high != current_high {
312                current_container.optimize();
313                bitmap.containers.push((current_high, current_container));
314                current_high = high;
315                current_container = Container::new_array();
316            }
317
318            current_container.insert(low);
319        }
320
321        current_container.optimize();
322        bitmap.containers.push((current_high, current_container));
323
324        bitmap
325    }
326
327    /// Insert a value
328    pub fn insert(&mut self, val: u32) -> bool {
329        let high = (val >> 16) as u16;
330        let low = val as u16;
331
332        // Find or create container
333        match self.containers.binary_search_by_key(&high, |&(h, _)| h) {
334            Ok(idx) => self.containers[idx].1.insert(low),
335            Err(idx) => {
336                let mut container = Container::new_array();
337                container.insert(low);
338                self.containers.insert(idx, (high, container));
339                true
340            }
341        }
342    }
343
344    /// Check if value exists
345    pub fn contains(&self, val: u32) -> bool {
346        let high = (val >> 16) as u16;
347        let low = val as u16;
348
349        match self.containers.binary_search_by_key(&high, |&(h, _)| h) {
350            Ok(idx) => self.containers[idx].1.contains(low),
351            Err(_) => false,
352        }
353    }
354
355    /// Number of elements
356    pub fn cardinality(&self) -> u32 {
357        self.containers.iter().map(|(_, c)| c.cardinality()).sum()
358    }
359
360    /// Check if empty
361    pub fn is_empty(&self) -> bool {
362        self.containers.is_empty()
363    }
364
365    /// Optimize all containers
366    pub fn optimize(&mut self) {
367        for (_, container) in &mut self.containers {
368            container.optimize();
369        }
370    }
371
372    /// Intersection with another bitmap
373    pub fn and(&self, other: &RoaringBitmap) -> RoaringBitmap {
374        let mut result = RoaringBitmap::new();
375
376        let mut i = 0;
377        let mut j = 0;
378
379        while i < self.containers.len() && j < other.containers.len() {
380            let (high1, c1) = &self.containers[i];
381            let (high2, c2) = &other.containers[j];
382
383            match high1.cmp(high2) {
384                std::cmp::Ordering::Less => i += 1,
385                std::cmp::Ordering::Greater => j += 1,
386                std::cmp::Ordering::Equal => {
387                    let intersected = Self::intersect_containers(c1, c2);
388                    if intersected.cardinality() > 0 {
389                        result.containers.push((*high1, intersected));
390                    }
391                    i += 1;
392                    j += 1;
393                }
394            }
395        }
396
397        result
398    }
399
400    fn intersect_containers(c1: &Container, c2: &Container) -> Container {
401        match (c1, c2) {
402            (Container::Array(a1), Container::Array(a2)) => {
403                let mut result = Vec::new();
404                let mut i = 0;
405                let mut j = 0;
406                while i < a1.len() && j < a2.len() {
407                    match a1[i].cmp(&a2[j]) {
408                        std::cmp::Ordering::Less => i += 1,
409                        std::cmp::Ordering::Greater => j += 1,
410                        std::cmp::Ordering::Equal => {
411                            result.push(a1[i]);
412                            i += 1;
413                            j += 1;
414                        }
415                    }
416                }
417                Container::Array(result)
418            }
419            (Container::Bitmap(b1), Container::Bitmap(b2)) => {
420                let mut result = Box::new([0u64; 1024]);
421                for i in 0..1024 {
422                    result[i] = b1[i] & b2[i];
423                }
424                let mut c = Container::Bitmap(result);
425                c.optimize();
426                c
427            }
428            (Container::Array(arr), Container::Bitmap(bm))
429            | (Container::Bitmap(bm), Container::Array(arr)) => {
430                let mut result = Vec::new();
431                for &val in arr {
432                    let word_idx = (val / 64) as usize;
433                    let bit_idx = val % 64;
434                    if (bm[word_idx] >> bit_idx) & 1 == 1 {
435                        result.push(val);
436                    }
437                }
438                Container::Array(result)
439            }
440            _ => {
441                // For runs, convert to array first
442                let arr1 = Self::container_to_array(c1);
443                let arr2 = Self::container_to_array(c2);
444                Self::intersect_containers(&Container::Array(arr1), &Container::Array(arr2))
445            }
446        }
447    }
448
449    fn container_to_array(c: &Container) -> Vec<u16> {
450        match c {
451            Container::Array(arr) => arr.clone(),
452            Container::Bitmap(bm) => {
453                let mut arr = Vec::new();
454                for (word_idx, &word) in bm.iter().enumerate() {
455                    let mut w = word;
456                    while w != 0 {
457                        let bit_idx = w.trailing_zeros();
458                        arr.push((word_idx * 64 + bit_idx as usize) as u16);
459                        w &= w - 1;
460                    }
461                }
462                arr
463            }
464            Container::Runs(runs) => {
465                let mut arr = Vec::new();
466                for &(start, len) in runs {
467                    for i in 0..=len {
468                        arr.push(start + i);
469                    }
470                }
471                arr
472            }
473        }
474    }
475
476    /// Union with another bitmap
477    pub fn or(&self, other: &RoaringBitmap) -> RoaringBitmap {
478        let mut result = RoaringBitmap::new();
479
480        let mut i = 0;
481        let mut j = 0;
482
483        while i < self.containers.len() || j < other.containers.len() {
484            if i >= self.containers.len() {
485                result.containers.push(other.containers[j].clone());
486                j += 1;
487            } else if j >= other.containers.len() {
488                result.containers.push(self.containers[i].clone());
489                i += 1;
490            } else {
491                let (high1, c1) = &self.containers[i];
492                let (high2, c2) = &other.containers[j];
493
494                match high1.cmp(high2) {
495                    std::cmp::Ordering::Less => {
496                        result.containers.push(self.containers[i].clone());
497                        i += 1;
498                    }
499                    std::cmp::Ordering::Greater => {
500                        result.containers.push(other.containers[j].clone());
501                        j += 1;
502                    }
503                    std::cmp::Ordering::Equal => {
504                        let united = Self::union_containers(c1, c2);
505                        result.containers.push((*high1, united));
506                        i += 1;
507                        j += 1;
508                    }
509                }
510            }
511        }
512
513        result
514    }
515
516    fn union_containers(c1: &Container, c2: &Container) -> Container {
517        match (c1, c2) {
518            (Container::Bitmap(b1), Container::Bitmap(b2)) => {
519                let mut result = Box::new([0u64; 1024]);
520                for i in 0..1024 {
521                    result[i] = b1[i] | b2[i];
522                }
523                Container::Bitmap(result)
524            }
525            _ => {
526                // Convert both to arrays and merge
527                let arr1 = Self::container_to_array(c1);
528                let arr2 = Self::container_to_array(c2);
529
530                let mut result = Vec::with_capacity(arr1.len() + arr2.len());
531                let mut i = 0;
532                let mut j = 0;
533
534                while i < arr1.len() && j < arr2.len() {
535                    match arr1[i].cmp(&arr2[j]) {
536                        std::cmp::Ordering::Less => {
537                            result.push(arr1[i]);
538                            i += 1;
539                        }
540                        std::cmp::Ordering::Greater => {
541                            result.push(arr2[j]);
542                            j += 1;
543                        }
544                        std::cmp::Ordering::Equal => {
545                            result.push(arr1[i]);
546                            i += 1;
547                            j += 1;
548                        }
549                    }
550                }
551
552                result.extend_from_slice(&arr1[i..]);
553                result.extend_from_slice(&arr2[j..]);
554
555                let mut c = Container::Array(result);
556                c.optimize();
557                c
558            }
559        }
560    }
561
562    /// Serialize
563    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
564        writer.write_u32::<LittleEndian>(self.containers.len() as u32)?;
565        for (high, container) in &self.containers {
566            writer.write_u16::<LittleEndian>(*high)?;
567            container.serialize(writer)?;
568        }
569        Ok(())
570    }
571
572    /// Deserialize
573    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
574        let num_containers = reader.read_u32::<LittleEndian>()? as usize;
575        let mut containers = Vec::with_capacity(num_containers);
576
577        for _ in 0..num_containers {
578            let high = reader.read_u16::<LittleEndian>()?;
579            let container = Container::deserialize(reader)?;
580            containers.push((high, container));
581        }
582
583        Ok(Self { containers })
584    }
585
586    /// Approximate size in bytes
587    pub fn size_bytes(&self) -> usize {
588        4 + self
589            .containers
590            .iter()
591            .map(|(_, c)| 2 + c.size_bytes())
592            .sum::<usize>()
593    }
594
595    /// Create an iterator
596    pub fn iter(&self) -> RoaringIterator<'_> {
597        // Pre-allocate buffer for typical container size (4096 is array threshold)
598        RoaringIterator {
599            bitmap: self,
600            container_idx: 0,
601            value_idx: 0,
602            current_len: 0,
603            current_values: Vec::with_capacity(ARRAY_TO_BITMAP_THRESHOLD),
604        }
605    }
606}
607
608impl Default for RoaringBitmap {
609    fn default() -> Self {
610        Self::new()
611    }
612}
613
614/// Iterator over Roaring Bitmap
615pub struct RoaringIterator<'a> {
616    bitmap: &'a RoaringBitmap,
617    container_idx: usize,
618    value_idx: usize,
619    /// Number of valid values in current_values
620    current_len: usize,
621    /// Pre-allocated buffer for container values (max 65536 values per container)
622    current_values: Vec<u16>,
623}
624
625impl<'a> Iterator for RoaringIterator<'a> {
626    type Item = u32;
627
628    fn next(&mut self) -> Option<Self::Item> {
629        loop {
630            if self.value_idx < self.current_len {
631                let high = self.bitmap.containers[self.container_idx - 1].0 as u32;
632                let low = self.current_values[self.value_idx] as u32;
633                self.value_idx += 1;
634                return Some((high << 16) | low);
635            }
636
637            if self.container_idx >= self.bitmap.containers.len() {
638                return None;
639            }
640
641            let (_, container) = &self.bitmap.containers[self.container_idx];
642            // Decode into pre-allocated buffer (no allocation!)
643            self.current_len = Self::decode_container_into(container, &mut self.current_values);
644            self.value_idx = 0;
645            self.container_idx += 1;
646        }
647    }
648}
649
650impl<'a> RoaringIterator<'a> {
651    /// Decode container values into a pre-allocated buffer
652    /// Returns the number of values decoded
653    #[inline]
654    fn decode_container_into(container: &Container, output: &mut Vec<u16>) -> usize {
655        match container {
656            Container::Array(arr) => {
657                let len = arr.len();
658                if output.len() < len {
659                    output.resize(len, 0);
660                }
661                output[..len].copy_from_slice(arr);
662                len
663            }
664            Container::Bitmap(bm) => Self::decode_bitmap_into(bm, output),
665            Container::Runs(runs) => {
666                let mut count = 0;
667                for &(start, len) in runs {
668                    for i in 0..=len {
669                        let val = start + i;
670                        if count >= output.len() {
671                            output.push(val);
672                        } else {
673                            output[count] = val;
674                        }
675                        count += 1;
676                    }
677                }
678                count
679            }
680        }
681    }
682
683    /// Decode bitmap container into output buffer
684    /// Uses optimized bit extraction
685    #[inline]
686    fn decode_bitmap_into(bm: &[u64; 1024], output: &mut Vec<u16>) -> usize {
687        let mut count = 0;
688
689        for (word_idx, &word) in bm.iter().enumerate() {
690            if word == 0 {
691                continue;
692            }
693
694            let base = (word_idx * 64) as u16;
695            let mut w = word;
696
697            // Unroll the bit extraction loop for better performance
698            while w != 0 {
699                let bit_idx = w.trailing_zeros() as u16;
700                let val = base + bit_idx;
701
702                if count >= output.len() {
703                    output.push(val);
704                } else {
705                    output[count] = val;
706                }
707                count += 1;
708                w &= w - 1; // Clear lowest set bit
709            }
710        }
711
712        count
713    }
714}
715
716/// Block size for Roaring BlockMax (matches container size = 65536 doc_ids)
717pub const ROARING_BLOCK_SIZE: usize = 65536;
718
719/// Block metadata for BlockMax WAND optimization in Roaring
720#[derive(Debug, Clone)]
721pub struct RoaringBlockInfo {
722    /// High 16 bits (container key)
723    pub container_key: u16,
724    /// First doc_id in this container
725    pub first_doc_id: u32,
726    /// Last doc_id in this container
727    pub last_doc_id: u32,
728    /// Maximum term frequency in this block
729    pub max_tf: u32,
730    /// Upper bound BM25 score for this block
731    pub max_block_score: f32,
732    /// Number of documents in this container
733    pub num_docs: u32,
734}
735
736impl RoaringBlockInfo {
737    /// Serialize block info
738    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
739        writer.write_u16::<LittleEndian>(self.container_key)?;
740        writer.write_u32::<LittleEndian>(self.first_doc_id)?;
741        writer.write_u32::<LittleEndian>(self.last_doc_id)?;
742        writer.write_u32::<LittleEndian>(self.max_tf)?;
743        writer.write_f32::<LittleEndian>(self.max_block_score)?;
744        writer.write_u32::<LittleEndian>(self.num_docs)?;
745        Ok(())
746    }
747
748    /// Deserialize block info
749    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
750        Ok(Self {
751            container_key: reader.read_u16::<LittleEndian>()?,
752            first_doc_id: reader.read_u32::<LittleEndian>()?,
753            last_doc_id: reader.read_u32::<LittleEndian>()?,
754            max_tf: reader.read_u32::<LittleEndian>()?,
755            max_block_score: reader.read_f32::<LittleEndian>()?,
756            num_docs: reader.read_u32::<LittleEndian>()?,
757        })
758    }
759}
760
761/// Roaring bitmap with term frequencies for posting lists
762#[derive(Debug, Clone)]
763pub struct RoaringPostingList {
764    /// Document IDs as roaring bitmap
765    pub doc_ids: RoaringBitmap,
766    /// Term frequencies (sparse map for non-1 frequencies)
767    /// Most terms have tf=1, so we only store exceptions
768    pub term_freqs: Vec<(u32, u32)>,
769    /// Default term frequency (usually 1)
770    pub default_tf: u32,
771    /// Maximum term frequency
772    pub max_tf: u32,
773    /// Block metadata for BlockMax WAND (one per container)
774    pub blocks: Vec<RoaringBlockInfo>,
775    /// Global maximum score across all blocks
776    pub max_score: f32,
777}
778
779impl RoaringPostingList {
780    /// Create from doc_ids and term frequencies (without IDF - for backwards compatibility)
781    pub fn from_postings(doc_ids: &[u32], term_freqs: &[u32]) -> Self {
782        Self::from_postings_with_idf(doc_ids, term_freqs, 1.0)
783    }
784
785    /// Create from doc_ids and term frequencies with IDF for block-max scores
786    pub fn from_postings_with_idf(doc_ids: &[u32], term_freqs: &[u32], idf: f32) -> Self {
787        assert_eq!(doc_ids.len(), term_freqs.len());
788
789        if doc_ids.is_empty() {
790            return Self {
791                doc_ids: RoaringBitmap::new(),
792                term_freqs: Vec::new(),
793                default_tf: 1,
794                max_tf: 0,
795                blocks: Vec::new(),
796                max_score: 0.0,
797            };
798        }
799
800        let bitmap = RoaringBitmap::from_sorted_slice(doc_ids);
801
802        // Find most common TF (usually 1)
803        let mut tf_counts = std::collections::HashMap::new();
804        for &tf in term_freqs {
805            *tf_counts.entry(tf).or_insert(0u32) += 1;
806        }
807        let default_tf = tf_counts
808            .iter()
809            .max_by_key(|&(_, count)| count)
810            .map(|(&tf, _)| tf)
811            .unwrap_or(1);
812
813        // Store only non-default TFs
814        let exceptions: Vec<(u32, u32)> = doc_ids
815            .iter()
816            .zip(term_freqs.iter())
817            .filter(|&(_, &tf)| tf != default_tf)
818            .map(|(&doc, &tf)| (doc, tf))
819            .collect();
820
821        let max_tf = *term_freqs.iter().max().unwrap_or(&1);
822
823        // Build block metadata (one block per container)
824        // Group doc_ids by container (high 16 bits)
825        let mut blocks = Vec::new();
826        let mut max_score = 0.0f32;
827        let mut i = 0;
828
829        while i < doc_ids.len() {
830            let container_key = (doc_ids[i] >> 16) as u16;
831            let block_start = i;
832
833            // Find all docs in this container
834            while i < doc_ids.len() && (doc_ids[i] >> 16) as u16 == container_key {
835                i += 1;
836            }
837
838            let block_doc_ids = &doc_ids[block_start..i];
839            let block_tfs = &term_freqs[block_start..i];
840            let block_max_tf = *block_tfs.iter().max().unwrap_or(&1);
841            let block_score = crate::query::bm25_upper_bound(block_max_tf as f32, idf);
842            max_score = max_score.max(block_score);
843
844            blocks.push(RoaringBlockInfo {
845                container_key,
846                first_doc_id: block_doc_ids[0],
847                last_doc_id: *block_doc_ids.last().unwrap(),
848                max_tf: block_max_tf,
849                max_block_score: block_score,
850                num_docs: block_doc_ids.len() as u32,
851            });
852        }
853
854        Self {
855            doc_ids: bitmap,
856            term_freqs: exceptions,
857            default_tf,
858            max_tf,
859            blocks,
860            max_score,
861        }
862    }
863
864    /// Get term frequency for a document
865    pub fn get_tf(&self, doc_id: u32) -> u32 {
866        // Binary search in exceptions
867        match self.term_freqs.binary_search_by_key(&doc_id, |&(d, _)| d) {
868            Ok(idx) => self.term_freqs[idx].1,
869            Err(_) => self.default_tf,
870        }
871    }
872
873    /// Number of documents
874    pub fn len(&self) -> u32 {
875        self.doc_ids.cardinality()
876    }
877
878    /// Check if empty
879    pub fn is_empty(&self) -> bool {
880        self.doc_ids.is_empty()
881    }
882
883    /// Get number of blocks (containers)
884    pub fn num_blocks(&self) -> usize {
885        self.blocks.len()
886    }
887
888    /// Get block index for a doc_id
889    pub fn block_for_doc(&self, doc_id: u32) -> Option<usize> {
890        let container_key = (doc_id >> 16) as u16;
891        self.blocks
892            .binary_search_by_key(&container_key, |b| b.container_key)
893            .ok()
894    }
895
896    /// Serialize
897    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
898        self.doc_ids.serialize(writer)?;
899        writer.write_u32::<LittleEndian>(self.default_tf)?;
900        writer.write_u32::<LittleEndian>(self.max_tf)?;
901        writer.write_f32::<LittleEndian>(self.max_score)?;
902        writer.write_u32::<LittleEndian>(self.term_freqs.len() as u32)?;
903        for &(doc, tf) in &self.term_freqs {
904            writer.write_u32::<LittleEndian>(doc)?;
905            writer.write_u32::<LittleEndian>(tf)?;
906        }
907
908        // Write block metadata
909        writer.write_u32::<LittleEndian>(self.blocks.len() as u32)?;
910        for block in &self.blocks {
911            block.serialize(writer)?;
912        }
913
914        Ok(())
915    }
916
917    /// Deserialize
918    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
919        let doc_ids = RoaringBitmap::deserialize(reader)?;
920        let default_tf = reader.read_u32::<LittleEndian>()?;
921        let max_tf = reader.read_u32::<LittleEndian>()?;
922        let max_score = reader.read_f32::<LittleEndian>()?;
923        let num_exceptions = reader.read_u32::<LittleEndian>()? as usize;
924        let mut term_freqs = Vec::with_capacity(num_exceptions);
925        for _ in 0..num_exceptions {
926            let doc = reader.read_u32::<LittleEndian>()?;
927            let tf = reader.read_u32::<LittleEndian>()?;
928            term_freqs.push((doc, tf));
929        }
930
931        // Read block metadata
932        let num_blocks = reader.read_u32::<LittleEndian>()? as usize;
933        let mut blocks = Vec::with_capacity(num_blocks);
934        for _ in 0..num_blocks {
935            blocks.push(RoaringBlockInfo::deserialize(reader)?);
936        }
937
938        Ok(Self {
939            doc_ids,
940            term_freqs,
941            default_tf,
942            max_tf,
943            blocks,
944            max_score,
945        })
946    }
947
948    /// Create iterator
949    pub fn iterator(&self) -> RoaringPostingIterator<'_> {
950        RoaringPostingIterator {
951            list: self,
952            doc_iter: self.doc_ids.iter(),
953            current_doc: None,
954            current_block: 0,
955        }
956    }
957}
958
959/// Iterator over Roaring posting list with BlockMax support
960pub struct RoaringPostingIterator<'a> {
961    list: &'a RoaringPostingList,
962    doc_iter: RoaringIterator<'a>,
963    current_doc: Option<u32>,
964    current_block: usize,
965}
966
967impl<'a> RoaringPostingIterator<'a> {
968    /// Current document ID
969    pub fn doc(&self) -> u32 {
970        self.current_doc.unwrap_or(u32::MAX)
971    }
972
973    /// Current term frequency
974    pub fn term_freq(&self) -> u32 {
975        match self.current_doc {
976            Some(doc) => self.list.get_tf(doc),
977            None => 0,
978        }
979    }
980
981    /// Advance to next document
982    pub fn advance(&mut self) -> u32 {
983        self.current_doc = self.doc_iter.next();
984        // Update current block if needed
985        if let Some(doc) = self.current_doc
986            && !self.list.blocks.is_empty()
987        {
988            let container_key = (doc >> 16) as u16;
989            // Move to next block if we've passed current one
990            while self.current_block < self.list.blocks.len()
991                && self.list.blocks[self.current_block].container_key < container_key
992            {
993                self.current_block += 1;
994            }
995        }
996        self.doc()
997    }
998
999    /// Initialize (must be called before first use)
1000    pub fn init(&mut self) {
1001        self.current_doc = self.doc_iter.next();
1002        self.current_block = 0;
1003    }
1004
1005    /// Seek to first doc >= target
1006    pub fn seek(&mut self, target: u32) -> u32 {
1007        // Use block skip list for faster seeking
1008        if !self.list.blocks.is_empty() {
1009            let target_container = (target >> 16) as u16;
1010
1011            // Skip blocks that are entirely before target
1012            while self.current_block < self.list.blocks.len()
1013                && self.list.blocks[self.current_block].last_doc_id < target
1014            {
1015                self.current_block += 1;
1016            }
1017
1018            // If we've exhausted all blocks
1019            if self.current_block >= self.list.blocks.len() {
1020                self.current_doc = None;
1021                return u32::MAX;
1022            }
1023
1024            // Skip docs until we reach the target block's first doc
1025            let block = &self.list.blocks[self.current_block];
1026            if block.container_key > target_container
1027                || (block.container_key == target_container && block.first_doc_id > self.doc())
1028            {
1029                // Need to advance iterator to this block
1030                while let Some(doc) = self.current_doc {
1031                    if doc >= block.first_doc_id {
1032                        break;
1033                    }
1034                    self.current_doc = self.doc_iter.next();
1035                }
1036            }
1037        }
1038
1039        // Linear scan within block
1040        while let Some(doc) = self.current_doc {
1041            if doc >= target {
1042                return doc;
1043            }
1044            self.current_doc = self.doc_iter.next();
1045        }
1046        u32::MAX
1047    }
1048
1049    /// Check if iterator is exhausted
1050    pub fn is_exhausted(&self) -> bool {
1051        self.current_doc.is_none()
1052    }
1053
1054    /// Get current block's max score (for BlockMax WAND)
1055    pub fn current_block_max_score(&self) -> f32 {
1056        if self.current_doc.is_none() || self.list.blocks.is_empty() {
1057            return 0.0;
1058        }
1059        if self.current_block < self.list.blocks.len() {
1060            self.list.blocks[self.current_block].max_block_score
1061        } else {
1062            0.0
1063        }
1064    }
1065
1066    /// Get current block's max term frequency
1067    pub fn current_block_max_tf(&self) -> u32 {
1068        if self.current_doc.is_none() || self.list.blocks.is_empty() {
1069            return 0;
1070        }
1071        if self.current_block < self.list.blocks.len() {
1072            self.list.blocks[self.current_block].max_tf
1073        } else {
1074            0
1075        }
1076    }
1077
1078    /// Get max score for remaining blocks (for MaxScore optimization)
1079    pub fn max_remaining_score(&self) -> f32 {
1080        if self.current_doc.is_none() || self.list.blocks.is_empty() {
1081            return 0.0;
1082        }
1083        self.list.blocks[self.current_block..]
1084            .iter()
1085            .map(|b| b.max_block_score)
1086            .fold(0.0f32, |a, b| a.max(b))
1087    }
1088
1089    /// Skip to next block containing doc >= target (for BlockWAND)
1090    /// Returns (first_doc_in_block, block_max_score) or None if exhausted
1091    pub fn skip_to_block_with_doc(&mut self, target: u32) -> Option<(u32, f32)> {
1092        if self.list.blocks.is_empty() {
1093            return None;
1094        }
1095
1096        while self.current_block < self.list.blocks.len() {
1097            let block = &self.list.blocks[self.current_block];
1098            if block.last_doc_id >= target {
1099                // Advance iterator to this block's first doc
1100                while let Some(doc) = self.current_doc {
1101                    if doc >= block.first_doc_id {
1102                        break;
1103                    }
1104                    self.current_doc = self.doc_iter.next();
1105                }
1106                return Some((block.first_doc_id, block.max_block_score));
1107            }
1108            self.current_block += 1;
1109        }
1110
1111        self.current_doc = None;
1112        None
1113    }
1114}
1115
1116#[cfg(test)]
1117mod tests {
1118    use super::*;
1119
1120    #[test]
1121    fn test_roaring_basic() {
1122        let mut bitmap = RoaringBitmap::new();
1123        bitmap.insert(1);
1124        bitmap.insert(100);
1125        bitmap.insert(1000);
1126        bitmap.insert(100000);
1127
1128        assert!(bitmap.contains(1));
1129        assert!(bitmap.contains(100));
1130        assert!(bitmap.contains(1000));
1131        assert!(bitmap.contains(100000));
1132        assert!(!bitmap.contains(2));
1133        assert!(!bitmap.contains(50000));
1134
1135        assert_eq!(bitmap.cardinality(), 4);
1136    }
1137
1138    #[test]
1139    fn test_roaring_from_sorted() {
1140        let values: Vec<u32> = (0..10000).map(|i| i * 3).collect();
1141        let bitmap = RoaringBitmap::from_sorted_slice(&values);
1142
1143        assert_eq!(bitmap.cardinality(), 10000);
1144
1145        for &val in &values {
1146            assert!(bitmap.contains(val), "Missing value {}", val);
1147        }
1148    }
1149
1150    #[test]
1151    fn test_roaring_intersection() {
1152        let a = RoaringBitmap::from_sorted_slice(&[1, 2, 3, 100, 200, 300]);
1153        let b = RoaringBitmap::from_sorted_slice(&[2, 3, 4, 200, 300, 400]);
1154
1155        let result = a.and(&b);
1156
1157        assert_eq!(result.cardinality(), 4);
1158        assert!(result.contains(2));
1159        assert!(result.contains(3));
1160        assert!(result.contains(200));
1161        assert!(result.contains(300));
1162    }
1163
1164    #[test]
1165    fn test_roaring_union() {
1166        let a = RoaringBitmap::from_sorted_slice(&[1, 2, 3]);
1167        let b = RoaringBitmap::from_sorted_slice(&[3, 4, 5]);
1168
1169        let result = a.or(&b);
1170
1171        assert_eq!(result.cardinality(), 5);
1172        for i in 1..=5 {
1173            assert!(result.contains(i));
1174        }
1175    }
1176
1177    #[test]
1178    fn test_roaring_serialization() {
1179        let values: Vec<u32> = (0..1000).map(|i| i * 7).collect();
1180        let bitmap = RoaringBitmap::from_sorted_slice(&values);
1181
1182        let mut buffer = Vec::new();
1183        bitmap.serialize(&mut buffer).unwrap();
1184
1185        let restored = RoaringBitmap::deserialize(&mut &buffer[..]).unwrap();
1186
1187        assert_eq!(restored.cardinality(), bitmap.cardinality());
1188        for &val in &values {
1189            assert!(restored.contains(val));
1190        }
1191    }
1192
1193    #[test]
1194    fn test_roaring_posting_list() {
1195        let doc_ids: Vec<u32> = vec![1, 5, 10, 50, 100, 500, 1000];
1196        let term_freqs: Vec<u32> = vec![1, 1, 2, 1, 3, 1, 1];
1197
1198        let list = RoaringPostingList::from_postings(&doc_ids, &term_freqs);
1199
1200        assert_eq!(list.len(), 7);
1201        assert_eq!(list.default_tf, 1);
1202        assert_eq!(list.max_tf, 3);
1203
1204        // Check TF lookups
1205        assert_eq!(list.get_tf(1), 1);
1206        assert_eq!(list.get_tf(10), 2);
1207        assert_eq!(list.get_tf(100), 3);
1208    }
1209
1210    #[test]
1211    fn test_roaring_iterator() {
1212        let values: Vec<u32> = vec![1, 10, 100, 1000, 10000];
1213        let bitmap = RoaringBitmap::from_sorted_slice(&values);
1214
1215        let collected: Vec<u32> = bitmap.iter().collect();
1216        assert_eq!(collected, values);
1217    }
1218
1219    #[test]
1220    fn test_roaring_block_max() {
1221        // Create posting list spanning multiple containers (high 16 bits)
1222        // Container 0: doc_ids 0-65535
1223        // Container 1: doc_ids 65536-131071
1224        // Container 2: doc_ids 131072-196607
1225        let mut doc_ids = Vec::new();
1226        let mut term_freqs = Vec::new();
1227
1228        // Container 0: 100 docs with max_tf = 2
1229        for i in 0..100 {
1230            doc_ids.push(i * 100);
1231            term_freqs.push(if i == 50 { 2 } else { 1 });
1232        }
1233
1234        // Container 1: 100 docs with max_tf = 5
1235        for i in 0..100 {
1236            doc_ids.push(65536 + i * 100);
1237            term_freqs.push(if i == 25 { 5 } else { 1 });
1238        }
1239
1240        // Container 2: 100 docs with max_tf = 3
1241        for i in 0..100 {
1242            doc_ids.push(131072 + i * 100);
1243            term_freqs.push(if i == 75 { 3 } else { 1 });
1244        }
1245
1246        let list = RoaringPostingList::from_postings_with_idf(&doc_ids, &term_freqs, 2.0);
1247
1248        // Should have 3 blocks (one per container)
1249        assert_eq!(list.num_blocks(), 3);
1250        assert_eq!(list.blocks[0].container_key, 0);
1251        assert_eq!(list.blocks[1].container_key, 1);
1252        assert_eq!(list.blocks[2].container_key, 2);
1253
1254        assert_eq!(list.blocks[0].max_tf, 2);
1255        assert_eq!(list.blocks[1].max_tf, 5);
1256        assert_eq!(list.blocks[2].max_tf, 3);
1257
1258        // Block 1 should have highest score (max_tf = 5)
1259        assert!(list.blocks[1].max_block_score > list.blocks[0].max_block_score);
1260        assert!(list.blocks[1].max_block_score > list.blocks[2].max_block_score);
1261
1262        // Global max_score should equal block 1's score
1263        assert_eq!(list.max_score, list.blocks[1].max_block_score);
1264
1265        // Test iterator block-max methods
1266        let mut iter = list.iterator();
1267        iter.init();
1268        assert_eq!(iter.current_block_max_tf(), 2); // Block 0
1269
1270        // Seek to block 1
1271        iter.seek(65536);
1272        assert_eq!(iter.current_block_max_tf(), 5);
1273
1274        // Seek to block 2
1275        iter.seek(131072);
1276        assert_eq!(iter.current_block_max_tf(), 3);
1277    }
1278
1279    #[test]
1280    fn test_roaring_block_max_serialization() {
1281        let mut doc_ids = Vec::new();
1282        let mut term_freqs = Vec::new();
1283
1284        // Two containers
1285        for i in 0..50 {
1286            doc_ids.push(i * 10);
1287            term_freqs.push((i % 5) + 1);
1288        }
1289        for i in 0..50 {
1290            doc_ids.push(65536 + i * 10);
1291            term_freqs.push((i % 3) + 1);
1292        }
1293
1294        let list = RoaringPostingList::from_postings_with_idf(&doc_ids, &term_freqs, 1.5);
1295
1296        let mut buffer = Vec::new();
1297        list.serialize(&mut buffer).unwrap();
1298
1299        let restored = RoaringPostingList::deserialize(&mut &buffer[..]).unwrap();
1300
1301        assert_eq!(restored.len(), list.len());
1302        assert_eq!(restored.max_tf, list.max_tf);
1303        assert_eq!(restored.max_score, list.max_score);
1304        assert_eq!(restored.num_blocks(), list.num_blocks());
1305
1306        // Verify block metadata
1307        for (orig, rest) in list.blocks.iter().zip(restored.blocks.iter()) {
1308            assert_eq!(orig.container_key, rest.container_key);
1309            assert_eq!(orig.first_doc_id, rest.first_doc_id);
1310            assert_eq!(orig.last_doc_id, rest.last_doc_id);
1311            assert_eq!(orig.max_tf, rest.max_tf);
1312            assert_eq!(orig.max_block_score, rest.max_block_score);
1313        }
1314
1315        // Verify iteration produces same results
1316        let mut iter1 = list.iterator();
1317        let mut iter2 = restored.iterator();
1318        iter1.init();
1319        iter2.init();
1320
1321        while !iter1.is_exhausted() {
1322            assert_eq!(iter1.doc(), iter2.doc());
1323            assert_eq!(iter1.term_freq(), iter2.term_freq());
1324            iter1.advance();
1325            iter2.advance();
1326        }
1327        assert!(iter2.is_exhausted());
1328    }
1329}