bitrush_index/ozbcbitmap/
mod.rs

1// This code is released under the
2// General Public License (GPL), version 3
3// http://www.gnu.org/licenses/gpl-3.0.en.html
4// (c) Lorenzo Vannucci
5
6//! # OZBCBitmap
7//!
8//! A compressed bitmap for bitmap indexes.
9//!
10//! # Encoding
11//! OZBCBitmap encodes bits in 16bits words. There are two types of words:
12//!
13//!  0:  |1bit word_type=0|7bit    bytes_zero|8bit    dirty_byte|
14//!  1:  |1bit word_type=1|         15bit 128_bytes_zero        |
15//!
16//! Where:
17//! - bytes_zero = number of consecutive sequence of 8bit zeros.
18//! - dirty_byte = uncompressed 8bit.
19//! - 128_bytes_zero = number of consecutive sequence of 1024bit of zeros.
20//!
21//! Note:
22//! - The max size of this compressed bitmap is twice the size of the same uncompressed bitmap.
23//! - The max number of consecutive zero bits that can be rapresented from
24//! a single word is ((2^15) - 1) * (2^10) = (2^25 - 2^10) bits.
25//!
26//! A older version of OZBCBitmap encoding: https://github.com/uccidibuti/OZBCBitmap .
27
28/// [`Debug`]: https://doc.rust-lang.org/std/fmt/trait.Debug.html
29/// [`BitAnd`]: https://doc.rust-lang.org/std/ops/trait.BitAnd.html
30/// [`Bitmap`]: ../bitmap_index/bitmap.rs
31/// [`BitmapIndex`]: ../bitmap_index/mod.rs
32
33use std::mem;
34use std::ops::{BitAnd};
35use std::result::Result;
36use crate::bitmap_index::Bitmap;
37
38const OZBC_MAX_128_BYTES_ZERO: u16 = ((1 << 15) - 1);
39const OZBC_MAX_BYTES_ZERO: u32 = ((OZBC_MAX_128_BYTES_ZERO as u32) << 7);
40
41macro_rules! get_bytes_from_word {
42    (0 $input:expr) => {
43        ((($input) >> 8) + 1)
44    };
45    (1 $input:expr) => {
46        ((($input) & OZBC_MAX_128_BYTES_ZERO as u32) << 7)
47    };
48}
49
50macro_rules! get_word_type {
51    ($input:expr) => {
52        ($input) >> 15
53    };
54}
55
56macro_rules! get_dirty_byte {
57    ($input:expr) => {
58        ($input) & 255
59    };
60}
61
62#[derive(Clone, PartialEq)]
63pub struct OZBCBitmap {
64    buffer: Vec<u16>,
65    num_bytes: u32,
66}
67
68/// Impl [`Debug`] trait printing each bit of every bitmap words.
69impl std::fmt::Debug for OZBCBitmap {
70    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
71        let mut output: String = format!("Number of bits: {}\n", (self.num_bytes << 3));
72        // let mut num_bytes = 0;
73        for word in &self.buffer {
74            let t: u16 = get_word_type!(word);
75            for j in (0..16).rev() {
76                let x: u16 = (word >> j) & (1 as u16);
77                output.push_str(&x.to_string());
78                if j == 15 || (j == 8 && t == 0) {
79                    output.push_str("|");
80                }
81            }
82            output.push_str("\n");
83        }
84        write!(f, "{}", output)
85    }
86}
87
88
89/// Impl [`BitAnd`] running "logical and" bit operation between 2 bitmaps.
90impl BitAnd for &OZBCBitmap {
91    type Output = OZBCBitmap;
92
93    fn bitand(self, b2: Self) -> OZBCBitmap {
94        let mut bitmap_to_return = OZBCBitmap::new();
95        let v0 = self.buffer.as_slice();
96        let v1 = b2.buffer.as_slice();
97        let mut i: usize = 0; // v0 index
98        let mut j: usize = 0; // v1 index
99        let mut count_bytes: u32 = 0;
100        let mut scanned_bytes: (u32, u32) = (0, 0);
101
102        while i < v0.len() && j < v1.len() {
103            let w0: u32 = unsafe { *v0.get_unchecked(i) } as u32;
104            let w1: u32 = unsafe { *v1.get_unchecked(j) } as u32;
105
106            let word_type: u8 = ((get_word_type!(w1) << 1) | get_word_type!(w0)) as u8;
107            let bytes_in_word = match word_type {
108                // w0 and w1 are of type 0
109                0b00 => (get_bytes_from_word!(0 w0), get_bytes_from_word!(0 w1)),
110                // w0 is of type 1, w1 is of type 0
111                0b01 => (get_bytes_from_word!(1 w0), get_bytes_from_word!(0 w1)),
112                // w0 is of type 0, w1 is of type 1
113                0b10 => (get_bytes_from_word!(0 w0), get_bytes_from_word!(1 w1)),
114                // w0 and w1 are of type 1
115                0b11 => (get_bytes_from_word!(1 w0), get_bytes_from_word!(1 w1)),
116                _ => panic!("Error occured on bitmap and"),
117            };
118            i += 1;
119            j += 1;
120            scanned_bytes.0 += bytes_in_word.0;
121            scanned_bytes.1 += bytes_in_word.1;
122
123            if scanned_bytes.0 < scanned_bytes.1 {
124                scanned_bytes.1 -= bytes_in_word.1;
125                j -= 1;
126            } else if scanned_bytes.0 > scanned_bytes.1 {
127                scanned_bytes.0 -= bytes_in_word.0;
128                i -= 1;
129            } else if word_type == 0 {
130                let mut bytes_zero = scanned_bytes.0 - count_bytes - 1;
131                let dirty_byte =
132                    unsafe { get_dirty_byte!(*v0.get_unchecked(i - 1) & *v1.get_unchecked(j - 1)) };
133
134                if dirty_byte != 0 {
135                    count_bytes = scanned_bytes.0;
136                    if bytes_zero < (1 << 7) {
137                        bitmap_to_return
138                            .buffer
139                            .push((bytes_zero << 8) as u16 | dirty_byte);
140                    } else {
141                        while bytes_zero > OZBC_MAX_BYTES_ZERO {
142                            bitmap_to_return
143                                .buffer
144                                .push(OZBC_MAX_128_BYTES_ZERO | (1 << 15));
145                            bytes_zero -= OZBC_MAX_BYTES_ZERO;
146                        }
147                        bitmap_to_return
148                            .buffer
149                            .push((bytes_zero >> 7) as u16 | (1 << 15));
150                        bitmap_to_return
151                            .buffer
152                            .push(((bytes_zero as u16 & 127) << 8) | dirty_byte);
153                    }
154                } // end if dirty_bytes != 0
155            } // end if word_type == 0
156        } // end while
157
158        bitmap_to_return.num_bytes = count_bytes;
159        bitmap_to_return
160    }
161}
162
163/// Impl [`Bitmap`] to allow to use OZBCBitmap in [`BitmapIndex`].
164impl Bitmap for OZBCBitmap {
165    
166    /// Return new empty bitmap.
167    fn new() -> OZBCBitmap {
168        OZBCBitmap {
169            buffer: Vec::new(),
170            num_bytes: 0,
171        }
172    }
173
174    /// Set the ith bit (starting from zero). You must set the bitmap in increasing
175    /// order otherwise nothing happend:
176    /// `set(0), set(16), set(1000)` is the same of `set(0), set(16), set(1000), set(50)`.
177    ///
178    /// # Example
179    ///
180    /// ```
181    /// use bitrush_index::OZBCBitmap;
182    /// use bitrush_index::Bitmap;
183    ///
184    /// fn main() {        
185    ///     let mut b0 = OZBCBitmap::new();
186    ///     let mut b1 = b0.clone();
187    ///     let values = [0, 1, 100, 100000, 2,  100001];
188    ///     let values_ok = [0, 1, 100, 100000, 100001];
189    ///     for val in values.iter() {
190    ///         b0.set(*val);
191    ///     }
192    ///     for val in values_ok.iter() {
193    ///         b1.set(*val);
194    ///     }
195    ///     assert_eq!(b0, b1);
196    /// }
197    /// ```
198    fn set(&mut self, i: u32) {
199        let dirty_bit = (i & 7) as u16;
200        let dirty_byte = 1 << dirty_bit;
201        let mut bytes_zero: i32 = (i >> 3) as i32 - self.num_bytes as i32;
202        if bytes_zero >= 0 {
203            self.num_bytes += bytes_zero as u32 + 1;
204            if bytes_zero < 128 {
205                self.buffer.push(((bytes_zero as u16) << 8) | dirty_byte);
206            } else {
207                while bytes_zero as u32 > OZBC_MAX_BYTES_ZERO {
208                    self.buffer.push((1 << 15) | OZBC_MAX_128_BYTES_ZERO);
209                    bytes_zero -= OZBC_MAX_BYTES_ZERO as i32;
210                }
211                self.buffer.push((1 << 15) | ((bytes_zero >> 7) as u16));
212                self.buffer
213                    .push((((bytes_zero as u16) & 127) << 8) | dirty_byte);
214            }
215        } else if bytes_zero == -1 && get_dirty_byte!(*self.buffer.last_mut().unwrap()) < dirty_byte
216        {
217            *self.buffer.last_mut().unwrap() |= dirty_byte;
218        }
219    }
220
221    /// Return a vector with all positions of set bit.
222    fn unroll_bitmap(&self) -> Vec<u32> {
223        let mut pos_set: u32 = 0;
224        let mut unrolled_bitmap: Vec<u32> = Vec::with_capacity(self.buffer.len());
225
226        for word in &self.buffer {
227            if get_word_type!(word) as u32 == 0 {
228                let bytes_zero = (word >> 8) as u32;
229                pos_set += bytes_zero << 3;
230                let dirty_byte = get_dirty_byte!(word);
231
232                for j in 0..8 {
233                    if (dirty_byte >> j) & 1 == 1 {
234                        unrolled_bitmap.push(pos_set + j);
235                    }
236                }
237                pos_set += 8;
238            } else {
239                let bytes_zero: u32 = ((word & OZBC_MAX_128_BYTES_ZERO) as u32) << 7;
240                pos_set += bytes_zero << 3;
241            }
242        }
243        unrolled_bitmap
244    }
245
246    /// Get bitmap content size.
247    fn size(&self) -> usize {
248        let word_size = mem::size_of::<u16>();
249        let buffer_content_size = self.buffer.len() * word_size;
250        let bitmap_size = buffer_content_size + mem::size_of::<u32>();
251        bitmap_size
252    }
253
254    /// Write bitmap content into buffer_out and return the number of bytes written.
255    fn write_to_buffer(&self, buffer_out: &mut [u8]) -> Result<usize, ()> {
256        let bitmap_content_size = self.size();
257        if buffer_out.len() < bitmap_content_size {
258            return Err(());
259        }
260        let num_bytes_raw: [u8; 4] = unsafe { mem::transmute(self.num_bytes) };
261        // let num_bytes_raw: [u8; 4] = self.num_bytes.to_ne_bytes();
262        buffer_out[0..(num_bytes_raw.len())].copy_from_slice(&num_bytes_raw);
263
264        let buffer_raw: &[u8] = unsafe {
265            std::slice::from_raw_parts(
266                self.buffer.as_ptr() as *const u8,
267                self.buffer.len() * mem::size_of::<u16>(),
268            )
269        };
270        let start_offset = num_bytes_raw.len();
271        let end_offset = start_offset + buffer_raw.len();
272        buffer_out[start_offset..end_offset].copy_from_slice(buffer_raw);
273        Ok(end_offset)
274    }
275
276    /// Read bitmap content from buffer_in, buffer_in must have the exact length of bitmap content.
277    fn read_from_buffer(&mut self, buffer_in: &[u8], check_bitmap: bool) -> Result<(), ()> {
278        let num_bytes_pointer: *const u32 = buffer_in[0..4].as_ptr() as *const u32;
279        let num_bytes: u32 = unsafe { *num_bytes_pointer };
280        // let num_bytes: u32 = u32::from_ne_bytes(buffer_in[0..4].try_into().unwrap());
281
282        let buffer_pointer: *const u16 = buffer_in[4..buffer_in.len()].as_ptr() as *const u16;
283        let buffer_len: usize = (buffer_in.len() - 4) / mem::size_of::<u16>();
284
285        let buffer_slice: &[u16] =
286            unsafe { std::slice::from_raw_parts(buffer_pointer, buffer_len) };
287
288        let mut buffer = vec![0; buffer_slice.len()];
289        buffer.copy_from_slice(buffer_slice);
290
291        if check_bitmap == true {
292            let buffer_num_bytes = OZBCBitmap::get_buffer_num_bytes(&buffer);
293            if buffer_num_bytes != num_bytes {
294                return Err(());
295            }
296        }
297
298        self.num_bytes = num_bytes;
299        self.buffer = buffer;
300
301        Ok(())
302    }
303}
304
305impl OZBCBitmap {
306    fn get_buffer_num_bytes(buffer: &Vec<u16>) -> u32 {
307        buffer.iter().fold(0, |mut num_bytes, &word| {
308            num_bytes += match get_word_type!(word) {
309                0 => get_bytes_from_word!(0(word as u32)),
310                1 => get_bytes_from_word!(1(word as u32)),
311                _ => 0,
312            };
313            num_bytes
314        })
315    }
316}