1use std::convert::TryInto;
2use std::io::Write;
3use std::{fmt, io, u64};
4
5use ownedbytes::OwnedBytes;
6
7#[derive(Clone, Copy, Eq, PartialEq)]
8pub struct TinySet(u64);
9
10impl fmt::Debug for TinySet {
11 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
12 self.into_iter().collect::<Vec<u32>>().fmt(f)
13 }
14}
15
16pub struct TinySetIterator(TinySet);
17impl Iterator for TinySetIterator {
18 type Item = u32;
19
20 #[inline]
21 fn next(&mut self) -> Option<Self::Item> {
22 self.0.pop_lowest()
23 }
24}
25
26impl IntoIterator for TinySet {
27 type Item = u32;
28 type IntoIter = TinySetIterator;
29 fn into_iter(self) -> Self::IntoIter {
30 TinySetIterator(self)
31 }
32}
33
34impl TinySet {
35 pub fn serialize<T: Write>(&self, writer: &mut T) -> io::Result<()> {
36 writer.write_all(self.0.to_le_bytes().as_ref())
37 }
38
39 pub fn into_bytes(self) -> [u8; 8] {
40 self.0.to_le_bytes()
41 }
42
43 #[inline]
44 pub fn deserialize(data: [u8; 8]) -> Self {
45 let val: u64 = u64::from_le_bytes(data);
46 TinySet(val)
47 }
48
49 #[inline]
51 pub fn empty() -> TinySet {
52 TinySet(0u64)
53 }
54
55 #[inline]
57 pub fn full() -> TinySet {
58 TinySet::empty().complement()
59 }
60
61 pub fn clear(&mut self) {
62 self.0 = 0u64;
63 }
64
65 #[inline]
70 fn complement(self) -> TinySet {
71 TinySet(!self.0)
72 }
73
74 #[inline]
76 pub fn contains(self, el: u32) -> bool {
77 !self.intersect(TinySet::singleton(el)).is_empty()
78 }
79
80 #[inline]
82 pub fn len(self) -> u32 {
83 self.0.count_ones()
84 }
85
86 #[inline]
88 #[must_use]
89 pub fn intersect(self, other: TinySet) -> TinySet {
90 TinySet(self.0 & other.0)
91 }
92
93 #[inline]
96 pub fn singleton(el: u32) -> TinySet {
97 TinySet(1u64 << u64::from(el))
98 }
99
100 #[inline]
102 #[must_use]
103 pub fn insert(self, el: u32) -> TinySet {
104 self.union(TinySet::singleton(el))
105 }
106
107 #[inline]
109 #[must_use]
110 pub fn remove(self, el: u32) -> TinySet {
111 self.intersect(TinySet::singleton(el).complement())
112 }
113
114 #[inline]
118 pub fn insert_mut(&mut self, el: u32) -> bool {
119 let old = *self;
120 *self = old.insert(el);
121 old != *self
122 }
123
124 #[inline]
128 pub fn remove_mut(&mut self, el: u32) -> bool {
129 let old = *self;
130 *self = old.remove(el);
131 old != *self
132 }
133
134 #[inline]
136 #[must_use]
137 pub fn union(self, other: TinySet) -> TinySet {
138 TinySet(self.0 | other.0)
139 }
140
141 #[inline]
143 pub fn is_empty(self) -> bool {
144 self.0 == 0u64
145 }
146
147 #[inline]
150 pub fn pop_lowest(&mut self) -> Option<u32> {
151 if self.is_empty() {
152 None
153 } else {
154 let lowest = self.0.trailing_zeros();
155 self.0 ^= TinySet::singleton(lowest).0;
156 Some(lowest)
157 }
158 }
159
160 pub fn range_lower(upper_bound: u32) -> TinySet {
165 TinySet((1u64 << u64::from(upper_bound % 64u32)) - 1u64)
166 }
167
168 pub fn range_greater_or_equal(from_included: u32) -> TinySet {
173 TinySet::range_lower(from_included).complement()
174 }
175}
176
177#[derive(Clone)]
178pub struct BitSet {
179 tinysets: Box<[TinySet]>,
180 len: u64,
181 max_value: u32,
182}
183
184fn num_buckets(max_val: u32) -> u32 {
185 (max_val + 63u32) / 64u32
186}
187
188impl BitSet {
189 pub fn serialize<T: Write>(&self, writer: &mut T) -> io::Result<()> {
191 writer.write_all(self.max_value.to_le_bytes().as_ref())?;
192 for tinyset in self.tinysets.iter().cloned() {
193 writer.write_all(&tinyset.into_bytes())?;
194 }
195 writer.flush()?;
196 Ok(())
197 }
198
199 pub fn with_max_value(max_value: u32) -> BitSet {
202 let num_buckets = num_buckets(max_value);
203 let tinybitsets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice();
204 BitSet {
205 tinysets: tinybitsets,
206 len: 0,
207 max_value,
208 }
209 }
210
211 pub fn with_max_value_and_full(max_value: u32) -> BitSet {
214 let num_buckets = num_buckets(max_value);
215 let mut tinybitsets = vec![TinySet::full(); num_buckets as usize].into_boxed_slice();
216
217 let lower = max_value % 64u32;
219 if lower != 0 {
220 tinybitsets[tinybitsets.len() - 1] = TinySet::range_lower(lower);
221 }
222 BitSet {
223 tinysets: tinybitsets,
224 len: max_value as u64,
225 max_value,
226 }
227 }
228
229 pub fn clear(&mut self) {
231 for tinyset in self.tinysets.iter_mut() {
232 *tinyset = TinySet::empty();
233 }
234 }
235
236 pub fn intersect_update(&mut self, other: &ReadOnlyBitSet) {
238 self.intersect_update_with_iter(other.iter_tinysets());
239 }
240
241 fn intersect_update_with_iter(&mut self, other: impl Iterator<Item = TinySet>) {
243 self.len = 0;
244 for (left, right) in self.tinysets.iter_mut().zip(other) {
245 *left = left.intersect(right);
246 self.len += left.len() as u64;
247 }
248 }
249
250 #[inline]
252 pub fn len(&self) -> usize {
253 self.len as usize
254 }
255
256 #[inline]
258 pub fn insert(&mut self, el: u32) {
259 let higher = el / 64u32;
261 let lower = el % 64u32;
262 self.len += u64::from(self.tinysets[higher as usize].insert_mut(lower));
263 }
264
265 #[inline]
267 pub fn remove(&mut self, el: u32) {
268 let higher = el / 64u32;
270 let lower = el % 64u32;
271 self.len -= u64::from(self.tinysets[higher as usize].remove_mut(lower));
272 }
273
274 #[inline]
276 pub fn contains(&self, el: u32) -> bool {
277 self.tinyset(el / 64u32).contains(el % 64)
278 }
279
280 pub fn first_non_empty_bucket(&self, bucket: u32) -> Option<u32> {
286 self.tinysets[bucket as usize..]
287 .iter()
288 .cloned()
289 .position(|tinyset| !tinyset.is_empty())
290 .map(|delta_bucket| bucket + delta_bucket as u32)
291 }
292
293 #[inline]
294 pub fn max_value(&self) -> u32 {
295 self.max_value
296 }
297
298 pub fn tinyset(&self, bucket: u32) -> TinySet {
302 self.tinysets[bucket as usize]
303 }
304}
305
306#[derive(Clone)]
308pub struct ReadOnlyBitSet {
309 data: OwnedBytes,
310 max_value: u32,
311}
312
313pub fn intersect_bitsets(left: &ReadOnlyBitSet, other: &ReadOnlyBitSet) -> ReadOnlyBitSet {
314 assert_eq!(left.max_value(), other.max_value());
315 assert_eq!(left.data.len(), other.data.len());
316 let union_tinyset_it = left
317 .iter_tinysets()
318 .zip(other.iter_tinysets())
319 .map(|(left_tinyset, right_tinyset)| left_tinyset.intersect(right_tinyset));
320 let mut output_dataset: Vec<u8> = Vec::with_capacity(left.data.len());
321 for tinyset in union_tinyset_it {
322 output_dataset.extend_from_slice(&tinyset.into_bytes());
323 }
324 ReadOnlyBitSet {
325 data: OwnedBytes::new(output_dataset),
326 max_value: left.max_value(),
327 }
328}
329
330impl ReadOnlyBitSet {
331 pub fn open(data: OwnedBytes) -> Self {
332 let (max_value_data, data) = data.split(4);
333 assert_eq!(data.len() % 8, 0);
334 let max_value: u32 = u32::from_le_bytes(max_value_data.as_ref().try_into().unwrap());
335 ReadOnlyBitSet { data, max_value }
336 }
337
338 #[inline]
340 pub fn len(&self) -> usize {
341 self.iter_tinysets()
342 .map(|tinyset| tinyset.len() as usize)
343 .sum()
344 }
345
346 #[inline]
348 fn iter_tinysets(&self) -> impl Iterator<Item = TinySet> + '_ {
349 self.data.chunks_exact(8).map(move |chunk| {
350 let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap());
351 tinyset
352 })
353 }
354
355 #[inline]
357 pub fn iter(&self) -> impl Iterator<Item = u32> + '_ {
358 self.iter_tinysets()
359 .enumerate()
360 .flat_map(move |(chunk_num, tinyset)| {
361 let chunk_base_val = chunk_num as u32 * 64;
362 tinyset
363 .into_iter()
364 .map(move |val| val + chunk_base_val)
365 .take_while(move |doc| *doc < self.max_value)
366 })
367 }
368
369 #[inline]
371 pub fn contains(&self, el: u32) -> bool {
372 let byte_offset = el / 8u32;
373 let b: u8 = self.data[byte_offset as usize];
374 let shift = (el % 8) as u8;
375 b & (1u8 << shift) != 0
376 }
377
378 #[inline]
384 pub fn max_value(&self) -> u32 {
385 self.max_value
386 }
387
388 pub fn num_bytes(&self) -> usize {
390 self.data.len()
391 }
392}
393
394impl<'a> From<&'a BitSet> for ReadOnlyBitSet {
395 fn from(bitset: &'a BitSet) -> ReadOnlyBitSet {
396 let mut buffer = Vec::with_capacity(bitset.tinysets.len() * 8 + 4);
397 bitset
398 .serialize(&mut buffer)
399 .expect("serializing into a buffer should never fail");
400 ReadOnlyBitSet::open(OwnedBytes::new(buffer))
401 }
402}
403
404#[cfg(test)]
405mod tests {
406
407 use std::collections::HashSet;
408
409 use ownedbytes::OwnedBytes;
410 use rand::distributions::Bernoulli;
411 use rand::rngs::StdRng;
412 use rand::{Rng, SeedableRng};
413
414 use super::{BitSet, ReadOnlyBitSet, TinySet};
415
416 #[test]
417 fn test_read_serialized_bitset_full_multi() {
418 for i in 0..1000 {
419 let bitset = BitSet::with_max_value_and_full(i);
420 let mut out = vec![];
421 bitset.serialize(&mut out).unwrap();
422
423 let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
424 assert_eq!(bitset.len(), i as usize);
425 }
426 }
427
428 #[test]
429 fn test_read_serialized_bitset_full_block() {
430 let bitset = BitSet::with_max_value_and_full(64);
431 let mut out = vec![];
432 bitset.serialize(&mut out).unwrap();
433
434 let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
435 assert_eq!(bitset.len(), 64);
436 }
437
438 #[test]
439 fn test_read_serialized_bitset_full() {
440 let mut bitset = BitSet::with_max_value_and_full(5);
441 bitset.remove(3);
442 let mut out = vec![];
443 bitset.serialize(&mut out).unwrap();
444
445 let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
446 assert_eq!(bitset.len(), 4);
447 }
448
449 #[test]
450 fn test_bitset_intersect() {
451 let bitset_serialized = {
452 let mut bitset = BitSet::with_max_value_and_full(5);
453 bitset.remove(1);
454 bitset.remove(3);
455 let mut out = vec![];
456 bitset.serialize(&mut out).unwrap();
457
458 ReadOnlyBitSet::open(OwnedBytes::new(out))
459 };
460
461 let mut bitset = BitSet::with_max_value_and_full(5);
462 bitset.remove(1);
463 bitset.intersect_update(&bitset_serialized);
464
465 assert!(bitset.contains(0));
466 assert!(!bitset.contains(1));
467 assert!(bitset.contains(2));
468 assert!(!bitset.contains(3));
469 assert!(bitset.contains(4));
470
471 bitset.intersect_update_with_iter(vec![TinySet::singleton(0)].into_iter());
472
473 assert!(bitset.contains(0));
474 assert!(!bitset.contains(1));
475 assert!(!bitset.contains(2));
476 assert!(!bitset.contains(3));
477 assert!(!bitset.contains(4));
478 assert_eq!(bitset.len(), 1);
479
480 bitset.intersect_update_with_iter(vec![TinySet::singleton(1)].into_iter());
481 assert!(!bitset.contains(0));
482 assert!(!bitset.contains(1));
483 assert!(!bitset.contains(2));
484 assert!(!bitset.contains(3));
485 assert!(!bitset.contains(4));
486 assert_eq!(bitset.len(), 0);
487 }
488
489 #[test]
490 fn test_read_serialized_bitset_empty() {
491 let mut bitset = BitSet::with_max_value(5);
492 bitset.insert(3);
493 let mut out = vec![];
494 bitset.serialize(&mut out).unwrap();
495
496 let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
497 assert_eq!(bitset.len(), 1);
498
499 {
500 let bitset = BitSet::with_max_value(5);
501 let mut out = vec![];
502 bitset.serialize(&mut out).unwrap();
503 let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
504 assert_eq!(bitset.len(), 0);
505 }
506 }
507
508 #[test]
509 fn test_tiny_set_remove() {
510 {
511 let mut u = TinySet::empty().insert(63u32).insert(5).remove(63u32);
512 assert_eq!(u.pop_lowest(), Some(5u32));
513 assert!(u.pop_lowest().is_none());
514 }
515 {
516 let mut u = TinySet::empty()
517 .insert(63u32)
518 .insert(1)
519 .insert(5)
520 .remove(63u32);
521 assert_eq!(u.pop_lowest(), Some(1u32));
522 assert_eq!(u.pop_lowest(), Some(5u32));
523 assert!(u.pop_lowest().is_none());
524 }
525 {
526 let mut u = TinySet::empty().insert(1).remove(63u32);
527 assert_eq!(u.pop_lowest(), Some(1u32));
528 assert!(u.pop_lowest().is_none());
529 }
530 {
531 let mut u = TinySet::empty().insert(1).remove(1u32);
532 assert!(u.pop_lowest().is_none());
533 }
534 }
535 #[test]
536 fn test_tiny_set() {
537 assert!(TinySet::empty().is_empty());
538 {
539 let mut u = TinySet::empty().insert(1u32);
540 assert_eq!(u.pop_lowest(), Some(1u32));
541 assert!(u.pop_lowest().is_none())
542 }
543 {
544 let mut u = TinySet::empty().insert(1u32).insert(1u32);
545 assert_eq!(u.pop_lowest(), Some(1u32));
546 assert!(u.pop_lowest().is_none())
547 }
548 {
549 let mut u = TinySet::empty().insert(2u32);
550 assert_eq!(u.pop_lowest(), Some(2u32));
551 u.insert_mut(1u32);
552 assert_eq!(u.pop_lowest(), Some(1u32));
553 assert!(u.pop_lowest().is_none());
554 }
555 {
556 let mut u = TinySet::empty().insert(63u32);
557 assert_eq!(u.pop_lowest(), Some(63u32));
558 assert!(u.pop_lowest().is_none());
559 }
560 {
561 let mut u = TinySet::empty().insert(63u32).insert(5);
562 assert_eq!(u.pop_lowest(), Some(5u32));
563 assert_eq!(u.pop_lowest(), Some(63u32));
564 assert!(u.pop_lowest().is_none());
565 }
566 {
567 let original = TinySet::empty().insert(63u32).insert(5);
568 let after_serialize_deserialize = TinySet::deserialize(original.into_bytes());
569 assert_eq!(original, after_serialize_deserialize);
570 }
571 }
572
573 #[test]
574 fn test_bitset() {
575 let test_against_hashset = |els: &[u32], max_value: u32| {
576 let mut hashset: HashSet<u32> = HashSet::new();
577 let mut bitset = BitSet::with_max_value(max_value);
578 for &el in els {
579 assert!(el < max_value);
580 hashset.insert(el);
581 bitset.insert(el);
582 }
583 for el in 0..max_value {
584 assert_eq!(hashset.contains(&el), bitset.contains(el));
585 }
586 assert_eq!(bitset.max_value(), max_value);
587
588 let mut data = vec![];
590 bitset.serialize(&mut data).unwrap();
591 let ro_bitset = ReadOnlyBitSet::open(OwnedBytes::new(data));
592 for el in 0..max_value {
593 assert_eq!(hashset.contains(&el), ro_bitset.contains(el));
594 }
595 assert_eq!(ro_bitset.max_value(), max_value);
596 assert_eq!(ro_bitset.len(), els.len());
597 };
598
599 test_against_hashset(&[], 0);
600 test_against_hashset(&[], 1);
601 test_against_hashset(&[0u32], 1);
602 test_against_hashset(&[0u32], 100);
603 test_against_hashset(&[1u32, 2u32], 4);
604 test_against_hashset(&[99u32], 100);
605 test_against_hashset(&[63u32], 64);
606 test_against_hashset(&[62u32, 63u32], 64);
607 }
608
609 #[test]
610 fn test_bitset_num_buckets() {
611 use super::num_buckets;
612 assert_eq!(num_buckets(0u32), 0);
613 assert_eq!(num_buckets(1u32), 1);
614 assert_eq!(num_buckets(64u32), 1);
615 assert_eq!(num_buckets(65u32), 2);
616 assert_eq!(num_buckets(128u32), 2);
617 assert_eq!(num_buckets(129u32), 3);
618 }
619
620 #[test]
621 fn test_tinyset_range() {
622 assert_eq!(
623 TinySet::range_lower(3).into_iter().collect::<Vec<u32>>(),
624 [0, 1, 2]
625 );
626 assert!(TinySet::range_lower(0).is_empty());
627 assert_eq!(
628 TinySet::range_lower(63).into_iter().collect::<Vec<u32>>(),
629 (0u32..63u32).collect::<Vec<_>>()
630 );
631 assert_eq!(
632 TinySet::range_lower(1).into_iter().collect::<Vec<u32>>(),
633 [0]
634 );
635 assert_eq!(
636 TinySet::range_lower(2).into_iter().collect::<Vec<u32>>(),
637 [0, 1]
638 );
639 assert_eq!(
640 TinySet::range_greater_or_equal(3)
641 .into_iter()
642 .collect::<Vec<u32>>(),
643 (3u32..64u32).collect::<Vec<_>>()
644 );
645 }
646
647 #[test]
648 fn test_bitset_len() {
649 let mut bitset = BitSet::with_max_value(1_000);
650 assert_eq!(bitset.len(), 0);
651 bitset.insert(3u32);
652 assert_eq!(bitset.len(), 1);
653 bitset.insert(103u32);
654 assert_eq!(bitset.len(), 2);
655 bitset.insert(3u32);
656 assert_eq!(bitset.len(), 2);
657 bitset.insert(103u32);
658 assert_eq!(bitset.len(), 2);
659 bitset.insert(104u32);
660 assert_eq!(bitset.len(), 3);
661 bitset.remove(105u32);
662 assert_eq!(bitset.len(), 3);
663 bitset.remove(104u32);
664 assert_eq!(bitset.len(), 2);
665 bitset.remove(3u32);
666 assert_eq!(bitset.len(), 1);
667 bitset.remove(103u32);
668 assert_eq!(bitset.len(), 0);
669 }
670
671 pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec<u32> {
672 StdRng::from_seed([seed_val; 32])
673 .sample_iter(&Bernoulli::new(ratio).unwrap())
674 .take(n as usize)
675 .enumerate()
676 .filter_map(|(val, keep)| if keep { Some(val as u32) } else { None })
677 .collect()
678 }
679
680 pub fn sample(n: u32, ratio: f64) -> Vec<u32> {
681 sample_with_seed(n, ratio, 4)
682 }
683
684 #[test]
685 fn test_bitset_clear() {
686 let mut bitset = BitSet::with_max_value(1_000);
687 let els = sample(1_000, 0.01f64);
688 for &el in &els {
689 bitset.insert(el);
690 }
691 assert!(els.iter().all(|el| bitset.contains(*el)));
692 bitset.clear();
693 for el in 0u32..1000u32 {
694 assert!(!bitset.contains(el));
695 }
696 }
697}
698
699#[cfg(all(test, feature = "unstable"))]
700mod bench {
701
702 use test;
703
704 use super::{BitSet, TinySet};
705
706 #[bench]
707 fn bench_tinyset_pop(b: &mut test::Bencher) {
708 b.iter(|| {
709 let mut tinyset = TinySet::singleton(test::black_box(31u32));
710 tinyset.pop_lowest();
711 tinyset.pop_lowest();
712 tinyset.pop_lowest();
713 tinyset.pop_lowest();
714 tinyset.pop_lowest();
715 tinyset.pop_lowest();
716 });
717 }
718
719 #[bench]
720 fn bench_tinyset_sum(b: &mut test::Bencher) {
721 let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
722 b.iter(|| {
723 assert_eq!(test::black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
724 });
725 }
726
727 #[bench]
728 fn bench_tinyarr_sum(b: &mut test::Bencher) {
729 let v = [10u32, 14u32, 21u32];
730 b.iter(|| test::black_box(v).iter().cloned().sum::<u32>());
731 }
732
733 #[bench]
734 fn bench_bitset_initialize(b: &mut test::Bencher) {
735 b.iter(|| BitSet::with_max_value(1_000_000));
736 }
737}