swh_graph_stdlib/collections/
node_set.rs

1// Copyright (C) 2024  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6//! Sets of values from 0 to a known maximum.
7//!
8//! This module contains a trait to use [`HashSet<usize>`](HashSet) and [`BitVec`]
9//! interchangeably, and defines [`AdaptiveNodeSet`] which switches between the two
10//! based on its content.
11
12use std::collections::HashSet;
13use std::hash::BuildHasher;
14use std::marker::PhantomData;
15
16use rapidhash::RapidBuildHasher;
17use sux::bits::bit_vec::BitVec;
18use sux::traits::{BitVecOps, BitVecOpsMut};
19
20/// Constant controlling when a [`AdaptiveNodeSet`] should be promoted from sparse to dense.
21///
22/// Promotion happens when the number of items in a [`AdaptiveNodeSet`] times this constant
23/// is greater than the maximum value.
24///
25/// The value was computed experimentally, using "find-earliest-revision" (which runs
26/// many BFSs of highly heterogeneous sizes) on the 2023-09-06 graph, running on
27/// Software Heritage's Maxxi computer (Xeon Gold 6342 CPU @ 2.80GHz, 96 threads, 4TB RAM)
28/// That graph contains 34 billion nodes, and performance increases continuously when
29/// increasing up to 100 millions, then plateaus up to 1 billion; though the latter
30/// uses a little less memory most of the time.
31const PROMOTION_THRESHOLD: usize = 64;
32
33pub type NodeId = usize;
34
35/// A set of `usize` with a known maximum value
36pub trait ReadNodeSet {
37    fn contains(&self, node: NodeId) -> bool;
38}
39
40/// A set of `usize` with a known maximum value, that can be mutated
41pub trait NodeSet: ReadNodeSet {
42    fn insert(&mut self, node: NodeId);
43}
44
45pub struct EmptyNodeSet;
46
47impl ReadNodeSet for EmptyNodeSet {
48    #[inline(always)]
49    fn contains(&self, _node: usize) -> bool {
50        false
51    }
52}
53impl IntoIterator for EmptyNodeSet {
54    type Item = NodeId;
55    type IntoIter = std::iter::Empty<NodeId>;
56
57    #[inline(always)]
58    fn into_iter(self) -> Self::IntoIter {
59        std::iter::empty()
60    }
61}
62
63impl<S: BuildHasher> NodeSet for HashSet<usize, S> {
64    #[inline(always)]
65    fn insert(&mut self, node: usize) {
66        HashSet::insert(self, node);
67    }
68}
69impl<S: BuildHasher> ReadNodeSet for HashSet<usize, S> {
70    #[inline(always)]
71    fn contains(&self, node: usize) -> bool {
72        HashSet::contains(self, &node)
73    }
74}
75
76impl NodeSet for BitVec {
77    #[inline(always)]
78    fn insert(&mut self, node: usize) {
79        self.set(node, true);
80    }
81}
82impl ReadNodeSet for BitVec {
83    #[inline(always)]
84    fn contains(&self, node: usize) -> bool {
85        self.get(node)
86    }
87}
88
89#[derive(Debug, PartialEq, Eq)]
90/// A slice of `NodeId` whose values are in ascending order
91///
92/// # Safety
93///
94/// Unsafe code should not rely on this to be sorted, as safe code can build arbitrary instances
95pub struct SortedSlice<Item: Ord, S: AsRef<[Item]> + ?Sized> {
96    pub marker: PhantomData<Item>,
97    pub slice: S,
98}
99
100impl<Item: Ord, S: AsRef<[Item]>> SortedSlice<Item, S> {
101    pub fn new(slice: S) -> Self {
102        SortedSlice {
103            marker: PhantomData,
104            slice,
105        }
106    }
107}
108
109impl<S: AsRef<[NodeId]> + ?Sized> ReadNodeSet for SortedSlice<NodeId, S> {
110    /// Runs in logarithmic time, as it performs a binary search
111    #[inline(always)]
112    fn contains(&self, node: NodeId) -> bool {
113        self.slice.as_ref().binary_search(&node).is_ok()
114    }
115}
116
117impl<Item: Ord, S: AsRef<[Item]>> std::ops::Deref for SortedSlice<Item, S> {
118    type Target = S;
119
120    fn deref(&self) -> &Self::Target {
121        &self.slice
122    }
123}
124
125impl<'a, Item: Ord + Copy> IntoIterator for SortedSlice<Item, &'a [Item]> {
126    type Item = Item;
127    type IntoIter = std::iter::Copied<std::slice::Iter<'a, Item>>;
128
129    fn into_iter(self) -> Self::IntoIter {
130        self.slice.iter().copied()
131    }
132}
133
134impl<'a, Item: Ord, S: AsRef<[Item]>> From<&'a SortedSlice<Item, S>>
135    for SortedSlice<Item, &'a [Item]>
136{
137    fn from(v: &'a SortedSlice<Item, S>) -> Self {
138        SortedSlice {
139            marker: PhantomData,
140            slice: v.slice.as_ref(),
141        }
142    }
143}
144
145/// Implementation of [`NodeSet`] that dynamically changes the underlying representation
146/// based on its content
147///
148/// The current implementation is initialized with a [`HashSet`], but switches to
149/// [`BitVec`] once the data becomes dense.
150///
151/// This has the advantage of allocating little memory if there won't be many elements,
152/// but avoiding the overhead of [`HashSet`] when there are.
153///
154/// ```
155/// # use swh_graph_stdlib::collections::{AdaptiveNodeSet, NodeSet};
156/// let mut node_set = AdaptiveNodeSet::new(100);
157/// assert_eq!(format!("{:?}", node_set), "Sparse { max_items: 100, data: {} }");
158/// node_set.insert(10);
159/// assert_eq!(format!("{:?}", node_set), "Sparse { max_items: 100, data: {10} }");
160/// for i in 20..30 {
161///     node_set.insert(i);
162/// }
163/// assert_eq!(format!("{:?}", node_set), "Dense { data: BitVec { bits: [1072694272, 0], len: 100 } }");
164/// ```
165#[derive(Debug)]
166pub enum AdaptiveNodeSet {
167    Sparse {
168        max_items: usize,
169        data: HashSet<NodeId, RapidBuildHasher>,
170    },
171    Dense {
172        data: BitVec,
173    },
174}
175
176impl AdaptiveNodeSet {
177    /// Creates an empty `AdaptiveNodeSet` that may only store node ids from `0` to `max_items-1`
178    #[inline(always)]
179    pub fn new(max_items: usize) -> Self {
180        AdaptiveNodeSet::Sparse {
181            max_items,
182            data: HashSet::with_hasher(RapidBuildHasher::default()),
183        }
184    }
185
186    /// Creates an empty `AdaptiveNodeSet` with at least the specified capacity
187    #[inline(always)]
188    pub fn with_capacity(max_items: usize, capacity: usize) -> Self {
189        if capacity > max_items / PROMOTION_THRESHOLD {
190            AdaptiveNodeSet::Dense {
191                data: BitVec::new(max_items),
192            }
193        } else {
194            AdaptiveNodeSet::Sparse {
195                max_items,
196                data: HashSet::with_capacity_and_hasher(capacity, RapidBuildHasher::default()),
197            }
198        }
199    }
200}
201
202impl NodeSet for AdaptiveNodeSet {
203    /// Adds a node to the set
204    ///
205    /// # Panics
206    ///
207    /// If `node` is larger or equal to the `max_items` value passed to [`AdaptiveNodeSet::new`].
208    #[inline(always)]
209    fn insert(&mut self, node: usize) {
210        match self {
211            AdaptiveNodeSet::Sparse { max_items, data } => {
212                data.insert(node);
213                if data.len() > *max_items / PROMOTION_THRESHOLD {
214                    // Promote the hashset to a bitvec
215                    let mut new_data = BitVec::new(*max_items);
216                    for node in data.iter() {
217                        new_data.insert(*node);
218                    }
219                    *self = AdaptiveNodeSet::Dense { data: new_data };
220                }
221            }
222            AdaptiveNodeSet::Dense { data } => data.insert(node),
223        }
224    }
225}
226
227impl ReadNodeSet for AdaptiveNodeSet {
228    /// Returns whether the node is part of the set
229    ///
230    /// # Panics
231    ///
232    /// If `node` is larger or equal to the `max_items` value passed to [`AdaptiveNodeSet::new`].
233    #[inline(always)]
234    fn contains(&self, node: usize) -> bool {
235        match self {
236            AdaptiveNodeSet::Sparse { max_items: _, data } => data.contains(&node),
237            AdaptiveNodeSet::Dense { data } => data.contains(node),
238        }
239    }
240}