swh_graph_stdlib/collections/node_set.rs
1// Copyright (C) 2024 The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6//! Sets of values from 0 to a known maximum.
7//!
8//! This module contains a trait to use [`HashSet<usize>`](HashSet) and [`BitVec`]
9//! interchangeably, and defines [`AdaptiveNodeSet`] which switches between the two
10//! based on its content.
11
12use std::collections::HashSet;
13use std::hash::BuildHasher;
14
15use rapidhash::RapidBuildHasher;
16use sux::bits::bit_vec::BitVec;
17
18/// Constant controlling when a [`AdaptiveNodeSet`] should be promoted from sparse to dense.
19///
20/// Promotion happens when the number of items in a [`AdaptiveNodeSet`] times this constant
21/// is greater than the maximum value.
22///
23/// The value was computed experimentally, using "find-earliest-revision" (which runs
24/// many BFSs of highly heterogeneous sizes) on the 2023-09-06 graph, running on
25/// Software Heritage's Maxxi computer (Xeon Gold 6342 CPU @ 2.80GHz, 96 threads, 4TB RAM)
26/// That graph contains 34 billion nodes, and performance increases continuously when
27/// increasing up to 100 millions, then plateaus up to 1 billion; though the latter
28/// uses a little less memory most of the time.
29const PROMOTION_THRESHOLD: usize = 64;
30
31pub type NodeId = usize;
32
33/// A set of `usize` with a known maximum value
34pub trait ReadNodeSet {
35 fn contains(&self, node: NodeId) -> bool;
36}
37
38/// A set of `usize` with a known maximum value, that can be mutated
39pub trait NodeSet: ReadNodeSet {
40 fn insert(&mut self, node: NodeId);
41}
42
43pub struct EmptyNodeSet;
44
45impl ReadNodeSet for EmptyNodeSet {
46 #[inline(always)]
47 fn contains(&self, _node: usize) -> bool {
48 false
49 }
50}
51impl IntoIterator for EmptyNodeSet {
52 type Item = NodeId;
53 type IntoIter = std::iter::Empty<NodeId>;
54
55 #[inline(always)]
56 fn into_iter(self) -> Self::IntoIter {
57 std::iter::empty()
58 }
59}
60
61impl<S: BuildHasher> NodeSet for HashSet<usize, S> {
62 #[inline(always)]
63 fn insert(&mut self, node: usize) {
64 HashSet::insert(self, node);
65 }
66}
67impl<S: BuildHasher> ReadNodeSet for HashSet<usize, S> {
68 #[inline(always)]
69 fn contains(&self, node: usize) -> bool {
70 HashSet::contains(self, &node)
71 }
72}
73
74impl NodeSet for BitVec {
75 #[inline(always)]
76 fn insert(&mut self, node: usize) {
77 self.set(node, true);
78 }
79}
80impl ReadNodeSet for BitVec {
81 #[inline(always)]
82 fn contains(&self, node: usize) -> bool {
83 self.get(node)
84 }
85}
86
87#[derive(Debug, PartialEq, Eq)]
88/// A slice of `NodeId` whose values are in ascending order
89///
90/// # Safety
91///
92/// Unsafe code should not rely on this to be sorted, as safe code can build arbitrary instances
93pub struct SortedNodeIdSlice<S: AsRef<[NodeId]> + ?Sized>(pub S);
94
95impl<S: AsRef<[NodeId]> + ?Sized> ReadNodeSet for SortedNodeIdSlice<S> {
96 /// Runs in logarithmic time, as it performs a binary search
97 #[inline(always)]
98 fn contains(&self, node: usize) -> bool {
99 self.0.as_ref().binary_search(&node).is_ok()
100 }
101}
102
103impl<S: AsRef<[NodeId]>> std::ops::Deref for SortedNodeIdSlice<S> {
104 type Target = S;
105
106 fn deref(&self) -> &Self::Target {
107 &self.0
108 }
109}
110
111impl<'a> IntoIterator for SortedNodeIdSlice<&'a [NodeId]> {
112 type Item = NodeId;
113 type IntoIter = std::iter::Copied<std::slice::Iter<'a, NodeId>>;
114
115 fn into_iter(self) -> Self::IntoIter {
116 self.0.iter().copied()
117 }
118}
119
120impl<'a, S: AsRef<[NodeId]>> From<&'a SortedNodeIdSlice<S>> for SortedNodeIdSlice<&'a [NodeId]> {
121 fn from(v: &'a SortedNodeIdSlice<S>) -> Self {
122 SortedNodeIdSlice(v.0.as_ref())
123 }
124}
125
126/// Implementation of [`NodeSet`] that dynamically changes the underlying representation
127/// based on its content
128///
129/// The current implementation is initialized with a [`HashSet`], but switches to
130/// [`BitVec`] once the data becomes dense.
131///
132/// This has the advantage of allocating little memory if there won't be many elements,
133/// but avoiding the overhead of [`HashSet`] when there are.
134///
135/// ```
136/// # use swh_graph_stdlib::collections::{AdaptiveNodeSet, NodeSet};
137/// let mut node_set = AdaptiveNodeSet::new(100);
138/// assert_eq!(format!("{:?}", node_set), "Sparse { max_items: 100, data: {} }");
139/// node_set.insert(10);
140/// assert_eq!(format!("{:?}", node_set), "Sparse { max_items: 100, data: {10} }");
141/// for i in 20..30 {
142/// node_set.insert(i);
143/// }
144/// assert_eq!(format!("{:?}", node_set), "Dense { data: BitVec { bits: [1072694272, 0], len: 100 } }");
145/// ```
146#[derive(Debug)]
147pub enum AdaptiveNodeSet {
148 Sparse {
149 max_items: usize,
150 data: HashSet<NodeId, RapidBuildHasher>,
151 },
152 Dense {
153 data: BitVec,
154 },
155}
156
157impl AdaptiveNodeSet {
158 /// Creates an empty `AdaptiveNodeSet` that may only store node ids from `0` to `max_items-1`
159 #[inline(always)]
160 pub fn new(max_items: usize) -> Self {
161 AdaptiveNodeSet::Sparse {
162 max_items,
163 data: HashSet::with_hasher(RapidBuildHasher::default()),
164 }
165 }
166
167 /// Creates an empty `AdaptiveNodeSet` with at least the specified capacity
168 #[inline(always)]
169 pub fn with_capacity(max_items: usize, capacity: usize) -> Self {
170 if capacity > max_items / PROMOTION_THRESHOLD {
171 AdaptiveNodeSet::Dense {
172 data: BitVec::new(max_items),
173 }
174 } else {
175 AdaptiveNodeSet::Sparse {
176 max_items,
177 data: HashSet::with_capacity_and_hasher(capacity, RapidBuildHasher::default()),
178 }
179 }
180 }
181}
182
183impl NodeSet for AdaptiveNodeSet {
184 /// Adds a node to the set
185 ///
186 /// # Panics
187 ///
188 /// If `node` is larger or equal to the `max_items` value passed to [`AdaptiveNodeSet::new`].
189 #[inline(always)]
190 fn insert(&mut self, node: usize) {
191 match self {
192 AdaptiveNodeSet::Sparse { max_items, data } => {
193 data.insert(node);
194 if data.len() > *max_items / PROMOTION_THRESHOLD {
195 // Promote the hashset to a bitvec
196 let mut new_data = BitVec::new(*max_items);
197 for node in data.iter() {
198 new_data.insert(*node);
199 }
200 *self = AdaptiveNodeSet::Dense { data: new_data };
201 }
202 }
203 AdaptiveNodeSet::Dense { data } => data.insert(node),
204 }
205 }
206}
207
208impl ReadNodeSet for AdaptiveNodeSet {
209 /// Returns whether the node is part of the set
210 ///
211 /// # Panics
212 ///
213 /// If `node` is larger or equal to the `max_items` value passed to [`AdaptiveNodeSet::new`].
214 #[inline(always)]
215 fn contains(&self, node: usize) -> bool {
216 match self {
217 AdaptiveNodeSet::Sparse { max_items: _, data } => data.contains(&node),
218 AdaptiveNodeSet::Dense { data } => data.contains(node),
219 }
220 }
221}