swh_graph_stdlib/collections/
node_set.rs

1// Copyright (C) 2024  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6//! Sets of values from 0 to a known maximum.
7//!
8//! This module contains a trait to use [`HashSet<usize>`](HashSet) and [`BitVec`]
9//! interchangeably, and defines [`AdaptiveNodeSet`] which switches between the two
10//! based on its content.
11
12use std::collections::HashSet;
13use std::hash::BuildHasher;
14
15use rapidhash::RapidBuildHasher;
16use sux::bits::bit_vec::BitVec;
17
18/// Constant controlling when a [`AdaptiveNodeSet`] should be promoted from sparse to dense.
19///
20/// Promotion happens when the number of items in a [`AdaptiveNodeSet`] times this constant
21/// is greater than the maximum value.
22///
23/// The value was computed experimentally, using "find-earliest-revision" (which runs
24/// many BFSs of highly heterogeneous sizes) on the 2023-09-06 graph, running on
25/// Software Heritage's Maxxi computer (Xeon Gold 6342 CPU @ 2.80GHz, 96 threads, 4TB RAM)
26/// That graph contains 34 billion nodes, and performance increases continuously when
27/// increasing up to 100 millions, then plateaus up to 1 billion; though the latter
28/// uses a little less memory most of the time.
29const PROMOTION_THRESHOLD: usize = 64;
30
31pub type NodeId = usize;
32
33/// A set of `usize` with a known maximum value
34pub trait ReadNodeSet {
35    fn contains(&self, node: NodeId) -> bool;
36}
37
38/// A set of `usize` with a known maximum value, that can be mutated
39pub trait NodeSet: ReadNodeSet {
40    fn insert(&mut self, node: NodeId);
41}
42
43pub struct EmptyNodeSet;
44
45impl ReadNodeSet for EmptyNodeSet {
46    #[inline(always)]
47    fn contains(&self, _node: usize) -> bool {
48        false
49    }
50}
51impl IntoIterator for EmptyNodeSet {
52    type Item = NodeId;
53    type IntoIter = std::iter::Empty<NodeId>;
54
55    #[inline(always)]
56    fn into_iter(self) -> Self::IntoIter {
57        std::iter::empty()
58    }
59}
60
61impl<S: BuildHasher> NodeSet for HashSet<usize, S> {
62    #[inline(always)]
63    fn insert(&mut self, node: usize) {
64        HashSet::insert(self, node);
65    }
66}
67impl<S: BuildHasher> ReadNodeSet for HashSet<usize, S> {
68    #[inline(always)]
69    fn contains(&self, node: usize) -> bool {
70        HashSet::contains(self, &node)
71    }
72}
73
74impl NodeSet for BitVec {
75    #[inline(always)]
76    fn insert(&mut self, node: usize) {
77        self.set(node, true);
78    }
79}
80impl ReadNodeSet for BitVec {
81    #[inline(always)]
82    fn contains(&self, node: usize) -> bool {
83        self.get(node)
84    }
85}
86
87#[derive(Debug, PartialEq, Eq)]
88/// A slice of `NodeId` whose values are in ascending order
89///
90/// # Safety
91///
92/// Unsafe code should not rely on this to be sorted, as safe code can build arbitrary instances
93pub struct SortedNodeIdSlice<S: AsRef<[NodeId]> + ?Sized>(pub S);
94
95impl<S: AsRef<[NodeId]> + ?Sized> ReadNodeSet for SortedNodeIdSlice<S> {
96    /// Runs in logarithmic time, as it performs a binary search
97    #[inline(always)]
98    fn contains(&self, node: usize) -> bool {
99        self.0.as_ref().binary_search(&node).is_ok()
100    }
101}
102
103impl<S: AsRef<[NodeId]>> std::ops::Deref for SortedNodeIdSlice<S> {
104    type Target = S;
105
106    fn deref(&self) -> &Self::Target {
107        &self.0
108    }
109}
110
111impl<'a> IntoIterator for SortedNodeIdSlice<&'a [NodeId]> {
112    type Item = NodeId;
113    type IntoIter = std::iter::Copied<std::slice::Iter<'a, NodeId>>;
114
115    fn into_iter(self) -> Self::IntoIter {
116        self.0.iter().copied()
117    }
118}
119
120impl<'a, S: AsRef<[NodeId]>> From<&'a SortedNodeIdSlice<S>> for SortedNodeIdSlice<&'a [NodeId]> {
121    fn from(v: &'a SortedNodeIdSlice<S>) -> Self {
122        SortedNodeIdSlice(v.0.as_ref())
123    }
124}
125
126/// Implementation of [`NodeSet`] that dynamically changes the underlying representation
127/// based on its content
128///
129/// The current implementation is initialized with a [`HashSet`], but switches to
130/// [`BitVec`] once the data becomes dense.
131///
132/// This has the advantage of allocating little memory if there won't be many elements,
133/// but avoiding the overhead of [`HashSet`] when there are.
134///
135/// ```
136/// # use swh_graph_stdlib::collections::{AdaptiveNodeSet, NodeSet};
137/// let mut node_set = AdaptiveNodeSet::new(100);
138/// assert_eq!(format!("{:?}", node_set), "Sparse { max_items: 100, data: {} }");
139/// node_set.insert(10);
140/// assert_eq!(format!("{:?}", node_set), "Sparse { max_items: 100, data: {10} }");
141/// for i in 20..30 {
142///     node_set.insert(i);
143/// }
144/// assert_eq!(format!("{:?}", node_set), "Dense { data: BitVec { bits: [1072694272, 0], len: 100 } }");
145/// ```
146#[derive(Debug)]
147pub enum AdaptiveNodeSet {
148    Sparse {
149        max_items: usize,
150        data: HashSet<NodeId, RapidBuildHasher>,
151    },
152    Dense {
153        data: BitVec,
154    },
155}
156
157impl AdaptiveNodeSet {
158    /// Creates an empty `AdaptiveNodeSet` that may only store node ids from `0` to `max_items-1`
159    #[inline(always)]
160    pub fn new(max_items: usize) -> Self {
161        AdaptiveNodeSet::Sparse {
162            max_items,
163            data: HashSet::with_hasher(RapidBuildHasher::default()),
164        }
165    }
166
167    /// Creates an empty `AdaptiveNodeSet` with at least the specified capacity
168    #[inline(always)]
169    pub fn with_capacity(max_items: usize, capacity: usize) -> Self {
170        if capacity > max_items / PROMOTION_THRESHOLD {
171            AdaptiveNodeSet::Dense {
172                data: BitVec::new(max_items),
173            }
174        } else {
175            AdaptiveNodeSet::Sparse {
176                max_items,
177                data: HashSet::with_capacity_and_hasher(capacity, RapidBuildHasher::default()),
178            }
179        }
180    }
181}
182
183impl NodeSet for AdaptiveNodeSet {
184    /// Adds a node to the set
185    ///
186    /// # Panics
187    ///
188    /// If `node` is larger or equal to the `max_items` value passed to [`AdaptiveNodeSet::new`].
189    #[inline(always)]
190    fn insert(&mut self, node: usize) {
191        match self {
192            AdaptiveNodeSet::Sparse { max_items, data } => {
193                data.insert(node);
194                if data.len() > *max_items / PROMOTION_THRESHOLD {
195                    // Promote the hashset to a bitvec
196                    let mut new_data = BitVec::new(*max_items);
197                    for node in data.iter() {
198                        new_data.insert(*node);
199                    }
200                    *self = AdaptiveNodeSet::Dense { data: new_data };
201                }
202            }
203            AdaptiveNodeSet::Dense { data } => data.insert(node),
204        }
205    }
206}
207
208impl ReadNodeSet for AdaptiveNodeSet {
209    /// Returns whether the node is part of the set
210    ///
211    /// # Panics
212    ///
213    /// If `node` is larger or equal to the `max_items` value passed to [`AdaptiveNodeSet::new`].
214    #[inline(always)]
215    fn contains(&self, node: usize) -> bool {
216        match self {
217            AdaptiveNodeSet::Sparse { max_items: _, data } => data.contains(&node),
218            AdaptiveNodeSet::Dense { data } => data.contains(node),
219        }
220    }
221}