Skip to main content

forest/cid_collections/
hash_set.rs

1// Copyright 2019-2026 ChainSafe Systems
2// SPDX-License-Identifier: Apache-2.0, MIT
3
4use super::*;
5use anyhow::Context as _;
6use bytes::Bytes;
7use cid::Cid;
8
9#[cfg(doc)]
10use std::collections::HashSet;
11use std::{path::Path, sync::LazyLock};
12
13pub trait CidHashSetLike {
14    /// Adds a value to the set.
15    ///
16    /// Returns whether the value was newly inserted.
17    fn insert(&mut self, cid: Cid) -> anyhow::Result<bool>;
18}
19
20/// A hash set implemented as a `HashMap` where the value is `()`.
21///
22/// See also [`HashSet`].
23#[derive(Default, Clone, Debug, PartialEq, Eq)]
24pub struct CidHashSet {
25    inner: CidHashMap<()>,
26}
27
28impl CidHashSet {
29    /// Creates an empty `HashSet`.
30    ///
31    /// See also [`HashSet::new`].
32    pub fn new() -> Self {
33        Self::default()
34    }
35
36    /// Adds a value to the set.
37    ///
38    /// Returns whether the value was newly inserted.
39    ///
40    /// See also [`HashSet::insert`].
41    pub fn insert(&mut self, cid: Cid) -> bool {
42        self.inner.insert(cid, ()).is_none()
43    }
44
45    /// Returns the number of elements.
46    pub fn len(&self) -> usize {
47        self.inner.len()
48    }
49
50    /// Returns `true` if the set contains a `Cid`.
51    #[allow(dead_code)]
52    pub fn contains(&self, cid: &Cid) -> bool {
53        self.inner.contains_key(cid)
54    }
55
56    /// Removes a `Cid` from the set. Returns whether the value was present in the set.
57    #[allow(dead_code)]
58    pub fn remove(&mut self, cid: &Cid) -> bool {
59        self.inner.remove(cid).is_some()
60    }
61
62    /// Returns `true` if the set is empty.
63    #[allow(dead_code)]
64    pub fn is_empty(&self) -> bool {
65        self.inner.is_empty()
66    }
67}
68
69impl CidHashSetLike for CidHashSet {
70    fn insert(&mut self, cid: Cid) -> anyhow::Result<bool> {
71        Ok(self.insert(cid))
72    }
73}
74
75////////////////////
76// Collection Ops //
77////////////////////
78
79impl Extend<Cid> for CidHashSet {
80    fn extend<T: IntoIterator<Item = Cid>>(&mut self, iter: T) {
81        self.inner.extend(iter.into_iter().map(|it| (it, ())))
82    }
83}
84
85impl FromIterator<Cid> for CidHashSet {
86    fn from_iter<T: IntoIterator<Item = Cid>>(iter: T) -> Self {
87        let mut this = Self::new();
88        this.extend(iter);
89        this
90    }
91}
92
93/// A file-backed CID hash set.
94/// This is intended to be used for large sets of CIDs that may not fit in memory, such as when tracking seen CIDs during a chain export.
95pub struct FileBackedCidHashSet {
96    db: parity_db::Db,
97    // for dropping the temporary directory when the set is dropped
98    _dir: tempfile::TempDir,
99    lru: hashlink::LruCache<SmallCid, ()>,
100}
101
102impl FileBackedCidHashSet {
103    pub fn new(temp_dir_root: impl AsRef<Path>) -> anyhow::Result<Self> {
104        let dir = tempfile::tempdir_in(temp_dir_root.as_ref()).with_context(|| {
105            format!(
106                "failed to create temp dir in {}",
107                temp_dir_root.as_ref().display(),
108            )
109        })?;
110        let options = parity_db::Options {
111            path: dir.path().to_path_buf(),
112            sync_wal: false,
113            sync_data: false,
114            stats: false,
115            salt: None,
116            columns: vec![
117                parity_db::ColumnOptions {
118                    uniform: true,
119                    append_only: true,
120                    ..Default::default()
121                },
122                parity_db::ColumnOptions {
123                    append_only: true,
124                    ..Default::default()
125                },
126            ],
127            compression_threshold: Default::default(),
128        };
129        let db = parity_db::Db::open_or_create(&options).with_context(|| {
130            format!(
131                "failed to create temp parity-db at {}",
132                options.path.display()
133            )
134        })?;
135        Ok(Self {
136            db,
137            _dir: dir,
138            #[allow(clippy::disallowed_methods)]
139            lru: hashlink::LruCache::new(2 << 19), // ~80MiB for 1M entries
140        })
141    }
142
143    pub fn new_in_temp_dir() -> anyhow::Result<Self> {
144        Self::new(std::env::temp_dir())
145    }
146}
147
148impl CidHashSetLike for FileBackedCidHashSet {
149    fn insert(&mut self, cid: Cid) -> anyhow::Result<bool> {
150        static EMPTY_VALUE: LazyLock<Bytes> = LazyLock::new(|| Bytes::from_static(&[]));
151
152        let small = SmallCid::from(cid);
153        if self.lru.get(&small).is_some() {
154            return Ok(false);
155        }
156
157        let (col, key) = match &small {
158            SmallCid::Inline(c) => (0, c.digest().to_vec()),
159            SmallCid::Indirect(u) => (1, u.inner().to_bytes()),
160        };
161        if self.db.get(col, &key).ok().flatten().is_some() {
162            self.lru.insert(small, ());
163            Ok(false)
164        } else {
165            self.db.commit_changes_bytes([(
166                col,
167                parity_db::Operation::Set(key, EMPTY_VALUE.clone()),
168            )])?;
169            self.lru.insert(small, ());
170            Ok(true)
171        }
172    }
173}
174
175#[cfg(test)]
176impl Default for FileBackedCidHashSet {
177    fn default() -> Self {
178        Self::new_in_temp_dir().expect("failed to create FileBackedCidHashSet")
179    }
180}
181
182#[cfg(test)]
183mod tests {
184    use super::*;
185    use ahash::HashSet;
186
187    #[quickcheck_macros::quickcheck]
188    fn test_cid_hashset(cids: HashSet<Cid>) {
189        let mut set = CidHashSet::default();
190        for cid in cids.iter() {
191            all_asserts::assert_true!(set.insert(*cid), "expected CID to be newly inserted");
192        }
193        for cid in cids.iter() {
194            all_asserts::assert_false!(set.insert(*cid), "expected CID to be present in the set");
195        }
196    }
197
198    #[quickcheck_macros::quickcheck]
199    fn test_file_backed_cid_hashset(cids: HashSet<Cid>) {
200        let mut set = FileBackedCidHashSet::default();
201        let dir = set._dir.path().to_path_buf();
202        for cid in cids.iter() {
203            all_asserts::assert_true!(
204                set.insert(*cid).unwrap(),
205                "expected CID to be newly inserted"
206            );
207        }
208        for cid in cids.iter() {
209            all_asserts::assert_false!(
210                set.insert(*cid).unwrap(),
211                "expected CID to be present in the set"
212            );
213        }
214        drop(set);
215        all_asserts::assert_false!(dir.exists(), "expected temporary directory to be deleted");
216    }
217}