refset/
lib.rs

1//! A hash-set analogue that does not own its data.
2//!
3//! It can be used to "mark" items without the need to transfer ownership to the map
4//!
5//! # Example use case
6//! ```
7//! # use refset::HashRefSet;
8//! /// Process arguments while ignoring duplicates
9//! fn process_args(args: impl IntoIterator<Item=String>) {
10//!   let mut same=  HashRefSet::new();
11//!   for argument in args.into_iter()
12//!   {
13//!     if !same.insert(argument.as_str()) {
14//!       // Already processed this input, ignore
15//!       continue;
16//!     }
17//!     //do work...
18//!   }
19//! }
20//! ```
21//! # Serialisation support with `serde` crate
22//! `HashRefSet` and `HashType` both implement `Serialize` and `Deserialize` from the `serde` crate if the `serde` feature is enabled. By default it is not.
23//! # Drawbacks
24//! Since the item is not inserted itself, we cannot use `Eq` to double check there was not a hash collision.
25//! While the hashing algorithm used (Sha512) is extremely unlikely to produce collisions, especially for small data types, keep in mind that it is not infallible.
26
27#![cfg_attr(nightly, feature(test))] 
28#[cfg(nightly)] extern crate test;
29
30use std::{
31    collections::{
32	hash_set,
33	HashSet,
34    },
35    marker::{
36	PhantomData,
37	Send,
38	Sync,
39    },
40    hash::Hash,
41    borrow::Borrow,
42};
43
44mod hashing;
45
46#[cfg(feature="smallmap")] pub mod small;
47
48/// The type used to store the hash of each item.
49///
50/// It is a result of the `SHA512` algorithm as a newtype 64 byte array marked with `#[repr(transparent)]`.
51/// If you want to get the bytes from it, you can transmute safely.
52/// ```
53/// # use refset::HashType;
54/// fn hash_bytes(hash: HashType) -> [u8; 64]
55/// {
56///   unsafe {
57///     std::mem::transmute(hash)
58///   }
59/// }
60///
61/// fn hash_bytes_assert()
62/// {
63///   assert_eq!(hash_bytes(Default::default()), [0u8; 64]);
64/// }
65/// ```
66pub type HashType = hashing::Sha512Hash;
67
68/// Compute the `HashType` value for this `T`.
69fn compute_hash_for<T: ?Sized + Hash>(value: &T)  -> HashType
70{
71    let mut hasher = hashing::Sha512Hasher::new();
72    value.hash(&mut hasher);
73    hasher.finalize()
74}
75
76#[allow(dead_code)]
77#[cold] fn compute_both_hash_for<T: ?Sized + Hash>(value: &T)  -> (u64, HashType)
78{
79    use sha2::{
80	Digest,
81	digest::generic_array::sequence::Split,
82    };
83    let mut hasher = hashing::Sha512Hasher::new();
84    value.hash(&mut hasher);
85    let sha512 = hasher.into_inner();
86
87    let full = sha512.finalize();
88
89    let mut arr = [0u8; hashing::HASH_SIZE];
90    debug_assert_eq!(arr.len(), full.len());
91    unsafe {
92	std::ptr::copy_nonoverlapping(&full[0] as *const u8, &mut arr[0] as *mut u8, hashing::HASH_SIZE);
93    }
94    (u64::from_ne_bytes(full.split().0.into()), HashType::from_bytes(arr))
95}
96
97/// A hash-set of references to an item.
98///
99/// Instead of inserting the item into the set, the set is "marked" with the item.
100/// Think of this as inserting a reference into the set with no lifetime.
101///
102/// Any type that can borrow to `T` can be used to insert, and neither type needs to be `Sized`.
103/// `T` need only implement `Hash`.
104///
105/// # Hashing algorithm
106/// The hasing algorithm used is `Sha512`, which is rather large (64 bytes).
107/// At present there is no way to change the hasher used, I might implement that functionality in the future.
108#[derive(Debug, Clone, PartialEq, Eq, Default)]
109#[cfg_attr(feature="serde", derive(serde::Serialize, serde::Deserialize))] 
110pub struct HashRefSet<T: ?Sized>(HashSet<HashType>, PhantomData<HashSet<*const T>>);
111
112unsafe impl<T: ?Sized + Send> Send for HashRefSet<T>{}
113unsafe impl<T: ?Sized + Send + Sync> Sync for HashRefSet<T>{}
114
115impl<T:?Sized + Hash> HashRefSet<T>
116{
117    /// Create a new empty `HashRefSet`
118    pub fn new() -> Self
119    {
120	Self(
121	    HashSet::new(),
122	    PhantomData
123	)
124    }
125    /// Consume into the inner `HashSet`.
126    pub fn into_inner(self) -> HashSet<HashType>
127    {
128	self.0
129    }
130    /// Create a new `HashRefSet` with a capacity
131    pub fn with_capacity(cap: usize) -> Self
132    {
133	Self(HashSet::with_capacity(cap), PhantomData)
134    }
135
136    /// Insert a reference into the set. The reference can be any type that borrows to `T`.
137    ///
138    /// Returns `true` if there was no previous item, `false` if there was.
139    pub fn insert<Q>(&mut self, value: &Q) -> bool
140    where Q: ?Sized + Borrow<T>
141    {
142	self.0.insert(compute_hash_for(value.borrow()))
143    }
144
145    /// Remove a reference from the set.
146    ///
147    /// Returns `true` if it existed.
148    pub fn remove<Q>(&mut self, value: &Q) -> bool
149    where Q: ?Sized + Borrow<T>
150    {
151	self.0.remove(&compute_hash_for(value.borrow()))
152    }
153
154    /// Check if this value has been inserted into the set.
155    pub fn contains<Q>(&mut self, value: &Q) -> bool
156    where Q: ?Sized + Borrow<T>
157    {
158	self.0.contains(&compute_hash_for(value.borrow()))
159    }
160
161    /// The number of items stored in the set
162    pub fn len(&self) -> usize
163    {
164	self.0.len()
165    }
166
167    /// Is the set empty
168    pub fn is_empty(&self) -> bool
169    {
170	self.0.is_empty()
171    }
172
173    /// An iterator over the hashes stored in the set.
174    pub fn hashes_iter(&self) -> hash_set::Iter<'_, HashType>
175    {
176	self.0.iter()
177    }
178
179    #[inline] fn into_hashes_iter(self) -> hash_set::IntoIter<HashType>
180    {
181	self.0.into_iter()
182    }
183}
184
185impl<T: ?Sized + Hash> IntoIterator for HashRefSet<T>
186{
187    type Item= HashType;
188    type IntoIter = hash_set::IntoIter<HashType>;
189
190    #[inline] fn into_iter(self) -> Self::IntoIter
191    {
192	self.into_hashes_iter()
193    }
194}
195
196
197#[cfg(test)]
198mod tests
199{
200    use super::*;
201    #[test]
202    fn insert()
203    {
204	let mut refset = HashRefSet::new();
205
206	let values=  vec![
207	    "hi",
208	    "hello",
209	    "one",
210	    "two",
211	];
212	for &string in values.iter()
213	{
214	    refset.insert(string);
215	}
216
217	for string in values
218	{
219	    assert!(refset.contains(string));
220	}
221
222	assert!(refset.insert("none"));
223	assert!(!refset.insert("two"));
224    }
225
226    #[cfg(nightly)]
227    mod benchmarks
228    {
229	use test::{black_box, Bencher};
230	const STRINGS: &str = "leo vel fringilla est ullamcorper eget nulla facilisi etiam dignissim diam quis enim lobortis scelerisque fermentum dui faucibus in ornare quam viverra orci sagittis eu volutpat odio facilisis mauris sit amet massa vitae tortor condimentum lacinia quis vel eros donec ac odio tempor orci dapibus ultrices in iaculis nunc sed augue lacus viverra vitae congue eu consequat ac felis donec et odio pellentesque diam volutpat commodo sed egestas egestas fringilla phasellus faucibus scelerisque eleifend donec pretium vulputate sapien nec sagittis aliquam malesuada bibendum arcu vitae elementum curabitur vitae nunc sed velit dignissim sodales ut eu sem integer vitae justo eget";
231	const INTS: &[u32] = &[182,248,69,225,164,219,73,122,14,205,148,221,24,107,209,83,210,87,148,249,234,181,217,154,180,240,132,145,208,15,77,4,117,16,43,1,95,49,150,18,207,161,107,216,215,100,76,198,43,21,99,177,77,28,29,172,117,136,151,96,66,208,244,138,90];
232	
233	#[bench] fn non_owning_strings(b: &mut Bencher)
234	{
235	    let strings: Vec<String> = STRINGS.split(char::is_whitespace).map(ToOwned::to_owned).collect();
236	    let mut map = super::HashRefSet::new();
237	    b.iter(|| {
238		for string in strings.iter() {
239		    black_box(map.insert(string.as_str()));
240		}
241	    })
242	}
243	#[bench] fn owning_strings(b: &mut Bencher)
244	{
245	    let strings: Vec<String> = STRINGS.split(char::is_whitespace).map(ToOwned::to_owned).collect();
246	    let mut map = std::collections::HashSet::new();
247	    b.iter(|| {
248		for string in strings.iter() {
249		    black_box(map.insert(string.clone())); //clone is needed here :/
250		}
251	    })
252	}
253	
254	#[bench] fn non_owning_ints(b: &mut Bencher)
255	{
256	    let mut map = super::HashRefSet::new();
257	    b.iter(|| {
258		for int in INTS.iter() {
259		    black_box(map.insert(int));
260		}
261	    })
262	}
263	#[bench] fn owning_ints(b: &mut Bencher)
264	{
265	    let mut map = std::collections::HashSet::new();
266	    b.iter(|| {
267		for int in INTS.iter() {
268		    black_box(map.insert(int));
269		}
270	    })
271	}
272    }
273}