any_intern/
dropless.rs

1use super::common::{Interned, RawInterned, UnsafeLock};
2use bumpalo::Bump;
3use hashbrown::{HashTable, hash_table::Entry};
4use std::{
5    alloc::Layout,
6    hash::{BuildHasher, Hash},
7    ptr::{self, NonNull},
8    slice,
9};
10
11/// A trait for types that can be stored in a dropless interner.
12///
13/// The `Dropless` trait provides methods for working with types that can be stored in a
14/// [`DroplessInterner`] or [`DroplessInternSet`]. It defines how instances of a type are converted
15/// to and from raw byte representations, hashed, and compared for equality. This is useful for
16/// interning values without requiring ownership.
17///
18/// # Safety
19///
20/// Implementing the `Dropless` trait requires careful attention to alignment and memory safety. The
21/// methods in this trait are marked as `unsafe` because they rely on the caller to ensure that the
22/// provided byte slices are properly aligned and valid for the type. Misuse of these methods can
23/// lead to undefined behavior.
24///
25/// # Examples
26///
27/// ```
28/// use any_intern::Dropless;
29/// use std::alloc::Layout;
30/// use std::ptr::NonNull;
31///
32/// #[derive(PartialEq, Eq, Hash, Debug)]
33/// struct MyType(u32);
34///
35/// impl Dropless for MyType {
36///     fn as_byte_ptr(&self) -> NonNull<u8> {
37///         NonNull::from_ref(self).cast::<u8>()
38///     }
39///
40///     fn layout(&self) -> Layout {
41///         Layout::for_value(self)
42///     }
43///
44///     unsafe fn from_bytes(bytes: &[u8]) -> &Self {
45///         let ptr = bytes.as_ptr().cast::<Self>();
46///         unsafe { ptr.as_ref().unwrap_unchecked() }
47///     }
48///
49///     unsafe fn hash<S: std::hash::BuildHasher>(build_hasher: &S, bytes: &[u8]) -> u64 {
50///         let this = unsafe { Self::from_bytes(bytes) };
51///         build_hasher.hash_one(this)
52///     }
53///
54///     unsafe fn eq(a: &[u8], b: &[u8]) -> bool {
55///         unsafe { Self::from_bytes(a) == Self::from_bytes(b) }
56///     }
57/// }
58/// ```
59pub trait Dropless {
60    /// Returns pointer to the instance.
61    fn as_byte_ptr(&self) -> NonNull<u8> {
62        NonNull::from_ref(self).cast::<u8>()
63    }
64
65    /// Returns layout of the type.
66    fn layout(&self) -> Layout {
67        Layout::for_value(self)
68    }
69
70    /// Converts a byte slice into a reference to the type.
71    ///
72    /// The byte slice is guaranteed to be well aligned and correct data for the type.
73    ///
74    /// # Safety
75    ///
76    /// Undefined behavior if any conditions below are not met.
77    /// * Implementation should interpret the byte slice into the type correctly.
78    /// * Caller should give well aligned data for the type.
79    unsafe fn from_bytes(bytes: &[u8]) -> &Self;
80
81    /// Computes a hash value for the type using the provided byte slice.
82    ///
83    /// The byte slice is guaranteed to be well aligned and correct data for the type.
84    ///
85    /// # Safety
86    ///
87    /// Undefined behavior if any conditions below are not met.
88    /// * Implementation should interpret the byte slice into the type correctly.
89    /// * Caller should give well aligned data for the type.
90    unsafe fn hash<S: BuildHasher>(build_hasher: &S, bytes: &[u8]) -> u64;
91
92    /// Compares two byte slices for equality as instances of the type.
93    ///
94    /// The byte slices are guaranteed to be well aligned and correct data for the type.
95    ///
96    /// # Safety
97    ///
98    /// Undefined behavior if any conditions below are not met.
99    /// * Implementation should interpret the byte slice into the type correctly.
100    /// * Caller should give well aligned data for the type.
101    unsafe fn eq(a: &[u8], b: &[u8]) -> bool;
102}
103
104macro_rules! simple_impl_dropless_fn {
105    (from_bytes) => {
106        unsafe fn from_bytes(bytes: &[u8]) -> &Self {
107            let ptr = bytes.as_ptr().cast::<Self>();
108            unsafe { ptr.as_ref().unwrap_unchecked() }
109        }
110    };
111    (hash) => {
112        unsafe fn hash<S: BuildHasher>(build_hasher: &S, bytes: &[u8]) -> u64 {
113            let this = unsafe { Self::from_bytes(bytes) };
114            build_hasher.hash_one(this)
115        }
116    };
117    (eq) => {
118        unsafe fn eq(a: &[u8], b: &[u8]) -> bool {
119            unsafe { Self::from_bytes(a) == Self::from_bytes(b) }
120        }
121    };
122}
123
124macro_rules! impl_dropless {
125    ($($ty:ty)*) => {
126        $(
127            impl Dropless for $ty {
128                simple_impl_dropless_fn!(from_bytes);
129                simple_impl_dropless_fn!(hash);
130                simple_impl_dropless_fn!(eq);
131            }
132        )*
133    };
134}
135
136impl_dropless!(i8 i16 i32 i64 i128 isize u8 u16 u32 u64 u128 usize bool char);
137
138impl<T: Dropless + Hash + Eq> Dropless for [T] {
139    unsafe fn from_bytes(bytes: &[u8]) -> &Self {
140        let ptr = bytes.as_ptr().cast::<T>();
141        let stride = Layout::new::<T>().pad_to_align().size();
142        let num_elems = bytes.len() / stride;
143        unsafe { slice::from_raw_parts(ptr, num_elems) }
144    }
145    simple_impl_dropless_fn!(hash);
146    simple_impl_dropless_fn!(eq);
147}
148
149impl<T: Dropless + Hash + Eq, const N: usize> Dropless for [T; N] {
150    simple_impl_dropless_fn!(from_bytes);
151    simple_impl_dropless_fn!(hash);
152    simple_impl_dropless_fn!(eq);
153}
154
155impl Dropless for str {
156    unsafe fn from_bytes(bytes: &[u8]) -> &Self {
157        unsafe { str::from_utf8_unchecked(bytes) }
158    }
159    simple_impl_dropless_fn!(hash);
160    simple_impl_dropless_fn!(eq);
161}
162
163impl<T: Dropless + Hash + Eq> Dropless for Option<T> {
164    simple_impl_dropless_fn!(from_bytes);
165    simple_impl_dropless_fn!(hash);
166    simple_impl_dropless_fn!(eq);
167}
168
169impl<T: Dropless + Hash + Eq, E: Dropless + Hash + Eq> Dropless for Result<T, E> {
170    simple_impl_dropless_fn!(from_bytes);
171    simple_impl_dropless_fn!(hash);
172    simple_impl_dropless_fn!(eq);
173}
174
175/// An interner for storing and deduplicating values without requiring ownership.
176///
177/// The `DroplessInterner` is designed for interning values that implement the [`Dropless`] trait.
178/// It allows efficient storage and retrieval of values by ensuring that each unique value is stored
179/// only once. Interning with this type always copies the given value into an internal buffer,
180/// making it suitable for use cases where ownership is not required.
181///
182/// # Examples
183///
184/// ```
185/// use any_intern::DroplessInterner;
186///
187/// let mut interner = DroplessInterner::new();
188///
189/// // Interning strings
190/// let hello = interner.intern("hello");
191/// let world = interner.intern("world");
192/// let another_hello = interner.intern("hello");
193///
194/// assert_eq!(hello, another_hello); // Same value, same reference
195/// assert_ne!(hello, world); // Different values, different references
196///
197/// // Checking if a value exists
198/// assert!(interner.get("hello").is_some());
199/// assert!(interner.get("unknown").is_none());
200///
201/// // Clearing the interner
202/// interner.clear();
203/// assert!(interner.is_empty());
204/// ```
205///
206/// # Safety
207///
208/// The `DroplessInterner` relies on the `Dropless` trait for converting values to and from raw
209/// byte representations. It is the responsibility of the `Dropless` implementation to ensure
210/// memory safety and alignment when interacting with the interner.
211#[derive(Debug)]
212pub struct DroplessInterner<S = fxhash::FxBuildHasher> {
213    inner: UnsafeLock<DroplessInternSet<S>>,
214}
215
216impl DroplessInterner {
217    pub fn new() -> Self {
218        Self::default()
219    }
220}
221
222impl<S: BuildHasher> DroplessInterner<S> {
223    pub fn with_hasher(hash_builder: S) -> Self {
224        // Safety: Only one instance
225        let inner = unsafe { UnsafeLock::new(DroplessInternSet::with_hasher(hash_builder)) };
226        Self { inner }
227    }
228
229    /// Returns number of values the interner contains.
230    pub fn len(&self) -> usize {
231        self.with_inner(|set| set.len())
232    }
233
234    /// Returns true if the interner is empty.
235    pub fn is_empty(&self) -> bool {
236        self.with_inner(|set| set.is_empty())
237    }
238
239    /// Stores a value in the interner, returning a reference to the interned value.
240    ///
241    /// This method inserts the given value into the interner if it does not already exist.  If the
242    /// value already exists, a reference to the existing value is returned. The value is copied
243    /// into an internal buffer, making it suitable for use cases where ownership is not required.
244    ///
245    /// # Examples
246    ///
247    /// ```
248    /// use any_intern::DroplessInterner;
249    ///
250    /// let interner = DroplessInterner::new();
251    ///
252    /// // Interning strings
253    /// let hello = interner.intern("hello");
254    /// let world = interner.intern("world");
255    /// let another_hello = interner.intern("hello");
256    ///
257    /// assert_eq!(hello, another_hello); // Same value, same reference
258    /// assert_ne!(hello, world); // Different values, different references
259    ///
260    /// // Interning arrays
261    /// let array1 = interner.intern(&[1, 2, 3]);
262    /// let array2 = interner.intern(&[1, 2, 3]);
263    /// let array3 = interner.intern(&[4, 5, 6]);
264    ///
265    /// assert_eq!(array1, array2); // Same value, same reference
266    /// assert_ne!(array1, array3); // Different values, different references
267    /// ```
268    pub fn intern<K: Dropless + ?Sized>(&self, value: &K) -> Interned<'_, K> {
269        self.with_inner(|set| set.intern(value))
270    }
271
272    /// Retrieves a reference to a value in the interner based on the provided key.
273    ///
274    /// This method checks if a value corresponding to the given key exists in the interner. If it
275    /// exists, a reference to the interned value is returned. Otherwise, `None` is returned.
276    ///
277    /// # Eaxmples
278    ///
279    /// ```
280    /// use any_intern::DroplessInterner;
281    ///
282    /// let interner = DroplessInterner::new();
283    ///
284    /// // Interning strings
285    /// let hello = interner.intern("hello");
286    ///
287    /// assert_eq!(*interner.get("hello").unwrap(), "hello");
288    /// assert!(interner.get("world").is_none());
289    /// ```
290    pub fn get<K: Dropless + ?Sized>(&self, value: &K) -> Option<Interned<'_, K>> {
291        self.with_inner(|set| set.get(value))
292    }
293
294    /// Removes all items in the interner.
295    ///
296    /// Although the interner support interior mutability, clear method requires mutable access
297    /// to the interner to invalidate all [`Interned`]s referencing the interner.
298    pub fn clear(&mut self) {
299        self.with_inner(|set| set.clear())
300    }
301
302    fn with_inner<'this, F, R>(&'this self, f: F) -> R
303    where
304        F: FnOnce(&'this mut DroplessInternSet<S>) -> R,
305        R: 'this,
306    {
307        // Safety: Mutex unlocking is paired with the locking.
308        unsafe {
309            let set = self.inner.lock().as_mut();
310            let ret = f(set);
311            self.inner.unlock();
312            ret
313        }
314    }
315}
316
317impl<S: Default> Default for DroplessInterner<S> {
318    fn default() -> Self {
319        // Safety: Only one instance
320        let inner = unsafe { UnsafeLock::new(DroplessInternSet::default()) };
321        Self { inner }
322    }
323}
324
325/// A dropless interner.
326///
327/// Interning on this type always copies the given value into internal buffer.
328#[derive(Debug)]
329pub struct DroplessInternSet<S = fxhash::FxBuildHasher> {
330    bump: Bump,
331    set: HashTable<DynInternEntry<S>>,
332    hash_builder: S,
333}
334
335impl DroplessInternSet {
336    pub fn new() -> Self {
337        Self::default()
338    }
339}
340
341impl<S: BuildHasher> DroplessInternSet<S> {
342    pub fn with_hasher(hash_builder: S) -> Self {
343        Self {
344            bump: Bump::new(),
345            set: HashTable::new(),
346            hash_builder,
347        }
348    }
349
350    pub fn intern<K: Dropless + ?Sized>(&mut self, value: &K) -> Interned<'_, K> {
351        let src = value.as_byte_ptr();
352        let layout = value.layout();
353        unsafe {
354            let bytes = slice::from_raw_parts(src.as_ptr(), layout.size());
355            let hash = <K as Dropless>::hash(&self.hash_builder, bytes);
356            let eq = Self::table_eq::<K>(bytes, layout.align());
357            let hasher = Self::table_hasher(&self.hash_builder);
358            let ref_ = match self.set.entry(hash, eq, hasher) {
359                Entry::Occupied(entry) => Self::ref_from_entry(entry.get()),
360                Entry::Vacant(entry) => {
361                    // New allocation & copy
362                    let dst = self.bump.alloc_layout(layout);
363                    ptr::copy_nonoverlapping(src.as_ptr(), dst.as_ptr(), layout.size());
364
365                    let occupied = entry.insert(DynInternEntry {
366                        data: RawInterned(dst).cast(),
367                        layout,
368                        hash: <K as Dropless>::hash,
369                    });
370                    Self::ref_from_entry(occupied.get())
371                }
372            };
373            Interned::unique(ref_)
374        }
375    }
376
377    /// # Eaxmples
378    ///
379    /// ```
380    /// use any_intern::DroplessInternSet;
381    /// use std::hash::RandomState;
382    ///
383    /// let mut set = DroplessInternSet::with_hasher(RandomState::new());
384    /// set.intern("hello");
385    ///
386    /// assert_eq!(set.get("hello").as_deref(), Some(&"hello"));
387    /// assert!(set.get("hi").is_none());
388    /// ```
389    pub fn get<K: Dropless + ?Sized>(&self, value: &K) -> Option<Interned<'_, K>> {
390        let ptr = value.as_byte_ptr();
391        let layout = value.layout();
392        unsafe {
393            let bytes = slice::from_raw_parts(ptr.as_ptr(), layout.size());
394            let hash = <K as Dropless>::hash(&self.hash_builder, bytes);
395            let eq = Self::table_eq::<K>(bytes, layout.align());
396            let entry = self.set.find(hash, eq)?;
397            let ref_ = Self::ref_from_entry(entry);
398            Some(Interned::unique(ref_))
399        }
400    }
401
402    pub fn len(&self) -> usize {
403        self.set.len()
404    }
405
406    pub fn is_empty(&self) -> bool {
407        self.len() == 0
408    }
409
410    pub fn clear(&mut self) {
411        self.bump.reset();
412        self.set.clear();
413    }
414
415    /// Returns `eq` closure that is used for some methods on the [`HashTable`].
416    fn table_eq<K: Dropless + ?Sized>(
417        key: &[u8],
418        align: usize,
419    ) -> impl FnMut(&DynInternEntry<S>) -> bool {
420        move |entry: &DynInternEntry<S>| unsafe {
421            if align == entry.layout.align() {
422                let entry_bytes =
423                    slice::from_raw_parts(entry.data.cast::<u8>().as_ptr(), entry.layout.size());
424                <K as Dropless>::eq(key, entry_bytes)
425            } else {
426                false
427            }
428        }
429    }
430
431    /// Returns `hasher` closure that is used for some methods on the [`HashTable`].
432    fn table_hasher(hash_builder: &S) -> impl Fn(&DynInternEntry<S>) -> u64 {
433        |entry: &DynInternEntry<S>| unsafe {
434            let entry_bytes =
435                slice::from_raw_parts(entry.data.cast::<u8>().as_ptr(), entry.layout.size());
436            // Safety: `entry_bytes` is aligned for the entry hash function.
437            (entry.hash)(hash_builder, entry_bytes)
438        }
439    }
440
441    unsafe fn ref_from_entry<'a, K: Dropless + ?Sized>(entry: &DynInternEntry<S>) -> &'a K {
442        unsafe {
443            let bytes =
444                slice::from_raw_parts(entry.data.cast::<u8>().as_ptr(), entry.layout.size());
445            K::from_bytes(bytes)
446        }
447    }
448}
449
450impl<S: Default> Default for DroplessInternSet<S> {
451    fn default() -> Self {
452        Self {
453            bump: Bump::new(),
454            set: HashTable::new(),
455            hash_builder: Default::default(),
456        }
457    }
458}
459
460#[derive(Debug)]
461struct DynInternEntry<S> {
462    data: RawInterned,
463    layout: Layout,
464    /// Input bytes must be well aligned.
465    hash: unsafe fn(&S, &[u8]) -> u64,
466}
467
468#[cfg(test)]
469mod tests {
470    use super::*;
471    use crate::common;
472
473    #[test]
474    fn test_dropless_interner() {
475        test_dropless_interner_int();
476        test_dropless_interner_str();
477        test_dropless_interner_bytes();
478        test_dropless_interner_mixed();
479        test_dropless_interner_many();
480        test_dropless_interner_alignment_handling();
481    }
482
483    fn test_dropless_interner_int() {
484        let interner = DroplessInterner::new();
485
486        let a = interner.intern(&0_u32).erased_raw();
487        let b = interner.intern(&0_u32).erased_raw();
488        let c = interner.intern(&1_u32).erased_raw();
489        // d has the same bytes and layout as c's, so memory will be shared.
490        let d = interner.intern(&1_i32).erased_raw();
491
492        let groups: [&[RawInterned]; _] = [&[a, b], &[c, d]];
493        common::assert_group_addr_eq(&groups);
494    }
495
496    fn test_dropless_interner_str() {
497        let interner = DroplessInterner::new();
498
499        let a = interner.intern("apple").erased_raw();
500        let b = interner.intern(*Box::new("apple")).erased_raw();
501        let c = interner.intern(&*String::from("apple")).erased_raw();
502        let d = interner.intern("banana").erased_raw();
503
504        let groups: [&[RawInterned]; _] = [&[a, b, c], &[d]];
505        common::assert_group_addr_eq(&groups);
506    }
507
508    fn test_dropless_interner_bytes() {
509        let interner = DroplessInterner::new();
510
511        let a = interner.intern(&[0, 1]).erased_raw();
512        let boxed: Box<[i32]> = Box::new([0, 1]);
513        let b = interner.intern(&*boxed).erased_raw();
514        let c = interner.intern(&*vec![0, 1]).erased_raw();
515        let d = interner.intern(&[2, 3]).erased_raw();
516
517        let groups: [&[RawInterned]; _] = [&[a, b, c], &[d]];
518        common::assert_group_addr_eq(&groups);
519    }
520
521    fn test_dropless_interner_mixed() {
522        let interner = DroplessInterner::new();
523
524        interner.intern(&1_u32);
525        interner.intern(&[2, 3]);
526        interner.intern("apple");
527        assert_eq!(interner.get("apple").as_deref(), Some(&"apple"));
528        assert!(interner.get("banana").is_none());
529        assert_eq!(interner.get(&1_u32).as_deref(), Some(&&1_u32));
530        assert!(interner.get(&2_u32).is_none());
531        assert_eq!(interner.get(&[2, 3]).as_deref(), Some(&&[2, 3]));
532        assert!(interner.get(&[2]).is_none());
533    }
534
535    fn test_dropless_interner_many() {
536        let interner = DroplessInterner::new();
537
538        let mut interned_int = Vec::new();
539        let mut interned_str = Vec::new();
540        let mut interned_bytes = Vec::new();
541
542        const N: usize = 1000;
543
544        let strs = (0..N).map(|i| (i * 10_000).to_string()).collect::<Vec<_>>();
545
546        // Interns lots of items.
547        for i in 0..N {
548            let int = &(i as u32); // 4 bytes
549            let str_ = &*strs[i]; // greater than 4 btyes
550            let bytes = &[i as u16]; // 2 bytes
551            interned_int.push(interner.intern(int).erased_raw());
552            interned_str.push(interner.intern(str_).erased_raw());
553            interned_bytes.push(interner.intern(bytes).erased_raw());
554        }
555
556        // Verifies every pointer is unique.
557        interned_int.sort_unstable();
558        interned_int.dedup();
559        interned_str.sort_unstable();
560        interned_str.dedup();
561        interned_bytes.sort_unstable();
562        interned_bytes.dedup();
563        assert_eq!(interned_int.len(), N);
564        assert_eq!(interned_str.len(), N);
565        assert_eq!(interned_bytes.len(), N);
566        let whole = interned_int
567            .iter()
568            .chain(&interned_str)
569            .chain(&interned_bytes)
570            .cloned()
571            .collect::<fxhash::FxHashSet<_>>();
572        assert_eq!(whole.len(), N * 3);
573
574        // Verifies `get` method for every item.
575        for i in 0..N {
576            let int = &(i as u32); // 4 bytes
577            let str_ = &*strs[i]; // greater than 4 btyes
578            let bytes = &[i as u16]; // 2 bytes
579            assert_eq!(*interner.get(int).unwrap(), int);
580            assert_eq!(*interner.get(str_).unwrap(), str_);
581            assert_eq!(*interner.get(bytes).unwrap(), bytes);
582        }
583    }
584
585    #[rustfmt::skip]
586    fn test_dropless_interner_alignment_handling() {
587        #[derive(PartialEq, Eq, Hash, Debug)] #[repr(C, align(4))]  struct T4  (u16, [u8; 2]);
588        #[derive(PartialEq, Eq, Hash, Debug)] #[repr(C, align(8))]  struct T8  (u16, [u8; 6]);
589        #[derive(PartialEq, Eq, Hash, Debug)] #[repr(C, align(16))] struct T16 (u16, [u8; 14]);
590
591        macro_rules! impl_for_first_2bytes {
592            () => {
593                unsafe fn hash<S: BuildHasher>(hash_builder: &S, bytes: &[u8]) -> u64 {
594                    hash_builder.hash_one(&bytes[0..2])
595                }
596                unsafe fn eq(a: &[u8], b: &[u8]) -> bool {
597                    a[0..2] == b[0..2]
598                }
599            };
600        }
601
602        impl Dropless for T4 { simple_impl_dropless_fn!(from_bytes); impl_for_first_2bytes!(); }
603        impl Dropless for T8 { simple_impl_dropless_fn!(from_bytes); impl_for_first_2bytes!(); }
604        impl Dropless for T16 { simple_impl_dropless_fn!(from_bytes); impl_for_first_2bytes!(); }
605
606        let interner = DroplessInterner::new();
607
608        let mut interned_4 = Vec::new();
609        let mut interned_8 = Vec::new();
610        let mut interned_16 = Vec::new();
611
612        const N: usize = 1000;
613
614        // Items being interned have the same first 2 bytes, so they will look like the smae from
615        // perspective of interner. But, they must be distinguished from each other due to their
616        // different layouts. Following bytes after the 2 bytes will be used to detect data
617        // validity.
618        for i in 0..N {
619            let t4 = T4(i as u16, [4; _]);
620            let t8 = T8(i as u16, [8; _]);
621            let t16 = T16(i as u16, [16; _]);
622            interned_4.push(interner.intern(&t4).erased_raw());
623            interned_8.push(interner.intern(&t8).erased_raw());
624            interned_16.push(interner.intern(&t16).erased_raw());
625        }
626
627        for i in 0..N {
628            let t4 = T4(i as u16, [4; _]);
629            let t8 = T8(i as u16, [8; _]);
630            let t16 = T16(i as u16, [16; _]);
631            unsafe {
632                assert_eq!(*<Interned<'_, T4>>::from_erased_raw(interned_4[i]), &t4);
633                assert_eq!(*<Interned<'_, T8>>::from_erased_raw(interned_8[i]), &t8);
634                assert_eq!(*<Interned<'_, T16>>::from_erased_raw(interned_16[i]), &t16);
635            }
636        }
637    }
638}