blazinterner/
lib.rs

1//! This crate offers an efficient and concurrent
2//! [interning](https://en.wikipedia.org/wiki/Interning_(computer_science)) API
3//! over generic data.
4//!
5//! Here are its main features.
6//!
7//! - **Generic**: You can intern any data type that implements [`Hash`] and
8//!   [`Eq`], not just strings. The interned type doesn't even have to be
9//!   [`Sized`] (for example [`str`]), as long as you provide a [`Sized`]
10//!   storage type (such as `Box<str>`) that can be borrowed as the interned
11//!   type.
12//! - **Efficient**: Each [`Interned`] value contains only a 32-bit index. The
13//!   corresponding [`Arena`] stores each value directly in an [`AppendVec`],
14//!   plus the 32-bit index in a raw hash table ([`DashTable`]). To intern a
15//!   value of type `T` using storage type `S`, you can pass any type that
16//!   implements [`Borrow<T>`](Borrow) and [`Into<S>`](Into), which allows
17//!   avoiding unnecessary copies. For example, in an `Arena<str, Box<str>>` you
18//!   can intern many string types: `&str`, `String`, `Box<str>`, `Cow<'_,
19//!   str>`, etc.
20//! - **Concurrent**: The [`Arena`] is [`Sync`], and allows simultaneous reads
21//!   and writes. More specifically, retrieving values via
22//!   [`Interned::lookup()`] and [`Interned::lookup_ref()`] is always wait-free,
23//!   even when a write happens concurrently! This is thanks to the underlying
24//!   [`AppendVec`] implementation. However, only one write (using
25//!   [`Interned::from()`]) can happen at a time on a given arena, due to an
26//!   exclusive write lock.
27
28#![forbid(missing_docs, unsafe_code)]
29
30use appendvec::AppendVec;
31use dashtable::DashTable;
32#[cfg(feature = "get-size2")]
33use get_size2::{GetSize, GetSizeTracker};
34use hashbrown::DefaultHashBuilder;
35#[cfg(feature = "serde")]
36use serde::de::{SeqAccess, Visitor};
37#[cfg(feature = "serde")]
38use serde::{Deserialize, Deserializer, Serialize, Serializer};
39use std::borrow::Borrow;
40use std::cmp::Ordering;
41use std::fmt::Debug;
42use std::hash::{BuildHasher, Hash, Hasher};
43use std::marker::PhantomData;
44#[cfg(feature = "get-size2")]
45use std::mem::size_of;
46#[cfg(feature = "debug")]
47use std::sync::atomic::{self, AtomicUsize};
48
49/// A handle to an interned value in an [`Arena`].
50///
51/// This is generic over the logical value type `T` as well as its `Storage`
52/// type, that needs to be [`Sized`]. For [`Sized`] values, `Storage = T` is a
53/// good default that incurs no overhead. For non-[`Sized`] values such as
54/// [`str`], you need to specify a [`Sized`] storage type, such as `Box<T>`.
55pub struct Interned<T: ?Sized, Storage = T> {
56    id: u32,
57    _phantom: PhantomData<fn() -> (*const T, *const Storage)>,
58}
59
60impl<T: ?Sized, Storage> Debug for Interned<T, Storage> {
61    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
62        f.debug_tuple("I").field(&self.id).finish()
63    }
64}
65
66impl<T: ?Sized, Storage> Clone for Interned<T, Storage> {
67    fn clone(&self) -> Self {
68        *self
69    }
70}
71
72impl<T: ?Sized, Storage> Copy for Interned<T, Storage> {}
73
74impl<T: ?Sized, Storage> PartialEq for Interned<T, Storage> {
75    fn eq(&self, other: &Self) -> bool {
76        self.id.eq(&other.id)
77    }
78}
79
80impl<T: ?Sized, Storage> Eq for Interned<T, Storage> {}
81
82impl<T: ?Sized, Storage> PartialOrd for Interned<T, Storage> {
83    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
84        Some(self.cmp(other))
85    }
86}
87
88impl<T: ?Sized, Storage> Ord for Interned<T, Storage> {
89    fn cmp(&self, other: &Self) -> Ordering {
90        self.id.cmp(&other.id)
91    }
92}
93
94impl<T: ?Sized, Storage> Hash for Interned<T, Storage> {
95    fn hash<H>(&self, state: &mut H)
96    where
97        H: Hasher,
98    {
99        self.id.hash(state);
100    }
101}
102
103#[cfg(feature = "get-size2")]
104impl<T: ?Sized, Storage> GetSize for Interned<T, Storage> {
105    // There is nothing on the heap, so the default implementation works out of the
106    // box.
107}
108
109#[cfg(feature = "raw")]
110impl<T: ?Sized, Storage> Interned<T, Storage> {
111    /// Creates an interned value for the given index.
112    ///
113    /// This is a low-level function. You should instead use the
114    /// [`from()`](Self::from) API to intern a value, unless you really know
115    /// what you're doing.
116    pub fn from_id(id: u32) -> Self {
117        Self {
118            id,
119            _phantom: PhantomData,
120        }
121    }
122
123    /// Obtains the underlying interning index.
124    ///
125    /// This is a low-level function. You should instead use the
126    /// [`lookup()`](lookup) and [`lookup_ref()`](lookup_ref) APIs, unless you
127    /// really know what you're doing.
128    pub fn id(&self) -> u32 {
129        self.id
130    }
131}
132
133impl<T: ?Sized, Storage> Interned<T, Storage>
134where
135    T: Eq + Hash,
136    Storage: Borrow<T>,
137{
138    /// Interns the given value in the given [`Arena`].
139    ///
140    /// If the value was already interned in this arena, it will simply be
141    /// borrowed to retrieve its interning index. Otherwise it will then be
142    /// converted to store it into the arena.
143    pub fn from(arena: &Arena<T, Storage>, value: impl Borrow<T> + Into<Storage>) -> Self {
144        let id = arena.intern(value);
145        Self {
146            id,
147            _phantom: PhantomData,
148        }
149    }
150}
151
152impl<T: ?Sized, Storage> Interned<T, Storage>
153where
154    Storage: Clone,
155{
156    /// Retrieves this interned value from the given [`Arena`].
157    ///
158    /// The caller is responsible for ensuring that the same arena was used to
159    /// intern this value, otherwise an arbitrary value will be returned or
160    /// a panic will happen.
161    ///
162    /// See also [`lookup_ref()`](lookup_ref) if you only need a reference.
163    pub fn lookup(&self, arena: &Arena<T, Storage>) -> Storage {
164        arena.lookup(self.id)
165    }
166}
167
168impl<T: ?Sized, Storage> Interned<T, Storage>
169where
170    Storage: Borrow<T>,
171{
172    /// Retrieves a reference to this interned value from the given [`Arena`].
173    ///
174    /// The caller is responsible for ensuring that the same arena was used to
175    /// intern this value, otherwise an arbitrary value will be returned or
176    /// a panic will happen.
177    ///
178    /// See also [`lookup()`](lookup) if you need an owned value.
179    pub fn lookup_ref<'a>(&self, arena: &'a Arena<T, Storage>) -> &'a T {
180        arena.lookup_ref(self.id)
181    }
182}
183
184#[cfg(feature = "serde")]
185impl<T: ?Sized, Storage> Serialize for Interned<T, Storage> {
186    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
187    where
188        S: Serializer,
189    {
190        serializer.serialize_u32(self.id)
191    }
192}
193
194#[cfg(feature = "serde")]
195impl<'de, T: ?Sized, Storage> Deserialize<'de> for Interned<T, Storage> {
196    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
197    where
198        D: Deserializer<'de>,
199    {
200        let id = deserializer.deserialize_u32(U32Visitor)?;
201        Ok(Self {
202            id,
203            _phantom: PhantomData,
204        })
205    }
206}
207
208#[cfg(feature = "serde")]
209struct U32Visitor;
210
211#[cfg(feature = "serde")]
212impl Visitor<'_> for U32Visitor {
213    type Value = u32;
214
215    fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
216        formatter.write_str("an integer between 0 and 2^32")
217    }
218
219    fn visit_u8<E>(self, value: u8) -> Result<Self::Value, E>
220    where
221        E: serde::de::Error,
222    {
223        Ok(u32::from(value))
224    }
225
226    fn visit_u16<E>(self, value: u16) -> Result<Self::Value, E>
227    where
228        E: serde::de::Error,
229    {
230        Ok(u32::from(value))
231    }
232
233    fn visit_u32<E>(self, value: u32) -> Result<Self::Value, E>
234    where
235        E: serde::de::Error,
236    {
237        Ok(value)
238    }
239
240    fn visit_u64<E>(self, value: u64) -> Result<Self::Value, E>
241    where
242        E: serde::de::Error,
243    {
244        value
245            .try_into()
246            .map_err(|_| E::custom(format!("u32 out of range: {}", value)))
247    }
248
249    fn visit_i8<E>(self, value: i8) -> Result<Self::Value, E>
250    where
251        E: serde::de::Error,
252    {
253        value
254            .try_into()
255            .map_err(|_| E::custom(format!("u32 out of range: {}", value)))
256    }
257
258    fn visit_i16<E>(self, value: i16) -> Result<Self::Value, E>
259    where
260        E: serde::de::Error,
261    {
262        value
263            .try_into()
264            .map_err(|_| E::custom(format!("u32 out of range: {}", value)))
265    }
266
267    fn visit_i32<E>(self, value: i32) -> Result<Self::Value, E>
268    where
269        E: serde::de::Error,
270    {
271        value
272            .try_into()
273            .map_err(|_| E::custom(format!("u32 out of range: {}", value)))
274    }
275
276    fn visit_i64<E>(self, value: i64) -> Result<Self::Value, E>
277    where
278        E: serde::de::Error,
279    {
280        value
281            .try_into()
282            .map_err(|_| E::custom(format!("u32 out of range: {}", value)))
283    }
284}
285
286/// Interning arena for values of type `T`, storing them with the given
287/// `Storage` type (that needs to be [`Sized`]).
288///
289/// For [`Sized`] values, `Storage = T` is a good default that incurs no
290/// overhead. For non-[`Sized`] values such as [`str`], you need to specify a
291/// [`Sized`] storage type, such as `Box<T>`.
292pub struct Arena<T: ?Sized, Storage = T> {
293    vec: AppendVec<Storage>,
294    map: DashTable<u32>,
295    hasher: DefaultHashBuilder,
296    #[cfg(feature = "debug")]
297    references: AtomicUsize,
298    _phantom: PhantomData<fn() -> *const T>,
299}
300
301impl<T: ?Sized, Storage> Default for Arena<T, Storage> {
302    fn default() -> Self {
303        Self {
304            vec: AppendVec::new(),
305            map: DashTable::new(),
306            hasher: DefaultHashBuilder::default(),
307            #[cfg(feature = "debug")]
308            references: AtomicUsize::new(0),
309            _phantom: PhantomData,
310        }
311    }
312}
313
314impl<T: ?Sized, Storage> Debug for Arena<T, Storage>
315where
316    T: Debug,
317    Storage: Borrow<T>,
318{
319    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
320        fmt.debug_list()
321            .entries(self.vec.iter().map(|x| x.borrow()))
322            .finish()
323    }
324}
325
326impl<T: ?Sized, Storage> PartialEq for Arena<T, Storage>
327where
328    T: Eq + Hash,
329    Storage: Borrow<T>,
330{
331    fn eq(&self, other: &Self) -> bool {
332        self.vec
333            .iter()
334            .map(|x| x.borrow())
335            .eq(other.vec.iter().map(|x| x.borrow()))
336    }
337}
338
339impl<T: ?Sized, Storage> Eq for Arena<T, Storage>
340where
341    T: Eq + Hash,
342    Storage: Borrow<T>,
343{
344}
345
346#[cfg(feature = "get-size2")]
347impl<T: ?Sized, Storage> GetSize for Arena<T, Storage>
348where
349    Storage: GetSize,
350{
351    fn get_heap_size_with_tracker<Tr: GetSizeTracker>(&self, tracker: Tr) -> (usize, Tr) {
352        let heap_size = self.vec.iter().map(|x| x.get_size()).sum::<usize>()
353            + self.vec.len() * size_of::<u32>();
354        (heap_size, tracker)
355    }
356}
357
358#[cfg(feature = "debug")]
359impl<T: ?Sized, Storage> Arena<T, Storage>
360where
361    Storage: GetSize,
362{
363    /// Prints a summary of the storage used by this arena to stdout.
364    pub fn print_summary(&self, prefix: &str, title: &str, total_bytes: usize) {
365        let len = self.len();
366        let references = self.references();
367        let estimated_bytes = self.get_size();
368        println!(
369            "{}[{:.02}%] {} interner: {} objects | {} bytes ({:.02} bytes/object) | {} references ({:.02} refs/object)",
370            prefix,
371            estimated_bytes as f64 * 100.0 / total_bytes as f64,
372            title,
373            len,
374            estimated_bytes,
375            estimated_bytes as f64 / len as f64,
376            references,
377            references as f64 / len as f64,
378        );
379    }
380}
381
382#[cfg(feature = "debug")]
383impl<T: ?Sized, Storage> Arena<T, Storage> {
384    fn len(&self) -> usize {
385        self.vec.len()
386    }
387
388    fn references(&self) -> usize {
389        self.references.load(atomic::Ordering::Relaxed)
390    }
391}
392
393impl<T: ?Sized, Storage> Arena<T, Storage>
394where
395    T: Eq + Hash,
396    Storage: Borrow<T>,
397{
398    fn intern(&self, value: impl Borrow<T> + Into<Storage>) -> u32 {
399        #[cfg(feature = "debug")]
400        self.references.fetch_add(1, atomic::Ordering::Relaxed);
401
402        let hash = self.hasher.hash_one(value.borrow());
403        *self
404            .map
405            .entry(
406                hash,
407                |&i| self.vec[i as usize].borrow() == value.borrow(),
408                |&i| self.hasher.hash_one(self.vec[i as usize].borrow()),
409            )
410            .or_insert_with(|| {
411                let x: Storage = value.into();
412                let id = self.vec.push(x);
413                assert!(id <= u32::MAX as usize);
414                id as u32
415            })
416            .get()
417    }
418
419    /// Unconditionally push a value, without validating that it's already
420    /// interned.
421    #[cfg(feature = "serde")]
422    fn push(&mut self, value: Storage) -> u32 {
423        #[cfg(feature = "debug")]
424        self.references.fetch_add(1, atomic::Ordering::Relaxed);
425
426        let hash = self.hasher.hash_one(value.borrow());
427
428        let id = self.vec.push_mut(value);
429        assert!(id <= u32::MAX as usize);
430        let id = id as u32;
431
432        self.map.insert_unique(hash, id, |&i| {
433            self.hasher.hash_one(self.vec[i as usize].borrow())
434        });
435
436        id
437    }
438}
439
440impl<T: ?Sized, Storage> Arena<T, Storage>
441where
442    Storage: Clone,
443{
444    fn lookup(&self, id: u32) -> Storage {
445        self.vec[id as usize].clone()
446    }
447}
448
449impl<T: ?Sized, Storage> Arena<T, Storage>
450where
451    Storage: Borrow<T>,
452{
453    fn lookup_ref(&self, id: u32) -> &T {
454        self.vec[id as usize].borrow()
455    }
456}
457
458#[cfg(feature = "serde")]
459impl<T: ?Sized, Storage> Serialize for Arena<T, Storage>
460where
461    T: Serialize,
462    Storage: Borrow<T>,
463{
464    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
465    where
466        S: Serializer,
467    {
468        serializer.collect_seq(self.vec.iter().map(|x| x.borrow()))
469    }
470}
471
472#[cfg(feature = "serde")]
473impl<'de, T: ?Sized, Storage> Deserialize<'de> for Arena<T, Storage>
474where
475    T: Eq + Hash,
476    Storage: Borrow<T> + Deserialize<'de>,
477{
478    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
479    where
480        D: Deserializer<'de>,
481    {
482        deserializer.deserialize_seq(ArenaVisitor::new())
483    }
484}
485
486#[cfg(feature = "serde")]
487struct ArenaVisitor<T: ?Sized, Storage> {
488    _phantom: PhantomData<fn() -> Arena<T, Storage>>,
489}
490
491#[cfg(feature = "serde")]
492impl<T: ?Sized, Storage> ArenaVisitor<T, Storage> {
493    fn new() -> Self {
494        Self {
495            _phantom: PhantomData,
496        }
497    }
498}
499
500#[cfg(feature = "serde")]
501impl<'de, T: ?Sized, Storage> Visitor<'de> for ArenaVisitor<T, Storage>
502where
503    T: Eq + Hash,
504    Storage: Borrow<T> + Deserialize<'de>,
505{
506    type Value = Arena<T, Storage>;
507
508    fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
509        formatter.write_str("a sequence of values")
510    }
511
512    fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
513    where
514        A: SeqAccess<'de>,
515    {
516        let mut arena = match seq.size_hint() {
517            None => Arena::default(),
518            Some(size_hint) => Arena {
519                vec: AppendVec::with_capacity(size_hint),
520                map: DashTable::with_capacity(size_hint),
521                hasher: DefaultHashBuilder::default(),
522                #[cfg(feature = "debug")]
523                references: AtomicUsize::new(0),
524                _phantom: PhantomData,
525            },
526        };
527
528        while let Some(t) = seq.next_element()? {
529            arena.push(t);
530        }
531
532        Ok(arena)
533    }
534}
535
536#[cfg(test)]
537mod test {
538    use super::*;
539    use std::borrow::Cow;
540
541    #[test]
542    fn test_str_interner() {
543        let arena: Arena<str, Box<str>> = Arena::default();
544
545        let key: &str = "Hello";
546        assert_eq!(arena.intern(key), 0);
547
548        let key: String = "world".into();
549        assert_eq!(arena.intern(key), 1);
550
551        let key: Box<str> = "Hello".into();
552        assert_eq!(arena.intern(key), 0);
553
554        let key: Box<str> = "world".into();
555        assert_eq!(arena.intern(key), 1);
556
557        let key: Cow<'_, str> = "Hello world".into();
558        assert_eq!(arena.intern(key), 2);
559    }
560}