Skip to main content

blazinterner/
lib.rs

1//! This crate offers an efficient and concurrent
2//! [interning](https://en.wikipedia.org/wiki/Interning_(computer_science)) API
3//! over generic data.
4//!
5//! Here are its main features.
6//!
7//! - **Generic**: You can intern any data type that implements [`Hash`] and
8//!   [`Eq`], not just strings. The interned type doesn't even have to be
9//!   [`Sized`] (for example [`str`]), as long as you provide a [`Sized`]
10//!   storage type (such as `Box<str>`) that can be borrowed as the interned
11//!   type.
12//! - **Efficient**: Each [`Interned`] value contains only a 32-bit index. The
13//!   corresponding [`Arena`] stores each value directly in an [`AppendVec`],
14//!   plus the 32-bit index in a raw hash table ([`DashTable`]). To intern a
15//!   value of type `T` using storage type `S`, you can pass any type that
16//!   implements [`Borrow<T>`](Borrow) and [`Into<S>`](Into), which allows
17//!   avoiding unnecessary copies. For example, in an `Arena<str, Box<str>>` you
18//!   can intern many string types: `&str`, `String`, `Box<str>`, `Cow<'_,
19//!   str>`, etc.
20//! - **Concurrent**: The [`Arena`] is [`Sync`], and allows simultaneous reads
21//!   and writes. More specifically, retrieving values via
22//!   [`Interned::lookup()`] and [`Interned::lookup_ref()`] is always wait-free,
23//!   even when a write happens concurrently! This is thanks to the underlying
24//!   [`AppendVec`] implementation. However, only one write (using
25//!   [`Interned::from()`]) can happen at a time on a given arena, due to an
26//!   exclusive write lock.
27
28#![forbid(missing_docs, unsafe_code)]
29#![cfg_attr(docsrs, feature(doc_cfg))]
30
31#[cfg(feature = "delta")]
32mod delta;
33mod slice;
34
35use appendvec::AppendVec;
36use dashtable::DashTable;
37#[cfg(feature = "delta")]
38pub use delta::{Accumulator, DeltaEncoding};
39#[cfg(feature = "get-size2")]
40use get_size2::{GetSize, GetSizeTracker};
41use hashbrown::DefaultHashBuilder;
42#[cfg(feature = "serde")]
43use serde::de::{SeqAccess, Visitor};
44#[cfg(feature = "serde")]
45use serde::{Deserialize, Deserializer, Serialize, Serializer};
46pub use slice::{ArenaSlice, InternedSlice};
47use std::borrow::Borrow;
48use std::cmp::Ordering;
49use std::fmt::Debug;
50use std::hash::{BuildHasher, Hash, Hasher};
51use std::marker::PhantomData;
52#[cfg(feature = "get-size2")]
53use std::mem::size_of;
54#[cfg(feature = "debug")]
55use std::sync::atomic::{self, AtomicUsize};
56
57/// A handle to an interned value in an [`Arena`].
58///
59/// This is generic over the logical value type `T` as well as its `Storage`
60/// type, that needs to be [`Sized`]. For [`Sized`] values, `Storage = T` is a
61/// good default that incurs no overhead. For non-[`Sized`] values such as
62/// [`str`], you need to specify a [`Sized`] storage type, such as `Box<T>`.
63pub struct Interned<T: ?Sized, Storage = T> {
64    id: u32,
65    _phantom: PhantomData<fn() -> (*const T, *const Storage)>,
66}
67
68impl<T: ?Sized, Storage> Debug for Interned<T, Storage> {
69    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
70        f.debug_tuple("I").field(&self.id).finish()
71    }
72}
73
74impl<T: ?Sized, Storage> Clone for Interned<T, Storage> {
75    fn clone(&self) -> Self {
76        *self
77    }
78}
79
80impl<T: ?Sized, Storage> Copy for Interned<T, Storage> {}
81
82impl<T: ?Sized, Storage> PartialEq for Interned<T, Storage> {
83    fn eq(&self, other: &Self) -> bool {
84        self.id.eq(&other.id)
85    }
86}
87
88impl<T: ?Sized, Storage> Eq for Interned<T, Storage> {}
89
90impl<T: ?Sized, Storage> PartialOrd for Interned<T, Storage> {
91    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
92        Some(self.cmp(other))
93    }
94}
95
96impl<T: ?Sized, Storage> Ord for Interned<T, Storage> {
97    fn cmp(&self, other: &Self) -> Ordering {
98        self.id.cmp(&other.id)
99    }
100}
101
102impl<T: ?Sized, Storage> Hash for Interned<T, Storage> {
103    fn hash<H>(&self, state: &mut H)
104    where
105        H: Hasher,
106    {
107        self.id.hash(state);
108    }
109}
110
111#[cfg(feature = "get-size2")]
112impl<T: ?Sized, Storage> GetSize for Interned<T, Storage> {
113    // There is nothing on the heap, so the default implementation works out of the
114    // box.
115}
116
117#[cfg(feature = "raw")]
118impl<T: ?Sized, Storage> Interned<T, Storage> {
119    /// Creates an interned value for the given index.
120    ///
121    /// This is a low-level function. You should instead use the
122    /// [`from()`](Self::from) API to intern a value, unless you really know
123    /// what you're doing.
124    pub fn from_id(id: u32) -> Self {
125        Self {
126            id,
127            _phantom: PhantomData,
128        }
129    }
130
131    /// Obtains the underlying interning index.
132    ///
133    /// This is a low-level function. You should instead use the
134    /// [`lookup()`](Self::lookup) and [`lookup_ref()`](Self::lookup_ref) APIs,
135    /// unless you really know what you're doing.
136    pub fn id(&self) -> u32 {
137        self.id
138    }
139}
140
141impl<T: ?Sized, Storage> Interned<T, Storage>
142where
143    T: Eq + Hash,
144    Storage: Borrow<T>,
145{
146    /// Interns the given value in the given [`Arena`].
147    ///
148    /// If the value was already interned in this arena, it will simply be
149    /// borrowed to retrieve its interning index. Otherwise it will then be
150    /// converted to store it into the arena.
151    pub fn from(arena: &Arena<T, Storage>, value: impl Borrow<T> + Into<Storage>) -> Self {
152        let id = arena.intern(value);
153        Self {
154            id,
155            _phantom: PhantomData,
156        }
157    }
158}
159
160impl<T: ?Sized, Storage> Interned<T, Storage>
161where
162    Storage: Clone,
163{
164    /// Retrieves this interned value from the given [`Arena`].
165    ///
166    /// The caller is responsible for ensuring that the same arena was used to
167    /// intern this value, otherwise an arbitrary value will be returned or
168    /// a panic will happen.
169    ///
170    /// See also [`lookup_ref()`](Self::lookup_ref) if you only need a
171    /// reference.
172    pub fn lookup(&self, arena: &Arena<T, Storage>) -> Storage {
173        arena.lookup(self.id)
174    }
175}
176
177impl<T: ?Sized, Storage> Interned<T, Storage>
178where
179    Storage: Borrow<T>,
180{
181    /// Retrieves a reference to this interned value from the given [`Arena`].
182    ///
183    /// The caller is responsible for ensuring that the same arena was used to
184    /// intern this value, otherwise an arbitrary value will be returned or
185    /// a panic will happen.
186    ///
187    /// See also [`lookup()`](Self::lookup) if you need an owned value.
188    pub fn lookup_ref<'a>(&self, arena: &'a Arena<T, Storage>) -> &'a T {
189        arena.lookup_ref(self.id)
190    }
191}
192
193#[cfg(feature = "serde")]
194impl<T: ?Sized, Storage> Serialize for Interned<T, Storage> {
195    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
196    where
197        S: Serializer,
198    {
199        serializer.serialize_u32(self.id)
200    }
201}
202
203#[cfg(feature = "serde")]
204impl<'de, T: ?Sized, Storage> Deserialize<'de> for Interned<T, Storage> {
205    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
206    where
207        D: Deserializer<'de>,
208    {
209        let id = deserializer.deserialize_u32(U32Visitor)?;
210        Ok(Self {
211            id,
212            _phantom: PhantomData,
213        })
214    }
215}
216
217#[cfg(feature = "serde")]
218struct U32Visitor;
219
220#[cfg(feature = "serde")]
221impl Visitor<'_> for U32Visitor {
222    type Value = u32;
223
224    fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
225        formatter.write_str("an integer between 0 and 2^32")
226    }
227
228    fn visit_u8<E>(self, value: u8) -> Result<Self::Value, E>
229    where
230        E: serde::de::Error,
231    {
232        Ok(u32::from(value))
233    }
234
235    fn visit_u16<E>(self, value: u16) -> Result<Self::Value, E>
236    where
237        E: serde::de::Error,
238    {
239        Ok(u32::from(value))
240    }
241
242    fn visit_u32<E>(self, value: u32) -> Result<Self::Value, E>
243    where
244        E: serde::de::Error,
245    {
246        Ok(value)
247    }
248
249    fn visit_u64<E>(self, value: u64) -> Result<Self::Value, E>
250    where
251        E: serde::de::Error,
252    {
253        value
254            .try_into()
255            .map_err(|_| E::custom(format!("u32 out of range: {}", value)))
256    }
257
258    fn visit_i8<E>(self, value: i8) -> Result<Self::Value, E>
259    where
260        E: serde::de::Error,
261    {
262        value
263            .try_into()
264            .map_err(|_| E::custom(format!("u32 out of range: {}", value)))
265    }
266
267    fn visit_i16<E>(self, value: i16) -> Result<Self::Value, E>
268    where
269        E: serde::de::Error,
270    {
271        value
272            .try_into()
273            .map_err(|_| E::custom(format!("u32 out of range: {}", value)))
274    }
275
276    fn visit_i32<E>(self, value: i32) -> Result<Self::Value, E>
277    where
278        E: serde::de::Error,
279    {
280        value
281            .try_into()
282            .map_err(|_| E::custom(format!("u32 out of range: {}", value)))
283    }
284
285    fn visit_i64<E>(self, value: i64) -> Result<Self::Value, E>
286    where
287        E: serde::de::Error,
288    {
289        value
290            .try_into()
291            .map_err(|_| E::custom(format!("u32 out of range: {}", value)))
292    }
293}
294
295/// Interning arena for values of type `T`, storing them with the given
296/// `Storage` type (that needs to be [`Sized`]).
297///
298/// For [`Sized`] values, `Storage = T` is a good default that incurs no
299/// overhead. For non-[`Sized`] values such as [`str`], you need to specify a
300/// [`Sized`] storage type, such as `Box<T>`.
301pub struct Arena<T: ?Sized, Storage = T> {
302    vec: AppendVec<Storage>,
303    map: DashTable<u32>,
304    hasher: DefaultHashBuilder,
305    #[cfg(feature = "debug")]
306    references: AtomicUsize,
307    _phantom: PhantomData<fn() -> *const T>,
308}
309
310impl<T: ?Sized, Storage> Default for Arena<T, Storage> {
311    fn default() -> Self {
312        Self {
313            vec: AppendVec::new(),
314            map: DashTable::new(),
315            hasher: DefaultHashBuilder::default(),
316            #[cfg(feature = "debug")]
317            references: AtomicUsize::new(0),
318            _phantom: PhantomData,
319        }
320    }
321}
322
323impl<T: ?Sized, Storage> Debug for Arena<T, Storage>
324where
325    T: Debug,
326    Storage: Borrow<T>,
327{
328    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
329        fmt.debug_list()
330            .entries(self.vec.iter().map(|x| x.borrow()))
331            .finish()
332    }
333}
334
335impl<T: ?Sized, Storage> PartialEq for Arena<T, Storage>
336where
337    T: Eq,
338    Storage: Borrow<T>,
339{
340    fn eq(&self, other: &Self) -> bool {
341        self.vec
342            .iter()
343            .map(|x| x.borrow())
344            .eq(other.vec.iter().map(|x| x.borrow()))
345    }
346}
347
348impl<T: ?Sized, Storage> Eq for Arena<T, Storage>
349where
350    T: Eq,
351    Storage: Borrow<T>,
352{
353}
354
355#[cfg(feature = "get-size2")]
356impl<T: ?Sized, Storage> GetSize for Arena<T, Storage>
357where
358    Storage: GetSize,
359{
360    fn get_heap_size_with_tracker<Tr: GetSizeTracker>(&self, tracker: Tr) -> (usize, Tr) {
361        let heap_size = self.vec.iter().map(|x| x.get_size()).sum::<usize>()
362            + self.vec.len() * size_of::<u32>();
363        (heap_size, tracker)
364    }
365}
366
367#[cfg(feature = "debug")]
368impl<T: ?Sized, Storage> Arena<T, Storage>
369where
370    Storage: GetSize,
371{
372    /// Prints a summary of the storage used by this arena to stdout.
373    pub fn print_summary(&self, prefix: &str, title: &str, total_bytes: usize) {
374        let len = self.len();
375        let references = self.references();
376        let estimated_bytes = self.get_size();
377        println!(
378            "{}[{:.02}%] {} interner: {} objects | {} bytes ({:.02} bytes/object) | {} references ({:.02} refs/object)",
379            prefix,
380            estimated_bytes as f64 * 100.0 / total_bytes as f64,
381            title,
382            len,
383            estimated_bytes,
384            estimated_bytes as f64 / len as f64,
385            references,
386            references as f64 / len as f64,
387        );
388    }
389}
390
391#[cfg(feature = "debug")]
392impl<T: ?Sized, Storage> Arena<T, Storage> {
393    fn len(&self) -> usize {
394        self.vec.len()
395    }
396
397    fn references(&self) -> usize {
398        self.references.load(atomic::Ordering::Relaxed)
399    }
400}
401
402impl<T: ?Sized, Storage> Arena<T, Storage>
403where
404    T: Eq + Hash,
405    Storage: Borrow<T>,
406{
407    fn intern(&self, value: impl Borrow<T> + Into<Storage>) -> u32 {
408        #[cfg(feature = "debug")]
409        self.references.fetch_add(1, atomic::Ordering::Relaxed);
410
411        let hash = self.hasher.hash_one(value.borrow());
412        *self
413            .map
414            .entry(
415                hash,
416                |&i| self.vec[i as usize].borrow() == value.borrow(),
417                |&i| self.hasher.hash_one(self.vec[i as usize].borrow()),
418            )
419            .or_insert_with(|| {
420                let x: Storage = value.into();
421                let id = self.vec.push(x);
422                assert!(id <= u32::MAX as usize);
423                id as u32
424            })
425            .get()
426    }
427
428    /// Unconditionally push a value, without validating that it's already
429    /// interned.
430    #[cfg(feature = "serde")]
431    fn push(&mut self, value: Storage) -> u32 {
432        #[cfg(feature = "debug")]
433        self.references.fetch_add(1, atomic::Ordering::Relaxed);
434
435        let hash = self.hasher.hash_one(value.borrow());
436
437        let id = self.vec.push_mut(value);
438        assert!(id <= u32::MAX as usize);
439        let id = id as u32;
440
441        self.map.insert_unique(hash, id, |&i| {
442            self.hasher.hash_one(self.vec[i as usize].borrow())
443        });
444
445        id
446    }
447}
448
449impl<T: ?Sized, Storage> Arena<T, Storage>
450where
451    Storage: Clone,
452{
453    fn lookup(&self, id: u32) -> Storage {
454        self.vec[id as usize].clone()
455    }
456}
457
458impl<T: ?Sized, Storage> Arena<T, Storage>
459where
460    Storage: Borrow<T>,
461{
462    fn lookup_ref(&self, id: u32) -> &T {
463        self.vec[id as usize].borrow()
464    }
465}
466
467#[cfg(feature = "serde")]
468impl<T: ?Sized, Storage> Serialize for Arena<T, Storage>
469where
470    T: Serialize,
471    Storage: Borrow<T>,
472{
473    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
474    where
475        S: Serializer,
476    {
477        serializer.collect_seq(self.vec.iter().map(|x| x.borrow()))
478    }
479}
480
481#[cfg(feature = "serde")]
482impl<'de, T: ?Sized, Storage> Deserialize<'de> for Arena<T, Storage>
483where
484    T: Eq + Hash,
485    Storage: Borrow<T> + Deserialize<'de>,
486{
487    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
488    where
489        D: Deserializer<'de>,
490    {
491        deserializer.deserialize_seq(ArenaVisitor::new())
492    }
493}
494
495#[cfg(feature = "serde")]
496struct ArenaVisitor<T: ?Sized, Storage> {
497    _phantom: PhantomData<fn() -> Arena<T, Storage>>,
498}
499
500#[cfg(feature = "serde")]
501impl<T: ?Sized, Storage> ArenaVisitor<T, Storage> {
502    fn new() -> Self {
503        Self {
504            _phantom: PhantomData,
505        }
506    }
507}
508
509#[cfg(feature = "serde")]
510impl<'de, T: ?Sized, Storage> Visitor<'de> for ArenaVisitor<T, Storage>
511where
512    T: Eq + Hash,
513    Storage: Borrow<T> + Deserialize<'de>,
514{
515    type Value = Arena<T, Storage>;
516
517    fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
518        formatter.write_str("a sequence of values")
519    }
520
521    fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
522    where
523        A: SeqAccess<'de>,
524    {
525        let mut arena = match seq.size_hint() {
526            None => Arena::default(),
527            Some(size_hint) => Arena {
528                vec: AppendVec::with_capacity(size_hint),
529                map: DashTable::with_capacity(size_hint),
530                hasher: DefaultHashBuilder::default(),
531                #[cfg(feature = "debug")]
532                references: AtomicUsize::new(0),
533                _phantom: PhantomData,
534            },
535        };
536
537        while let Some(t) = seq.next_element()? {
538            arena.push(t);
539        }
540
541        Ok(arena)
542    }
543}
544
545#[cfg(test)]
546mod test {
547    use super::*;
548    use std::borrow::Cow;
549
550    #[test]
551    fn test_str_interner() {
552        let arena: Arena<str, Box<str>> = Arena::default();
553
554        let key: &str = "Hello";
555        assert_eq!(arena.intern(key), 0);
556
557        let key: String = "world".into();
558        assert_eq!(arena.intern(key), 1);
559
560        let key: Box<str> = "Hello".into();
561        assert_eq!(arena.intern(key), 0);
562
563        let key: Box<str> = "world".into();
564        assert_eq!(arena.intern(key), 1);
565
566        let key: Cow<'_, str> = "Hello world".into();
567        assert_eq!(arena.intern(key), 2);
568    }
569}