interned/
lib.rs

1//! Interned provides highly optimized, thread-local, generic
2//! [interning](https://en.wikipedia.org/wiki/String_interning) via [`Interned<T>`] and a
3//! [memoization](https://en.wikipedia.org/wiki/Memoization) layer built on top of this
4//! interning layer, provided by [`Memoized<I, T>`], which can cache the result of an arbitrary
5//! input `I: Hash` and _intern_ this result in the underlying interning layer.
6//!
7//! Blanket implementations supporting `T` are provided for all primitives, slices of [`Sized`]
8//! `T` (including `&[u8]`), as well as [`str`] slices (`&str`). Support for additional
9//! arbitrary types can be added by implementing [`DataType`], [`Staticize`], and [`Hash`].
10//! [`str`] slices have a custom implementation since they are the only built-in unsized type
11//! with slice support.
12//!
13//! All values are heap-allocated `'static`s and benefit from [`TypeId`]-specific locality of
14//! reference in the heap. Any two [`Interned<T>`] instances that have the same value of `T`
15//! will be guaranteed to point to the same memory address in the heap. Among other things,
16//! this allows for `O(1)` (in the size of the data) equality comparisons since the heap
17//! addresses are compared instead of having to compare the underlying data bit-by-bit. This
18//! makes interned types especially suited for parsing and similar tasks.
19//!
20//! A caveat of the `'static` lifetime and immutability of the underlying heap data is that
21//! unique values of [`Interned<T>`] and [`Memoized<I, T>`] _leak_ in the sense that they can
22//! never be de-allocated. This allows us to implement [`Copy`] on all interned types, because
23//! we can rely on the heap pointer to continue existing for the life of the program once it
24//! has been created for a particular value. For this reason, you should _not_ use this crate
25//! for long-running programs that will encounter an unbounded number of unique values, such as
26//! those created by an unending stream of user input.
27//!
28//! Because the internal size of an [`Interned<T>`] _on the stack_ is the size of a [`usize`]
29//! (pointer) plus a [`u64`] (cached hash code), it would be silly to use [`Interned<T>`] with
30//! integer types directly, however it makes sense to do so for the purposes of memoizing an
31//! expensive computation via [`Memoized<I, T>`].
32//!
33//! An interned string type, [`InStr`], is also provided as a convenient wrapper around
34//! `Interned<&'static str>`. It has a number of extra impls and should be your go-to type if
35//! you want to work with interned strings.
36//!
37//! ### Interned Example
38//! ```
39//! use interned::*;
40//!
41//! let a: Interned<i32> = 1289.into();
42//! let b = Interned::from(1289);
43//! let c: Interned<i32> = 47.into();
44//! assert_eq!(a, b);
45//! assert_ne!(a, c);
46//! assert_eq!(a.as_ptr(), b.as_ptr());
47//! assert_ne!(b.as_ptr(), c.as_ptr());
48//! let d: Interned<&str> = "asdf".into();
49//! assert_ne!(d, "fdsa".into());
50//! assert_eq!(Interned::from("asdf"), d);
51//! let e = Interned::from([1, 2, 3, 4, 5].as_slice());
52//! let f = InStr::from("abc");
53//! let g: InStr = "abc".into();
54//! assert_eq!(f, g);
55//! assert_eq!(f.as_ptr(), g.as_ptr());
56//! assert_eq!(e, [1, 2, 3, 4, 5].as_slice().into());
57//! assert_ne!(e, [4, 1, 7].as_slice().into());
58//! assert_eq!(format!("{b:?}"), "Interned<i32> { value: 1289 }");
59//! assert_eq!(format!("{d:?}"), "Interned<&str> { str: \"asdf\" }");
60//! assert_eq!(e[3], 4);
61//! assert_eq!(e[0], 1);
62//! assert_eq!(
63//!     format!("{e:?}"),
64//!     "Interned<&[i32]> { slice: [1, 2, 3, 4, 5] }"
65//! );
66//! ```
67//!
68//! ### Memoized Examples
69//! ```
70//! use interned::*;
71//!
72//! let initial_interned = num_interned::<usize>();
73//! let initial_memoized = num_memoized::<usize>();
74//! let a = Memoized::from("scope a", "some_input", |input| input.len().into());
75//! let b = Memoized::from("scope a", "other", |input| input.len().into());
76//! assert_ne!(a, b);
77//! let c = Memoized::from("scope a", "some_input", |input| input.len().into());
78//! assert_eq!(a, c);
79//! assert_ne!(b, c);
80//! assert_eq!(a.as_value(), &10);
81//! assert_ne!(*a.as_value(), 11);
82//! assert_eq!(*b.interned().interned_value(), 5);
83//! assert_eq!(*c.as_value(), 10);
84//! assert_eq!(num_interned::<usize>(), initial_interned + 2);
85//! assert_eq!(num_memoized::<usize>(), initial_memoized + 2);
86//! ```
87//!
88//! The following demonstrates how "scopes" work with `Memoized`:
89//! ```
90//! use interned::*;
91//! fn expensive_fn(a: usize, b: usize, c: usize) -> String {
92//!     format!("{}", a * a + b * b + c * c)
93//! }
94//! let a = Memoized::from("my_scope", (1, 2, 3), |tup: (usize, usize, usize)| {
95//!     expensive_fn(tup.0, tup.1, tup.2).as_str().into()
96//! });
97//! assert_eq!(a.as_str(), "14");
98//! ```
99
100#[cfg(all(doc, feature = "generate-readme"))]
101docify::compile_markdown!("README.docify.md", "README.md");
102
103pub mod _unsafe;
104pub mod datatype;
105pub use datatype::DataType;
106pub mod memoized;
107pub use memoized::Memoized;
108pub mod unsized_types;
109pub use unsized_types::*;
110
111use _unsafe::*;
112use datatype::*;
113use staticize::*;
114
115use std::{
116    any::TypeId,
117    cell::RefCell,
118    collections::{
119        hash_map::{DefaultHasher, Entry},
120        HashMap,
121    },
122    ffi::OsStr,
123    fmt::Display,
124    hash::{BuildHasher, Hash, Hasher},
125    marker::PhantomData,
126    ops::Deref,
127    path::Path,
128};
129
130thread_local! {
131    /// Internal thread-local data structure used to store all interned values.
132    static INTERNED: RefCell<HashMap<TypeId, HashMap<u64, Static>, TypeIdHasherBuilder>> = RefCell::new(HashMap::with_hasher(TypeIdHasherBuilder));
133
134    /// Internal thread-local data structure used to store all memoized values.
135    static MEMOIZED: RefCell<HashMap<TypeId, HashMap<u64, Static>, TypeIdHasherBuilder>> = RefCell::new(HashMap::with_hasher(TypeIdHasherBuilder));
136}
137
138/// Internal [`Hasher`] used to hash a [`TypeId`] by simply using the underlying `u64` of the
139/// [`TypeId`] as the hash code. This results in a zero-cost hash operation for [`TypeId`].
140struct TypeIdHasher {
141    hash: Option<u64>,
142}
143
144impl Hasher for TypeIdHasher {
145    fn finish(&self) -> u64 {
146        self.hash.unwrap()
147    }
148
149    fn write(&mut self, bytes: &[u8]) {
150        debug_assert!(bytes.len() == 8);
151        self.hash = Some(bytes.as_ptr() as u64);
152    }
153}
154
155/// Internal [`BuildHasher`] used to set up [`TypeIdHasher`] in a usable form.
156struct TypeIdHasherBuilder;
157
158impl BuildHasher for TypeIdHasherBuilder {
159    type Hasher = TypeIdHasher;
160
161    fn build_hasher(&self) -> Self::Hasher {
162        TypeIdHasher { hash: None }
163    }
164}
165
166/// The main type of this crate. Represents a unique, heap-allocated, statically interned value
167/// that will exist for the life of the program.
168///
169/// Two instances of [`Interned`] for the same value `T` will always have the same heap memory
170/// address. Additionally, `Interned` values can be copied freely, since they are merely heap
171/// pointers.
172#[derive(Copy, Clone)]
173pub struct Interned<T: Hash> {
174    _value: PhantomData<T>,
175    #[doc(hidden)]
176    pub value: Static,
177}
178
179impl<T: Hash> Interned<T> {
180    /// Provides raw access to the raw heap pointer for this [`Interned`] value. Doing
181    /// something substantive with this value is unsafe. Useful for testing.
182    pub fn as_ptr(&self) -> *const () {
183        self.value.as_ptr()
184    }
185}
186
187impl<T: Hash + Copy + Staticize + DataType> From<Static> for Interned<T> {
188    fn from(value: Static) -> Self {
189        let type_id = T::static_type_id();
190        let entry = INTERNED.with(|interned| {
191            *interned
192                .borrow_mut()
193                .entry(type_id)
194                .or_insert_with(|| HashMap::new())
195                .entry(value.hash_code())
196                .or_insert(value)
197        });
198        Interned {
199            _value: PhantomData,
200            value: entry,
201        }
202    }
203}
204
205impl<T: Hash + Copy + Staticize + DataType + From<Interned<T>>> From<T> for Interned<T::Static>
206where
207    <T as Staticize>::Static: Hash + Sized,
208{
209    fn from(value: T) -> Interned<T::Static> {
210        let mut hasher = DefaultHasher::default();
211        value.hash(&mut hasher);
212        let hash = hasher.finish();
213        let type_id = T::static_type_id();
214        let entry = INTERNED.with(|interned| {
215            *interned
216                .borrow_mut()
217                .entry(type_id)
218                .or_insert_with(|| HashMap::new())
219                .entry(hash)
220                .or_insert_with(|| value.to_static_with_hash(Some(hash)))
221        });
222        Interned {
223            _value: PhantomData,
224            value: entry,
225        }
226    }
227}
228
229impl<T: Hash + Staticize + DataType<Type = Slice>> Interned<T> {
230    /// Returns a the underlying slice interned in this [`Interned`]. Calling this method on a
231    /// non-slice will panic.
232    pub fn interned_slice<'a>(&self) -> &'a [T::SliceValueType] {
233        unsafe { self.value.as_slice::<T::SliceValueType>() }
234    }
235}
236
237impl Interned<&str> {
238    /// Returns a reference to the underlying `&str` interned in this [`Interned`]. Calling
239    /// this method on a non-string will panic.
240    pub fn interned_str<'a>(&self) -> &'a str {
241        self.value.as_str()
242    }
243}
244
245impl Interned<&OsStr> {
246    /// Returns a reference to the underlying `&OsStr` interned in this [`Interned`]. Calling
247    /// this method on a non-OsStr will panic.
248    pub fn interned_os_str<'a>(&self) -> &'a OsStr {
249        self.value.as_os_str()
250    }
251}
252
253impl Interned<&Path> {
254    /// Returns a reference to the underlying `&Path` interned in this [`Interned`]. Calling
255    /// this method on a non-Path will panic.
256    pub fn interned_path<'a>(&self) -> &'a Path {
257        self.value.as_path()
258    }
259}
260
261impl<T: Hash + Staticize + DataType<Type = Value>> Interned<T> {
262    /// Returns a reference to the underlying `T` interned in this [`Interned`]. Calling this
263    /// method on a non-value will panic.
264    pub fn interned_value<'a>(&self) -> &'a T {
265        unsafe { self.value.as_value() }
266    }
267}
268
269impl<T: Hash + Staticize + DataType> Deref for Interned<T> {
270    type Target = T::DerefTargetType;
271
272    // this `Deref` implementation safely generalizes to the proper underlying type.
273    fn deref(&self) -> &Self::Target {
274        match self.value {
275            Static::Slice(static_slice) => unsafe {
276                let target_ref: &[T::SliceValueType] =
277                    &*(static_slice.ptr as *const [T::SliceValueType]);
278                std::mem::transmute_copy(&target_ref)
279            },
280            Static::Value(static_value) => unsafe {
281                std::mem::transmute_copy(&static_value.as_value::<T>())
282            },
283            Static::Str(static_str) => unsafe { std::mem::transmute_copy(&static_str.as_str()) },
284            Static::OsStr(static_os_str) => unsafe {
285                std::mem::transmute_copy(&static_os_str.as_os_str())
286            },
287            Static::Path(static_path) => unsafe {
288                std::mem::transmute_copy(&static_path.as_path())
289            },
290        }
291    }
292}
293
294impl<T: Hash + PartialEq + Staticize + DataType> PartialEq for Interned<T>
295where
296    <T as DataType>::SliceValueType: PartialEq,
297{
298    fn eq(&self, other: &Self) -> bool {
299        unsafe { self.value._partial_eq::<T>(&other.value) }
300    }
301}
302
303impl<T: Hash + Staticize + Eq + DataType> Eq for Interned<T>
304where
305    T: PartialEq,
306    <T as DataType>::SliceValueType: PartialEq,
307{
308}
309
310impl<T: Hash + Staticize + PartialOrd + DataType> PartialOrd for Interned<T>
311where
312    <T as DataType>::SliceValueType: PartialEq,
313{
314    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
315        unsafe { self.value._partial_cmp::<T>(&other.value) }
316    }
317}
318
319impl<T: Hash + Staticize + Ord + DataType> Ord for Interned<T>
320where
321    <T as DataType>::SliceValueType: PartialEq,
322{
323    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
324        unsafe { self.value._cmp::<T>(&other.value) }
325    }
326}
327
328impl<T: Hash + Staticize> Hash for Interned<T> {
329    fn hash<H: Hasher>(&self, state: &mut H) {
330        unsafe { self.value._hash::<T, H>(state) }
331    }
332}
333
334impl<T: Hash + Staticize + DataType + std::fmt::Debug> std::fmt::Debug for Interned<T>
335where
336    <T as DataType>::SliceValueType: std::fmt::Debug,
337{
338    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
339        let mut f = f.debug_struct(format!("Interned<{}>", T::static_type_name()).as_str());
340        let ret = match self.value {
341            Static::Value(value) => f.field("value", unsafe { value.as_value::<T>() }),
342            Static::Slice(slice) => {
343                f.field("slice", unsafe { &slice.as_slice::<T::SliceValueType>() })
344            }
345            Static::Str(string) => f.field("str", &string.as_str()),
346            Static::OsStr(os_str) => f.field("OsStr", &os_str.as_os_str()),
347            Static::Path(path) => f.field("Path", &path.as_path()),
348        }
349        .finish();
350        ret
351    }
352}
353
354impl<T: Hash + Display> Display for Interned<T> {
355    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
356        use std::fmt::Pointer;
357        match self.value {
358            Static::Value(value) => unsafe { value.as_value::<T>().fmt(f) },
359            Static::Slice(slice) => unsafe { slice.as_slice::<T>().fmt(f) },
360            Static::Str(string) => string.as_str().fmt(f),
361            Static::OsStr(os_str) => os_str.as_os_str().fmt(f),
362            Static::Path(path) => path.as_path().fmt(f),
363        }
364    }
365}
366
367/// Returns the number of items currently memoized by [`Memoized`] on the current thread for
368/// the specified type `T`. This is useful for testing and debugging.
369pub fn num_memoized<T: Staticize>() -> usize {
370    let type_id = T::static_type_id();
371    MEMOIZED.with(|interned| interned.borrow_mut().entry(type_id).or_default().len())
372}
373
374/// Returns the number of items currently interned by [`Interned`] on the current thread for
375/// the specified type `T`. This is useful for testing and debugging.
376pub fn num_interned<T: Staticize>() -> usize {
377    let type_id = T::static_type_id();
378    INTERNED.with(|interned| interned.borrow_mut().entry(type_id).or_default().len())
379}
380
381/// Derives [`From<Interned<T>>`] for the specified value type.
382#[macro_export]
383macro_rules! derive_from_interned_impl_value {
384    ($ty:ty) => {
385        impl From<$crate::Interned<$ty>> for $ty {
386            fn from(value: Interned<$ty>) -> Self {
387                use $crate::_unsafe::Static::*;
388                match value.value {
389                    Value(val) => unsafe { *val.as_value() },
390                    _ => unreachable!(),
391                }
392            }
393        }
394    };
395}
396
397/// Derives [`From<Interned<T>>`] for the specified slice type.
398#[macro_export]
399macro_rules! derive_from_interned_impl_slice {
400    ($ty:ty) => {
401        impl From<$crate::Interned<$ty>> for $ty {
402            fn from(value: Interned<$ty>) -> Self {
403                use $crate::_unsafe::Static::*;
404                match value.value {
405                    Slice(slice) => unsafe { slice.as_slice() },
406                    _ => unreachable!(),
407                }
408            }
409        }
410    };
411}
412
413impl From<Interned<&str>> for &str {
414    fn from(value: Interned<&str>) -> Self {
415        value.interned_str()
416    }
417}
418
419impl From<Interned<&OsStr>> for &OsStr {
420    fn from(value: Interned<&OsStr>) -> Self {
421        value.interned_os_str()
422    }
423}
424
425impl From<Interned<&Path>> for &Path {
426    fn from(value: Interned<&Path>) -> Self {
427        value.interned_path()
428    }
429}
430
431derive_from_interned_impl_value!(char);
432derive_from_interned_impl_value!(bool);
433derive_from_interned_impl_value!(usize);
434derive_from_interned_impl_value!(u8);
435derive_from_interned_impl_value!(u16);
436derive_from_interned_impl_value!(u32);
437derive_from_interned_impl_value!(u64);
438derive_from_interned_impl_value!(u128);
439derive_from_interned_impl_value!(i8);
440derive_from_interned_impl_value!(i16);
441derive_from_interned_impl_value!(i32);
442derive_from_interned_impl_value!(i64);
443derive_from_interned_impl_value!(i128);
444derive_from_interned_impl_slice!(&[bool]);
445derive_from_interned_impl_slice!(&[usize]);
446derive_from_interned_impl_slice!(&[u8]);
447derive_from_interned_impl_slice!(&[u16]);
448derive_from_interned_impl_slice!(&[u32]);
449derive_from_interned_impl_slice!(&[u64]);
450derive_from_interned_impl_slice!(&[u128]);
451derive_from_interned_impl_slice!(&[i8]);
452derive_from_interned_impl_slice!(&[i16]);
453derive_from_interned_impl_slice!(&[i32]);
454derive_from_interned_impl_slice!(&[i64]);
455derive_from_interned_impl_slice!(&[i128]);