intern_string/
lib.rs

1use fxhash::{FxBuildHasher, FxHashMap};
2
3#[derive(Default)]
4pub struct Intern<'a> {
5    data: FxHashMap<&'a str, InternId>,
6    list: Vec<Box<str>>,
7}
8
9pub type InternId = u32;
10
11impl Intern<'_> {
12    /// Create a new intern table.
13    pub fn new() -> Self {
14        Self {
15            data: FxHashMap::default(),
16            list: Vec::new(),
17        }
18    }
19
20    /// Create a new intern table with the given capacity.
21    pub fn with_capacity(capacity: usize) -> Self {
22        Self {
23            data: FxHashMap::with_capacity_and_hasher(capacity, FxBuildHasher::default()),
24            list: Vec::with_capacity(capacity),
25        }
26    }
27
28    /// Intern a string.
29    /// Returns the interned id.
30    /// If the string is already interned, returns the existing id.
31    /// The string is stored in the intern table for the lifetime of the program.
32    /// The id is a 32-bit integer, so there can be at most 2^32 unique strings interned.
33    /// If the limit is reached, this function will panic.
34    /// The id is guaranteed to be unique for the lifetime of the program.
35    ///
36    /// ## Examples
37    ///
38    /// ```
39    /// use intern_string::Intern;
40    ///
41    /// let mut intern = Intern::new();
42    /// let id = intern.intern("hello");
43    /// assert_eq!(intern.lookup(id), "hello");
44    /// ```
45    #[inline]
46    pub fn intern<V: Into<String> + AsRef<str>>(&mut self, input: V) -> InternId {
47        if let Some(&id) = self.data.get(input.as_ref()) {
48            return id;
49        }
50
51        let owned = input.into().into_boxed_str();
52
53        let str_data = owned.as_ptr();
54        let str_len = owned.len();
55
56        let id = self.list.len() as InternId;
57        self.list.push(owned);
58
59        // SAFETY: we can do this because the allocations inside of a Box<str>
60        // are stable, and so passing ownership to push does not change the
61        // address.
62        //
63        // additionally, because we have not touched the string since we created
64        // these raw pointers ourselves, we know that it is valid UTF-8 and so
65        // can skip that check as well.
66        let k =
67            unsafe { std::str::from_utf8_unchecked(std::slice::from_raw_parts(str_data, str_len)) };
68
69        self.data.insert(k, id);
70        id
71    }
72
73    /// Lookup the interned string by id.
74    ///
75    /// # Panics
76    ///
77    /// Panics if the id is not valid.
78    ///
79    /// # Examples
80    ///
81    /// ```
82    /// use intern_string::Intern;
83    ///
84    /// let mut intern = Intern::new();
85    /// let id = intern.intern("hello");
86    /// assert_eq!(intern.lookup(id), "hello");
87    /// ```
88    #[inline]
89    pub fn lookup(&self, id: InternId) -> &str {
90        &self.list[id as usize]
91    }
92
93    /// Lookup the interned string by id.
94    /// Returns `None` if the id is not valid.
95    ///
96    /// # Examples
97    ///
98    /// ```
99    /// use intern_string::Intern;
100    ///
101    /// let mut intern = Intern::new();
102    /// let id = intern.intern("hello");
103    /// assert_eq!(intern.try_lookup(id), Some("hello"));
104    /// ```
105    #[inline]
106    pub fn try_lookup(&self, id: InternId) -> Option<&str> {
107        self.list.get(id as usize).map(|s| &**s)
108    }
109}
110
111#[cfg(test)]
112mod tests {
113    use super::*;
114
115    #[test]
116    fn interns_and_handles_lookup() {
117        let mut interner = Intern::new();
118        let id = interner.intern("hello");
119        assert_eq!(interner.lookup(id), "hello");
120        assert_eq!(interner.try_lookup(id), Some("hello"));
121    }
122
123    #[test]
124    fn reallocate() {
125        let mut interner = Intern::with_capacity(1);
126        let id1 = interner.intern("hello");
127
128        // this should reallocate the internal list
129        let id2 = interner.intern("world");
130
131        assert_eq!(interner.lookup(id1), "hello");
132        assert_eq!(interner.try_lookup(id1), Some("hello"));
133        assert_eq!(interner.lookup(id2), "world");
134        assert_eq!(interner.try_lookup(id2), Some("world"));
135    }
136}