texcraft_stdext/collections/interner.rs
1//! String interning
2//!
3//! A string interner is a data structure that enables strings to be represented as integers
4//! in a computer program.
5//! Interning strings is often an optimization because only one copy of each distinct string is stored,
6//! the string type is smaller and more cache friendly,
7//! and string operations like comparisons are faster.
8//! The cost of string interning (at least as implemented here) is that once a string is interned,
9//! it is never deallocated.
10//!
11//! When using the [Interner] in this module, strings are interned using the [get_or_intern](Interner::get_or_intern) method.
12//! This method returns a _key_.
13//! If the same string is interned twice, the same key is returned.
14//! The type of the key is fixed for each instance of the interner, and can be any
15//! type that implements the [Key] trait.
16//! By default the interner uses [std::num::NonZeroU32], which is a 32-bit integer.
17//!
18//! Given a key, the original string value can be recovered using the [resolve](Interner::resolve) method.
19//!
20//! ```
21//! # use texcraft_stdext::collections::interner::Interner;
22//! let mut interner: Interner = Default::default();
23//! let hello_1 = interner.get_or_intern("hello");
24//! let world_1 = interner.get_or_intern("world");
25//! let hello_2 = interner.get_or_intern("hello");
26//! assert_eq!(hello_1, hello_2);
27//! assert_ne!(hello_1, world_1);
28//!
29//! assert_eq!(interner.resolve(hello_1), Some("hello"));
30//! assert_eq!(interner.resolve(world_1), Some("world"));
31//!
32//! assert_eq!(interner.get("hello"), Some(hello_1));
33//! assert_eq!(interner.get("other"), None);
34//! ```
35//!
36//! The code in the interner is written from scratch, but all aspects of it
37//! (the algorithm, the API, even some variable names) are based on Robin Freyler's
38//! [string-interner](https://docs.rs/crate/string-interner/latest) crate.
39//! For this reason the code is jointly copyrighted between Robin Freyler and the Texcraft contributors.
40//!
41//! ## The implementation
42//!
43//! The algorithm is based on the [string-interner](https://docs.rs/crate/string-interner/latest) crate.
44//! This algorithm is also separately discovered and discussed in
45//! [a post by Mat Klad](https://matklad.github.io/2020/03/22/fast-simple-rust-interner.html).
46//!
47//! The interner maintains a [String] buffer, and each time a new string is interned it's appended to the buffer.
48//! A vector of indices is used to record the position of each string in the buffer.
49//! When a new string is added to the buffer, the current length of the buffer (which is the end index
50//! of the string in the buffer) is appended to the vector.
51//! The key of the string is then the length of the vector when the index is appended.
52//! Thus using the key, we can easily find the end index.
53//!
54//! To recover a string using its key, we get the end index from the vector.
55//! We get the start index by getting the end index of the _previous_ string that was interned.
56//! Given the process described above, the start index is stored in the vector just before the end index.
57//! The recovered string is then the substring of the buffer between these two indices.
58//!
59//! This handles adding new strings.
60//! A key property of the interner is that it also deduplicates strings.
61//! The naive way to do this is to maintain a map from strings to keys, and first search
62//! in this map for the string.
63//! The problem with this approach is that it requires a costly second allocation
64//! of each interned string in this map.
65//!
66//! The string-interner crate and this module use different approaches to fix this.
67//! In the crate, the map is keyed on the interned string's integer key.
68//! When performing operations on the map, hash and equality of keys is based on the underlying string.
69//!
70//! In this module, the map is keyed on the [u64] hash of each string, which is computed outside of the map.
71//! There is an edge case in which the hashes of two strings collide.
72//! For this reason the value of the map is a linked list of all string keys that have the corresponding hash.
73//! When checking if a string exists, we walk the linked list and check if the resolved string for each key
74//! matches.
75//! If not, we intern the string and append to the linked list.
76//! In the worst case this can result in O(n) lookups, but in reality hash collisions are rare.
77
78use std::collections::hash_map;
79use std::collections::HashMap;
80use std::hash;
81use std::num;
82
83/// String interner.
84///
85/// See the module documentation for information about this data structure.
86#[cfg_attr(feature = "serde", derive(serde::Serialize))]
87pub struct Interner<K = num::NonZeroU32, S = hash_map::RandomState> {
88 buffer: String,
89 ends: Vec<usize>,
90 // When deserializing the interner, we reconstruct the deduplication map. We do this because the hash
91 // builder in the deserialized interner will in general be different and so the keys of the map
92 // will have changed. Additionally this is more efficient.
93 #[cfg_attr(feature = "serde", serde(skip))]
94 dedup: DeDupMap<K>,
95 #[cfg_attr(feature = "serde", serde(skip))]
96 hash_builder: S,
97}
98
99impl<K, S: Default> Default for Interner<K, S> {
100 fn default() -> Self {
101 Self {
102 buffer: Default::default(),
103 ends: Default::default(),
104 dedup: Default::default(),
105 hash_builder: Default::default(),
106 }
107 }
108}
109
110/// Types implementing this trait can be used as keys in the [Interner].
111pub trait Key: Copy + Eq {
112 /// Try to create a key from the provided [usize]. The first [usize]
113 /// passed to this method will be 0; the second 1; and so on.
114 ///
115 /// This method is more or less the same as the well-known [`TryFrom<usize>`] trait.
116 /// We use a custom trait so that consumers don't have to implement the well-known trait.
117 fn try_from_usize(index: usize) -> Option<Self>;
118 /// Convert the key into a [usize].
119 ///
120 /// This method is more or less the same as the well-known [`Into<usize>`] trait.
121 /// We use a custom trait so that consumers don't have to implement the well-known trait.
122 fn into_usize(self) -> usize;
123}
124
125impl Key for num::NonZeroU32 {
126 fn try_from_usize(index: usize) -> Option<Self> {
127 let u32: u32 = match index.try_into() {
128 Ok(u32) => u32,
129 Err(_) => return None,
130 };
131 num::NonZeroU32::new(u32 + 1)
132 }
133
134 fn into_usize(self) -> usize {
135 self.get() as usize
136 }
137}
138
139impl<K: Key, S: hash::BuildHasher> Interner<K, S> {
140 /// Intern the provided string and return its key.
141 pub fn get_or_intern(&mut self, s: &str) -> K {
142 // First we check if the string has already been interned.
143 let hash = hash(&self.hash_builder, s);
144 if let Some(key) = self.get_internal(s, hash) {
145 return key;
146 }
147
148 // If the string hasn't been interned, we now intern it.
149 let key = K::try_from_usize(self.ends.len()).unwrap();
150 self.buffer.push_str(s);
151 let end = self.buffer.len();
152 self.ends.push(end);
153 populate_dedup_map(&mut self.dedup, hash, key);
154 key
155 }
156
157 /// Get the key for the provided string if it has been already been interned.
158 ///
159 /// This method is useful when the caller only has a shared reference to the interner.
160 pub fn get(&self, s: &str) -> Option<K> {
161 self.get_internal(s, hash(&self.hash_builder, s))
162 }
163
164 fn get_internal(&self, s: &str, hash: u64) -> Option<K> {
165 let mut node_or = self.dedup.get(&hash);
166 while let Some(node) = node_or {
167 if self.resolve(node.key).unwrap() == s {
168 return Some(node.key);
169 }
170 node_or = match &node.next {
171 None => None,
172 Some(node) => Some(node),
173 };
174 }
175 None
176 }
177
178 /// Return the interned string corresponding to the provided key.
179 pub fn resolve(&self, k: K) -> Option<&str> {
180 let i = k.into_usize().wrapping_sub(1);
181 let start = match i.checked_sub(1) {
182 None => 0,
183 Some(prev_k) => match self.ends.get(prev_k) {
184 None => return None,
185 Some(start) => *start,
186 },
187 };
188 let end = match self.ends.get(i) {
189 None => return None,
190 Some(end) => *end,
191 };
192 Some(&self.buffer[start..end])
193 }
194}
195
196fn hash<S: hash::BuildHasher>(hash_builder: &S, s: &str) -> u64 {
197 use std::hash::{Hash, Hasher};
198 let mut hasher = hash_builder.build_hasher();
199 s.hash(&mut hasher);
200 hasher.finish()
201}
202
203type DeDupMap<K> = HashMap<u64, LinkedList<K>, hash::BuildHasherDefault<SingleU64Hasher>>;
204
205fn populate_dedup_map<K>(map: &mut DeDupMap<K>, hash: u64, key: K) {
206 match map.entry(hash) {
207 hash_map::Entry::Occupied(mut o) => {
208 let first = o.get_mut();
209 let second = std::mem::replace(first, LinkedList { key, next: None });
210 first.next = Some(Box::new(second));
211 }
212 hash_map::Entry::Vacant(v) => {
213 v.insert(LinkedList { key, next: None });
214 }
215 };
216}
217
218struct LinkedList<K> {
219 key: K,
220 next: Option<Box<LinkedList<K>>>,
221}
222
223/// A hasher that can only hash a single [u64] value, and whose result is simply the [u64] value.
224///
225/// This hasher is used to make the hashing in the interner's deduplication map a no-op.
226/// We use this hasher because the [u64] key for the map is already a hash (of a string),
227/// and hashing the value again is wasteful.
228///
229/// The implementation of this hasher uses safe Rust and performs at least two panic-able checks
230/// on the hot path of hashing the value.
231/// However when compiled, the entire hasher is completely optimized out, and the
232/// hashing function inside the hash map becomes the identity function for the [u64] type.
233#[derive(Default)]
234struct SingleU64Hasher {
235 val: Option<u64>,
236}
237
238impl hash::Hasher for SingleU64Hasher {
239 #[inline]
240 fn finish(&self) -> u64 {
241 self.val.unwrap()
242 }
243
244 fn write(&mut self, _: &[u8]) {
245 panic!("this hasher does not support writing arbitrary bytes, only a single u64 value")
246 }
247
248 #[inline]
249 fn write_u64(&mut self, i: u64) {
250 if self.val.is_some() {
251 panic!("this hasher does not support writing multiple u64 values")
252 }
253 self.val = Some(i)
254 }
255}
256
257#[cfg(feature = "serde")]
258impl<'de, K: Key, S: Default + hash::BuildHasher> serde::Deserialize<'de> for Interner<K, S> {
259 fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
260 where
261 D: serde::Deserializer<'de>,
262 {
263 #[derive(serde::Deserialize)]
264 struct DeserializedInterner {
265 buffer: String,
266 ends: Vec<usize>,
267 }
268
269 let DeserializedInterner { buffer, ends } =
270 DeserializedInterner::deserialize(deserializer)?;
271 let hash_builder = S::default();
272 let mut dedup = DeDupMap::<K>::default();
273 dedup.reserve(ends.len());
274
275 let mut start: usize = 0;
276 for (i, end) in ends.iter().enumerate() {
277 let s = &buffer[start..*end];
278 let hash = hash(&hash_builder, s);
279 let key = K::try_from_usize(i).unwrap();
280 populate_dedup_map(&mut dedup, hash, key);
281 start = *end;
282 }
283 Ok(Self {
284 buffer,
285 ends,
286 dedup,
287 hash_builder,
288 })
289 }
290}
291
292#[cfg(test)]
293mod tests {
294 use super::*;
295
296 /// A hasher that always returns the same fixed value.
297 /// This is use to test hash collisions.
298 #[derive(Default)]
299 struct FixedHasher;
300
301 impl hash::Hasher for FixedHasher {
302 fn finish(&self) -> u64 {
303 12
304 }
305
306 fn write(&mut self, _: &[u8]) {}
307 }
308
309 #[test]
310 fn test_hash_collision() {
311 let mut interner: Interner<num::NonZeroU32, hash::BuildHasherDefault<FixedHasher>> =
312 Default::default();
313 let hello_1 = interner.get_or_intern("hello");
314 let world_1 = interner.get_or_intern("world");
315 let hello_2 = interner.get_or_intern("hello");
316 assert_eq!(hello_1, hello_2);
317 assert_ne!(hello_1, world_1);
318
319 assert_eq!(interner.resolve(hello_1), Some("hello"));
320 assert_eq!(interner.resolve(world_1), Some("world"));
321 }
322
323 #[cfg(feature = "serde")]
324 #[test]
325 fn test_serde() {
326 let mut interner: Interner = Default::default();
327 let hello_1 = interner.get_or_intern("hello");
328 let world_1 = interner.get_or_intern("world");
329
330 let serialized = serde_json::to_string_pretty(&interner).unwrap();
331 let mut interner_de: Interner = serde_json::from_str(&serialized).unwrap();
332 let hello_2 = interner_de.get_or_intern("hello");
333 let world_2 = interner_de.get_or_intern("world");
334
335 assert_eq!(hello_1, hello_2);
336 assert_eq!(world_1, world_2);
337 }
338}