jupiter/ig/symbols.rs
1//! Provides a `SymbolTable` and a `SymbolMap` for an efficient and compact representation
2//! of objects / inner maps.
3//!
4//! As most datasets consist of many objects which all share the same keys, we keep the actual
5//! strings in a symbol table and only store the symbols in the actual objects.
6//!
7//! This heavily reduces the memory being consumed and also permits a very cache friendly and there-
8//! fore super fast lookup for values in inner objects.
9use std::collections::HashMap;
10use std::slice::Iter;
11
12/// Defines the representation of a symbol.
13///
14/// Note that a single `Doc` (and its `SymbolTable') may only store up to 2^31 - 1 distinct symbols.
15pub type Symbol = i32;
16
17/// Used to resolve and lookup symbols.
18#[derive(Clone, Default)]
19pub struct SymbolTable {
20 table: HashMap<String, Symbol>,
21 symbols: Vec<String>,
22}
23
24impl SymbolTable {
25 /// Creates a new and empty symbol table.
26 ///
27 /// # Example
28 ///
29 /// ```
30 /// # use jupiter::ig::symbols::SymbolTable;
31 /// let table = SymbolTable::new();
32 ///
33 /// assert_eq!(table.len(), 0);
34 /// ```
35 pub fn new() -> Self {
36 SymbolTable {
37 table: HashMap::new(),
38 symbols: Vec::new(),
39 }
40 }
41
42 /// Tries to resolve the given `string` into an existing `Symbol`.
43 ///
44 /// If no symbol with the given name is known, `None` is returned. If a new
45 /// symbol should be created instead, use `find_or_create`.
46 ///
47 /// # Example
48 /// ```
49 /// # use jupiter::ig::symbols::SymbolTable;
50 /// let mut table = SymbolTable::new();
51 ///
52 /// let symbol = table.find_or_create("Test").unwrap();
53 ///
54 /// assert_eq!(table.resolve("Test").unwrap(), symbol);
55 /// assert_eq!(table.resolve("Unknown").is_none(), true);
56 /// ```
57 pub fn resolve(&self, string: impl AsRef<str>) -> Option<Symbol> {
58 self.table.get(string.as_ref()).copied()
59 }
60
61 /// Resolve the given `string` into a new or an existing `Symbol`.
62 ///
63 /// If no symbol should be created if the given name is unknown, use `resolve`.
64 ///
65 /// # Errors
66 ///
67 /// This will return an error if the internal symbol table overflows (if there are more than
68 /// std::i32::MAX - 2 symbols).
69 ///
70 /// # Example
71 /// ```
72 /// # use jupiter::ig::symbols::SymbolTable;
73 /// let mut table = SymbolTable::new();
74 ///
75 /// let symbol = table.find_or_create("Test").unwrap();
76 /// assert_eq!(table.resolve("Test").unwrap(), symbol);
77 /// ```
78 pub fn find_or_create(&mut self, string: impl AsRef<str>) -> anyhow::Result<Symbol> {
79 let value = string.as_ref();
80 if let Some(symbol) = self.table.get(value) {
81 Ok(*symbol)
82 } else if self.symbols.len() >= i32::MAX as usize {
83 Err(anyhow::anyhow!("Symbol table overflow!"))
84 } else {
85 let new_symbol = (self.symbols.len() + 1) as i32;
86
87 let _ = self.table.insert(value.to_owned(), new_symbol);
88 self.symbols.push(value.to_owned());
89
90 Ok(new_symbol)
91 }
92 }
93
94 /// Retrieves the name of the given `Symbol`.
95 ///
96 /// # Examples
97 /// ```
98 /// # use jupiter::ig::symbols::SymbolTable;
99 /// let mut table = SymbolTable::new();
100 ///
101 /// let symbol = table.find_or_create("Test").unwrap();
102 ///
103 /// // A known symbol can be looked up...
104 /// assert_eq!(table.lookup(symbol), "Test");
105 ///
106 /// // An unknown symbol is simply translated to ""
107 /// assert_eq!(table.lookup(1024), "");
108 /// ```
109 pub fn lookup(&self, symbol: Symbol) -> &str {
110 self.symbols
111 .get((symbol - 1) as usize)
112 .map(|string| string.as_str())
113 .unwrap_or("")
114 }
115
116 /// Determines the number of known symbols in the table.
117 ///
118 /// # Examples
119 /// ```
120 /// # use jupiter::ig::symbols::SymbolTable;
121 /// let mut table = SymbolTable::new();
122 ///
123 /// // The same symbol is only added once to a table...
124 /// let symbol = table.find_or_create("Test").unwrap();
125 /// let symbol1 = table.find_or_create("Test").unwrap();
126 /// assert_eq!(symbol, symbol1);
127 ///
128 /// // ..therefore the table size is 1.
129 /// assert_eq!(table.len(), 1);
130 ///
131 /// // If we add another symbol...
132 /// table.find_or_create("Test 2").unwrap();
133 ///
134 /// // ...the size grows to 2.
135 /// assert_eq!(table.len(), 2);
136 /// ```
137 pub fn len(&self) -> usize {
138 self.symbols.len()
139 }
140
141 /// Determines if the symbol table is empty.
142 ///
143 /// # Examples
144 /// ```
145 /// # use jupiter::ig::symbols::SymbolTable;
146 /// let mut table = SymbolTable::new();
147 ///
148 /// assert_eq!(table.is_empty(), true);
149 /// let _ = table.find_or_create("Test").unwrap();
150 /// assert_eq!(table.is_empty(), false);
151 /// ```
152 pub fn is_empty(&self) -> bool {
153 self.symbols.is_empty()
154 }
155
156 /// Estimates the allocated memory required to represent the symbol table.
157 ///
158 /// Note that this is only an approximation as some inner types to not reveal their
159 /// size.
160 ///
161 /// # Example
162 ///
163 /// ```
164 /// # use jupiter::ig::symbols::SymbolTable;
165 /// let mut table = SymbolTable::new();
166 ///
167 /// table.find_or_create("Hello").unwrap();
168 /// table.find_or_create("World").unwrap();
169 ///
170 /// println!("{}", table.allocated_size());
171 /// ```
172 pub fn allocated_size(&self) -> usize {
173 // This is the semi-official way of estimating the size of the underlying hash table..
174 //
175 // Internally a bit more than the actual capacity is allocated to guarantee a proper load
176 // factor...
177 let table_size = self.table.capacity() * 11 / 10
178 // Per entry, the key, its value and a hash is stored...
179 * (std::mem::size_of::<usize>() + std::mem::size_of::<String>() + std::mem::size_of::<Symbol>());
180
181 // The lookup-table is known to be simply a Vec of strings...
182 let lookup_size = self.symbols.capacity() * std::mem::size_of::<String>();
183
184 // And of course we have to add up the bytes in each string...
185 let content_size: usize = self.symbols.iter().map(|string| string.len()).sum();
186
187 table_size + lookup_size + content_size
188 }
189}
190
191/// Provides a efficient lookup map using `Symbol` as key.
192///
193/// This map is optimized as we know for sure, that the key is an `i32`. Also, we will use this
194/// map for objects within `Node` and therefore know that most commonly there will be only a few
195/// keys (most probably < 8 and almost always < 32).
196///
197/// Therefore we simply use a `Vec` of tuples to store the keys and their value. We also keep
198/// this vector sorted by the key so that a lookup can be done via `binary_search`.
199///
200/// Using this approach easily beats `HashMap` in both, performance and memory consumption.
201#[derive(Debug, Eq, PartialEq, Default)]
202pub struct SymbolMap<V> {
203 entries: Vec<(Symbol, V)>,
204}
205
206impl<V: Default> SymbolMap<V> {
207 /// Creates a new and empty map.
208 ///
209 /// # Examples
210 ///
211 /// ```
212 /// # use jupiter::ig::symbols::SymbolMap;
213 /// let mut map = SymbolMap::<i32>::new();
214 /// assert_eq!(map.len(), 0);
215 /// ```
216 pub fn new() -> Self {
217 SymbolMap {
218 entries: Vec::with_capacity(2),
219 }
220 }
221
222 /// Retrieves the value for the given key.
223 ///
224 /// # Examples
225 ///
226 /// ```
227 /// # use jupiter::ig::symbols::SymbolMap;
228 /// let mut map = SymbolMap::<i32>::new();
229 ///
230 /// map.put(42, 1);
231 ///
232 /// assert_eq!(map.get(42).unwrap(), &1);
233 /// assert_eq!(map.get(99), None);
234 /// ```
235 pub fn get(&self, key: Symbol) -> Option<&V> {
236 if let Ok(pos) = self.entries.binary_search_by_key(&key, |entry| entry.0) {
237 Some(&self.entries[pos].1)
238 } else {
239 None
240 }
241 }
242
243 /// Retrieves the value for the given key or creates a new one in none exists.
244 ///
245 /// If a new value is created, [Default::default](Default::default) is invoked and a mutable
246 /// reference is returned.
247 ///
248 /// # Examples
249 ///
250 /// ```
251 /// # use jupiter::ig::symbols::SymbolMap;
252 /// let mut map = SymbolMap::<i32>::new();
253 ///
254 /// map.put(42, 1);
255 ///
256 /// assert_eq!(map.get_or_insert(42, || 1), &1);
257 /// assert_eq!(map.get_or_insert(23, || 0), &0);
258 /// ```
259 pub fn get_or_insert(&mut self, key: Symbol, mut factory: impl FnMut() -> V) -> &mut V {
260 match self.entries.binary_search_by_key(&key, |entry| entry.0) {
261 Ok(pos) => &mut self.entries[pos].1,
262 Err(pos) => {
263 self.entries.insert(pos, (key, factory()));
264 &mut self.entries[pos].1
265 }
266 }
267 }
268
269 /// Retrieves the value for the given key as mutable reference.
270 ///
271 /// # Examples
272 ///
273 /// ```
274 /// # use jupiter::ig::symbols::SymbolMap;
275 /// let mut map = SymbolMap::<String>::new();
276 ///
277 /// map.put(42, "Hello".to_owned());
278 /// map.get_mut(42).unwrap().push_str(" World");
279 ///
280 /// assert_eq!(map.get(42).unwrap().as_str(), "Hello World");
281 /// ```
282 pub fn get_mut(&mut self, key: Symbol) -> Option<&mut V> {
283 if let Ok(pos) = self.entries.binary_search_by_key(&key, |entry| entry.0) {
284 Some(&mut self.entries[pos].1)
285 } else {
286 None
287 }
288 }
289
290 /// Puts a value in the for the given key.
291 ///
292 /// # Examples
293 ///
294 /// ```
295 /// # use jupiter::ig::symbols::SymbolMap;
296 /// let mut map = SymbolMap::<String>::new();
297 ///
298 /// map.put(42, "Hello".to_owned());
299 ///
300 /// assert_eq!(map.get(42).unwrap().as_str(), "Hello");
301 /// ```
302 pub fn put(&mut self, key: Symbol, value: V) {
303 match self.entries.binary_search_by_key(&key, |entry| entry.0) {
304 Ok(pos) => self.entries[pos].1 = value,
305 Err(pos) => self.entries.insert(pos, (key, value)),
306 }
307 }
308
309 /// Counts the number of entries in the map.
310 ///
311 /// # Examples
312 ///
313 /// ```
314 /// # use jupiter::ig::symbols::SymbolMap;
315 /// let mut map = SymbolMap::<String>::new();
316 ///
317 /// // If an entry is put in the map, its size is incremented...
318 /// map.put(42, "Hello".to_owned());
319 /// assert_eq!(map.len(), 1);
320 ///
321 /// // If the entry is overwritten, the size remains the same...
322 /// map.put(42, "Hello".to_owned());
323 /// assert_eq!(map.len(), 1);
324 ///
325 /// // If another entry is added, its size grows once again...
326 /// map.put(99, "World".to_owned());
327 /// assert_eq!(map.len(), 2);
328 /// ```
329 pub fn len(&self) -> usize {
330 self.entries.len()
331 }
332
333 /// Determines if this map is empty.
334 ///
335 /// # Examples
336 ///
337 /// ```
338 /// # use jupiter::ig::symbols::SymbolMap;
339 /// let mut map = SymbolMap::<String>::new();
340 ///
341 /// assert_eq!(map.is_empty(), true);
342 /// map.put(42, "Hello".to_owned());
343 /// assert_eq!(map.is_empty(), false);
344 /// ```
345 pub fn is_empty(&self) -> bool {
346 self.entries.is_empty()
347 }
348
349 /// Reports the capacity currently reserved.
350 ///
351 /// # Examples
352 ///
353 /// ```
354 /// # use jupiter::ig::symbols::SymbolMap;
355 /// let mut map = SymbolMap::<String>::new();
356 ///
357 /// let initial_capacity = map.capacity();
358 ///
359 /// for i in 1..=initial_capacity+1 {
360 /// map.put(i as i32, "Hello".to_owned());
361 /// }
362 ///
363 /// assert_eq!(map.capacity() > initial_capacity, true);
364 /// ```
365 pub fn capacity(&self) -> usize {
366 self.entries.capacity()
367 }
368
369 /// Provides an iterator over all entries in the map.
370 ///
371 /// # Examples
372 ///
373 /// ```
374 /// # use jupiter::ig::symbols::SymbolMap;
375 /// # use itertools::Itertools;
376 /// let mut map = SymbolMap::<String>::new();
377 ///
378 /// map.put(42, "Hello".to_owned());
379 /// map.put(99, "World".to_owned());
380 ///
381 /// let string = map.entries().map(|(key, value)| format!("{}: {}", key, value)).join(", ");
382 /// assert_eq!(string, "42: Hello, 99: World");
383 /// ```
384 pub fn entries(&self) -> Iter<(Symbol, V)> {
385 self.entries.iter()
386 }
387}
388
389#[cfg(test)]
390mod tests {
391 use crate::ig::symbols::{SymbolMap, SymbolTable};
392
393 #[test]
394 pub fn resolve_and_lookup_works() {
395 let mut table = SymbolTable::new();
396
397 let symbol = table.find_or_create("Hello").unwrap();
398 assert_eq!(table.find_or_create("Hello").unwrap(), symbol);
399 assert_eq!(table.resolve("Hello").unwrap(), symbol);
400 assert_eq!(table.lookup(symbol), "Hello");
401 assert_eq!(table.lookup(-1), "");
402 assert_eq!(table.resolve("World").is_none(), true);
403 assert_eq!(table.len(), 1);
404 }
405
406 #[test]
407 pub fn get_and_put_work_for_symbol_map() {
408 let mut map = SymbolMap::new();
409 for symbol in (1..128).rev() {
410 map.put(symbol, symbol);
411 }
412 for symbol in 1..128 {
413 assert_eq!(*map.get(symbol).unwrap(), symbol);
414 assert_eq!(*map.get_mut(symbol).unwrap(), symbol);
415 }
416
417 assert_eq!(map.get(130).is_none(), true);
418 assert_eq!(map.get_mut(130).is_none(), true);
419
420 map.put(1, 42);
421 assert_eq!(*map.get(1).unwrap(), 42);
422 assert_eq!(*map.get_mut(1).unwrap(), 42);
423 for symbol in 2..128 {
424 assert_eq!(*map.get(symbol).unwrap(), symbol);
425 assert_eq!(*map.get_mut(symbol).unwrap(), symbol);
426 }
427
428 assert_eq!(map.len(), 127);
429 assert_eq!(
430 map.entries().map(|(symbol, _)| symbol).sum::<i32>(),
431 127 * 128 / 2
432 );
433 }
434
435 #[test]
436 pub fn get_or_insert_works() {
437 let mut map = SymbolMap::new();
438
439 assert_eq!(map.get_or_insert(42, || 0), &0);
440
441 map.put(42, 1);
442 assert_eq!(map.get_or_insert(42, || 1), &1);
443
444 *map.get_or_insert(23, || 0) = 5;
445 assert_eq!(map.get_or_insert(23, || 0), &5);
446 }
447}