jupiter/ig/
symbols.rs

1//! Provides a `SymbolTable` and a `SymbolMap` for an efficient and compact representation
2//! of objects / inner maps.
3//!
4//! As most datasets consist of many objects which all share the same keys, we keep the actual
5//! strings in a symbol table and only store the symbols in the actual objects.
6//!
7//! This heavily reduces the memory being consumed and also permits a very cache friendly and there-
8//! fore super fast lookup for values in inner objects.
9use std::collections::HashMap;
10use std::slice::Iter;
11
12/// Defines the representation of a symbol.
13///
14/// Note that a single `Doc` (and its `SymbolTable') may only store up to 2^31 - 1 distinct symbols.
15pub type Symbol = i32;
16
17/// Used to resolve and lookup symbols.
18#[derive(Clone, Default)]
19pub struct SymbolTable {
20    table: HashMap<String, Symbol>,
21    symbols: Vec<String>,
22}
23
24impl SymbolTable {
25    /// Creates a new and empty symbol table.
26    ///
27    /// # Example
28    ///
29    /// ```
30    /// # use jupiter::ig::symbols::SymbolTable;
31    /// let table = SymbolTable::new();
32    ///
33    /// assert_eq!(table.len(), 0);
34    /// ```
35    pub fn new() -> Self {
36        SymbolTable {
37            table: HashMap::new(),
38            symbols: Vec::new(),
39        }
40    }
41
42    /// Tries to resolve the given `string` into an existing `Symbol`.
43    ///
44    /// If no symbol with the given name is known, `None` is returned. If a new
45    /// symbol should be created instead, use `find_or_create`.
46    ///
47    /// # Example
48    /// ```
49    /// # use jupiter::ig::symbols::SymbolTable;
50    /// let mut table = SymbolTable::new();
51    ///
52    /// let symbol = table.find_or_create("Test").unwrap();
53    ///
54    /// assert_eq!(table.resolve("Test").unwrap(), symbol);
55    /// assert_eq!(table.resolve("Unknown").is_none(), true);
56    /// ```
57    pub fn resolve(&self, string: impl AsRef<str>) -> Option<Symbol> {
58        self.table.get(string.as_ref()).copied()
59    }
60
61    /// Resolve the given `string` into a new or an existing `Symbol`.
62    ///
63    /// If no symbol should be created if the given name is unknown, use `resolve`.
64    ///
65    /// # Errors
66    ///
67    /// This will return an error if the internal symbol table overflows (if there are more than
68    /// std::i32::MAX - 2 symbols).
69    ///
70    /// # Example
71    /// ```
72    /// # use jupiter::ig::symbols::SymbolTable;
73    /// let mut table = SymbolTable::new();
74    ///
75    /// let symbol = table.find_or_create("Test").unwrap();
76    /// assert_eq!(table.resolve("Test").unwrap(), symbol);
77    /// ```
78    pub fn find_or_create(&mut self, string: impl AsRef<str>) -> anyhow::Result<Symbol> {
79        let value = string.as_ref();
80        if let Some(symbol) = self.table.get(value) {
81            Ok(*symbol)
82        } else if self.symbols.len() >= i32::MAX as usize {
83            Err(anyhow::anyhow!("Symbol table overflow!"))
84        } else {
85            let new_symbol = (self.symbols.len() + 1) as i32;
86
87            let _ = self.table.insert(value.to_owned(), new_symbol);
88            self.symbols.push(value.to_owned());
89
90            Ok(new_symbol)
91        }
92    }
93
94    /// Retrieves the name of the given `Symbol`.
95    ///
96    /// # Examples
97    /// ```
98    /// # use jupiter::ig::symbols::SymbolTable;
99    /// let mut table = SymbolTable::new();
100    ///
101    /// let symbol = table.find_or_create("Test").unwrap();
102    ///
103    /// // A known symbol can be looked up...
104    /// assert_eq!(table.lookup(symbol), "Test");
105    ///
106    /// // An unknown symbol is simply translated to ""
107    /// assert_eq!(table.lookup(1024), "");
108    /// ```
109    pub fn lookup(&self, symbol: Symbol) -> &str {
110        self.symbols
111            .get((symbol - 1) as usize)
112            .map(|string| string.as_str())
113            .unwrap_or("")
114    }
115
116    /// Determines the number of known symbols in the table.
117    ///
118    /// # Examples
119    /// ```
120    /// # use jupiter::ig::symbols::SymbolTable;
121    /// let mut table = SymbolTable::new();
122    ///
123    /// // The same symbol is only added once to a table...
124    /// let symbol = table.find_or_create("Test").unwrap();
125    /// let symbol1 = table.find_or_create("Test").unwrap();
126    /// assert_eq!(symbol, symbol1);
127    ///
128    /// // ..therefore the table size is 1.
129    /// assert_eq!(table.len(), 1);
130    ///
131    /// // If we add another symbol...
132    /// table.find_or_create("Test 2").unwrap();
133    ///
134    /// // ...the size grows to 2.
135    /// assert_eq!(table.len(), 2);
136    /// ```
137    pub fn len(&self) -> usize {
138        self.symbols.len()
139    }
140
141    /// Determines if the symbol table is empty.
142    ///
143    /// # Examples
144    /// ```
145    /// # use jupiter::ig::symbols::SymbolTable;
146    /// let mut table = SymbolTable::new();
147    ///
148    /// assert_eq!(table.is_empty(), true);
149    /// let _ = table.find_or_create("Test").unwrap();
150    /// assert_eq!(table.is_empty(), false);
151    /// ```
152    pub fn is_empty(&self) -> bool {
153        self.symbols.is_empty()
154    }
155
156    /// Estimates the allocated memory required to represent the symbol table.
157    ///
158    /// Note that this is only an approximation as some inner types to not reveal their
159    /// size.
160    ///
161    /// # Example
162    ///
163    /// ```
164    /// # use jupiter::ig::symbols::SymbolTable;
165    /// let mut table = SymbolTable::new();
166    ///
167    /// table.find_or_create("Hello").unwrap();
168    /// table.find_or_create("World").unwrap();
169    ///
170    /// println!("{}", table.allocated_size());
171    /// ```
172    pub fn allocated_size(&self) -> usize {
173        // This is the semi-official way of estimating the size of the underlying hash table..
174        //
175        // Internally a bit more than the actual capacity is allocated to guarantee a proper load
176        // factor...
177        let table_size = self.table.capacity() * 11 / 10
178            // Per entry, the key, its value and a hash is stored...
179            * (std::mem::size_of::<usize>() + std::mem::size_of::<String>() + std::mem::size_of::<Symbol>());
180
181        // The lookup-table is known to be simply a Vec of strings...
182        let lookup_size = self.symbols.capacity() * std::mem::size_of::<String>();
183
184        // And of course we have to add up the bytes in each string...
185        let content_size: usize = self.symbols.iter().map(|string| string.len()).sum();
186
187        table_size + lookup_size + content_size
188    }
189}
190
191/// Provides a efficient lookup map using `Symbol` as key.
192///
193/// This map is optimized  as we know for sure, that the key is an `i32`. Also, we will use this
194/// map for objects within `Node` and therefore know that most commonly there will be only a few
195/// keys (most probably < 8 and almost always < 32).
196///
197/// Therefore we simply use a `Vec` of tuples to store the keys and their value. We also keep
198/// this vector sorted by the key so that a lookup can be done via `binary_search`.
199///
200/// Using this approach easily beats `HashMap` in both, performance and memory consumption.
201#[derive(Debug, Eq, PartialEq, Default)]
202pub struct SymbolMap<V> {
203    entries: Vec<(Symbol, V)>,
204}
205
206impl<V: Default> SymbolMap<V> {
207    /// Creates a new and empty map.
208    ///
209    /// # Examples
210    ///
211    /// ```
212    /// # use jupiter::ig::symbols::SymbolMap;
213    /// let mut map = SymbolMap::<i32>::new();
214    /// assert_eq!(map.len(), 0);
215    /// ```
216    pub fn new() -> Self {
217        SymbolMap {
218            entries: Vec::with_capacity(2),
219        }
220    }
221
222    /// Retrieves the value for the given key.
223    ///
224    /// # Examples
225    ///
226    /// ```
227    /// # use jupiter::ig::symbols::SymbolMap;
228    /// let mut map = SymbolMap::<i32>::new();
229    ///
230    /// map.put(42, 1);
231    ///
232    /// assert_eq!(map.get(42).unwrap(), &1);
233    /// assert_eq!(map.get(99), None);
234    /// ```
235    pub fn get(&self, key: Symbol) -> Option<&V> {
236        if let Ok(pos) = self.entries.binary_search_by_key(&key, |entry| entry.0) {
237            Some(&self.entries[pos].1)
238        } else {
239            None
240        }
241    }
242
243    /// Retrieves the value for the given key or creates a new one in none exists.
244    ///
245    /// If a new value is created, [Default::default](Default::default) is invoked and a mutable
246    /// reference is returned.
247    ///
248    /// # Examples
249    ///
250    /// ```
251    /// # use jupiter::ig::symbols::SymbolMap;
252    /// let mut map = SymbolMap::<i32>::new();
253    ///
254    /// map.put(42, 1);
255    ///
256    /// assert_eq!(map.get_or_insert(42, || 1), &1);
257    /// assert_eq!(map.get_or_insert(23, || 0), &0);
258    /// ```
259    pub fn get_or_insert(&mut self, key: Symbol, mut factory: impl FnMut() -> V) -> &mut V {
260        match self.entries.binary_search_by_key(&key, |entry| entry.0) {
261            Ok(pos) => &mut self.entries[pos].1,
262            Err(pos) => {
263                self.entries.insert(pos, (key, factory()));
264                &mut self.entries[pos].1
265            }
266        }
267    }
268
269    /// Retrieves the value for the given key as mutable reference.
270    ///
271    /// # Examples
272    ///
273    /// ```
274    /// # use jupiter::ig::symbols::SymbolMap;
275    /// let mut map = SymbolMap::<String>::new();
276    ///
277    /// map.put(42, "Hello".to_owned());
278    /// map.get_mut(42).unwrap().push_str(" World");
279    ///
280    /// assert_eq!(map.get(42).unwrap().as_str(), "Hello World");
281    /// ```
282    pub fn get_mut(&mut self, key: Symbol) -> Option<&mut V> {
283        if let Ok(pos) = self.entries.binary_search_by_key(&key, |entry| entry.0) {
284            Some(&mut self.entries[pos].1)
285        } else {
286            None
287        }
288    }
289
290    /// Puts a value in the for the given key.
291    ///
292    /// # Examples
293    ///
294    /// ```
295    /// # use jupiter::ig::symbols::SymbolMap;
296    /// let mut map = SymbolMap::<String>::new();
297    ///
298    /// map.put(42, "Hello".to_owned());
299    ///
300    /// assert_eq!(map.get(42).unwrap().as_str(), "Hello");
301    /// ```
302    pub fn put(&mut self, key: Symbol, value: V) {
303        match self.entries.binary_search_by_key(&key, |entry| entry.0) {
304            Ok(pos) => self.entries[pos].1 = value,
305            Err(pos) => self.entries.insert(pos, (key, value)),
306        }
307    }
308
309    /// Counts the number of entries in the map.
310    ///
311    /// # Examples
312    ///
313    /// ```
314    /// # use jupiter::ig::symbols::SymbolMap;
315    /// let mut map = SymbolMap::<String>::new();
316    ///
317    /// // If an entry is put in the map, its size is incremented...
318    /// map.put(42, "Hello".to_owned());
319    /// assert_eq!(map.len(), 1);
320    ///    
321    /// // If the entry is overwritten, the size remains the same...
322    /// map.put(42, "Hello".to_owned());
323    /// assert_eq!(map.len(), 1);
324    ///
325    /// // If another entry is added, its size grows once again...
326    /// map.put(99, "World".to_owned());
327    /// assert_eq!(map.len(), 2);
328    /// ```
329    pub fn len(&self) -> usize {
330        self.entries.len()
331    }
332
333    /// Determines if this map is empty.
334    ///
335    /// # Examples
336    ///
337    /// ```
338    /// # use jupiter::ig::symbols::SymbolMap;
339    /// let mut map = SymbolMap::<String>::new();
340    ///
341    /// assert_eq!(map.is_empty(), true);
342    /// map.put(42, "Hello".to_owned());
343    /// assert_eq!(map.is_empty(), false);
344    /// ```
345    pub fn is_empty(&self) -> bool {
346        self.entries.is_empty()
347    }
348
349    /// Reports the capacity currently reserved.
350    ///
351    /// # Examples
352    ///
353    /// ```
354    /// # use jupiter::ig::symbols::SymbolMap;
355    /// let mut map = SymbolMap::<String>::new();
356    ///
357    /// let initial_capacity = map.capacity();
358    ///
359    /// for i in 1..=initial_capacity+1 {
360    ///     map.put(i as i32, "Hello".to_owned());
361    /// }
362    ///
363    /// assert_eq!(map.capacity() > initial_capacity, true);
364    /// ```
365    pub fn capacity(&self) -> usize {
366        self.entries.capacity()
367    }
368
369    /// Provides an iterator over all entries in the map.
370    ///
371    /// # Examples
372    ///
373    /// ```
374    /// # use jupiter::ig::symbols::SymbolMap;
375    /// # use itertools::Itertools;
376    /// let mut map = SymbolMap::<String>::new();
377    ///
378    /// map.put(42, "Hello".to_owned());
379    /// map.put(99, "World".to_owned());
380    ///
381    /// let string = map.entries().map(|(key, value)| format!("{}: {}", key, value)).join(", ");
382    /// assert_eq!(string, "42: Hello, 99: World");
383    /// ```
384    pub fn entries(&self) -> Iter<(Symbol, V)> {
385        self.entries.iter()
386    }
387}
388
389#[cfg(test)]
390mod tests {
391    use crate::ig::symbols::{SymbolMap, SymbolTable};
392
393    #[test]
394    pub fn resolve_and_lookup_works() {
395        let mut table = SymbolTable::new();
396
397        let symbol = table.find_or_create("Hello").unwrap();
398        assert_eq!(table.find_or_create("Hello").unwrap(), symbol);
399        assert_eq!(table.resolve("Hello").unwrap(), symbol);
400        assert_eq!(table.lookup(symbol), "Hello");
401        assert_eq!(table.lookup(-1), "");
402        assert_eq!(table.resolve("World").is_none(), true);
403        assert_eq!(table.len(), 1);
404    }
405
406    #[test]
407    pub fn get_and_put_work_for_symbol_map() {
408        let mut map = SymbolMap::new();
409        for symbol in (1..128).rev() {
410            map.put(symbol, symbol);
411        }
412        for symbol in 1..128 {
413            assert_eq!(*map.get(symbol).unwrap(), symbol);
414            assert_eq!(*map.get_mut(symbol).unwrap(), symbol);
415        }
416
417        assert_eq!(map.get(130).is_none(), true);
418        assert_eq!(map.get_mut(130).is_none(), true);
419
420        map.put(1, 42);
421        assert_eq!(*map.get(1).unwrap(), 42);
422        assert_eq!(*map.get_mut(1).unwrap(), 42);
423        for symbol in 2..128 {
424            assert_eq!(*map.get(symbol).unwrap(), symbol);
425            assert_eq!(*map.get_mut(symbol).unwrap(), symbol);
426        }
427
428        assert_eq!(map.len(), 127);
429        assert_eq!(
430            map.entries().map(|(symbol, _)| symbol).sum::<i32>(),
431            127 * 128 / 2
432        );
433    }
434
435    #[test]
436    pub fn get_or_insert_works() {
437        let mut map = SymbolMap::new();
438
439        assert_eq!(map.get_or_insert(42, || 0), &0);
440
441        map.put(42, 1);
442        assert_eq!(map.get_or_insert(42, || 1), &1);
443
444        *map.get_or_insert(23, || 0) = 5;
445        assert_eq!(map.get_or_insert(23, || 0), &5);
446    }
447}